diff --git a/.claude/skills/review-pr/SKILL.md b/.claude/skills/review-pr/SKILL.md new file mode 100644 index 00000000..f26f22a4 --- /dev/null +++ b/.claude/skills/review-pr/SKILL.md @@ -0,0 +1,340 @@ +--- +name: review-pr +description: Review a GitHub pull request against the go-questdb-client coding standards +argument-hint: [PR number or URL] [--level=0..3] +allowed-tools: Bash(gh *), Read, Grep, Glob, Agent +--- + +Review the pull request `$ARGUMENTS`. + +## Review mindset + +You are a senior QuestDB engineer performing a blocking code review. `go-questdb-client` is mission-critical software — bugs can cause data loss, silent data corruption, dropped rows, or crashes (a panic in a background goroutine takes down the *host* application, not just the client) in customer Go services across HTTP/TCP ILP and the QWP columnar protocol. There is zero tolerance for correctness issues, goroutine/connection leaks, data races, or wire-format errors. Be critical, thorough, and opinionated. Your job is to catch problems before they ship, not to be nice. + +- **Assume nothing is correct until you've verified it.** Read surrounding code to understand context — don't just look at the diff in isolation. +- **The diff is a hint, not the boundary of the review.** The highest-value bugs almost always live at callsites outside the diff that depend on a contract the diff quietly changed. Treat the diff as the entry point, not the scope. +- **Flag every issue you find**, no matter how small. Do not soften language or hedge. Say "this is wrong" not "this might be an issue". +- **Do not praise the code.** Skip "looks good", "nice work", "clever approach". Focus entirely on problems and risks. +- **Think adversarially.** For each change, work through: + - Inputs: which values break this? Empty buffers, zero-length strings, boundary integers, max-length symbols, names containing the disallowed-character set, NaN/Inf floats, nil slices/maps, zero-value timestamps. 
+ - Encoding: how does the code behave with invalid UTF-8, embedded NUL bytes, oversized lengths, or a string that needs escaping in ILP vs QWP framing? + - Concurrency: what happens under concurrent calls to the same sender, an auto-flush firing during a fluent call, the QWP send-loop goroutine racing the producer, a context cancelled mid-flush, `Close()` racing an in-flight flush? + - Failure modes: connection dropping mid-flush, partial write, TLS handshake failure, auth rejection, server-side QWP rejection (`*SenderError`), reconnect + replay from `engineAckedFsn()+1`, HALT latching, disk-backed segment-file (`sf_dir`) I/O errors. + - Callers: what happens when a caller ignores the returned `error`, reuses a sender after a latched/HALT error instead of rebuilding it, type-asserts `LineSender` to `QwpSender` when the transport is HTTP, or shares one sender across goroutines without synchronization? +- **Check what's missing**, not just what's there. Missing tests, missing error handling, missing edge cases, missing doc comments on exported API changes, a new ILP column type that didn't update all six concrete structs or the `export_test.go` switch helpers, a new config key not added to `conf_parse.go`. +- **Verify every claim.** If the PR title says "fix", verify the bug existed and the fix is correct. If it says "improve performance", look for a benchmark or reason about the algorithmic change — and check `BenchmarkQwpSenderSteadyState` still holds 0 allocs/op. If it says "simplify", verify the new code is actually simpler and drops no behavior. Treat the PR description as an unverified hypothesis. +- **Read the full context of changed files** when the diff alone is ambiguous. Use Read/Grep/Glob to inspect surrounding code, callers, and related tests. +- **Assess reachability before reporting.** For every potential bug, trace the actual callers and inputs. 
If a problem requires physically impossible conditions (a buffer larger than `math.MaxInt`, a NUL injected through an API that already rejects it via name validation, a panic behind a validation guard that all callers pass through), it is not a real finding — drop it. Focus on bugs real workloads trigger, not theoretical edge cases. +- **Panics that guard library-internal invariants are valid.** A `panic` on a "this should never happen given our own invariants" condition is the preferred mechanism for library-internal bugs. Do NOT flag it as insufficient. Only flag a `panic` (or unchecked slice index, nil-map write, or `, ok`-less type assertion) if a caller honoring the documented contract — including the disallowed-character rules documented on each `LineSender` method — can plausibly trigger it. **The fluent-API error-latching convention is intentional, not a missing-return bug:** `Table` / `Symbol` / `*Column` deliberately keep returning the sender and surface the latched error on the next `At` / `AtNow` / `Flush`. Do not flag a method for "swallowing" an error if it latches per this convention; *do* flag it if it latches in a way that loses the error or fails to surface it on the next terminal call. + +## Review level + +Parse `$ARGUMENTS` for a level token: `--level=N`, `-lN`, or a bare single digit `0`-`3`. **If no level is given, default to 0.** Strip the level token before feeding the remainder (PR number or URL) to `gh` commands. + +The level controls how much of the review below actually runs. Lower levels keep the same review *spirit* — adversarial, blocking, no praise — but cut the breadth of the analysis. 
Higher levels have significantly higher token cost; reserve level 3 for high-stakes PRs: QWP wire format / cursor engine / send loop (`qwp_wire.go`, `qwp_encoder.go`, `qwp_sf_*.go`), ILP wire format (`buffer.go`, any V1/V2/V3 change), the `LineSender` interface or the six `{http,tcp}LineSender{,V2,V3}` structs or the `export_test.go` switch helpers, authentication/TLS, sender/buffer state-machine changes, the conf parser (`conf_parse.go`), or any change to goroutine lifecycle / channel protocols / mutex ordering. + +| Level | What runs | +|-------|-----------| +| **0 (default)** | Steps 1, 2, 4. Skip Step 2.5. Skip Step 3 — no agent spawn; review the diff inline in the main loop, using Read/Grep on demand to resolve ambiguities. Skip Step 3b — verify each finding inline as you write it. Single-pass review covering correctness, panic/crash surface, concurrency, tests, and coding standards on the diff itself. | +| **1** | Adds Step 2.5a (semantic delta only — skip 2.5b/2.5c/2.5d). In Step 3, launch only Agent 1 (correctness), Agent 2 (panic/crash surface), and Agent 7 (tests) in parallel. Skip all other agents. Skip Step 3b — verify findings inline as you draft the report. | +| **2** | Full Step 2.5, but in 2.5b restrict the callsite inventory to exported symbols plus everything re-exported through `export_test.go`. In Step 3, launch Agents 1-8. Skip Agent 9 (cross-context) and Agent 10 (adversarial fresh-context). Step 3b uses a single batched verification agent for all findings instead of one per finding. | +| **3** | Every step below as written, all 10 agents, per-finding verification. The full mission-critical pass. | + +State the chosen level in one line at the start of the review so the user knows what they're getting (e.g., "Reviewing PR #141 at level 2"). If the level was defaulted, mention that level 3 exists for the full review. 
+ +## Step 1: Gather PR context + +Capture the PR identifier in `$PR` (the part of `$ARGUMENTS` left after stripping the level token), then fetch metadata, diff, and review comments in a single bash call so `$PR` is in scope for all three `gh` invocations: + +```bash +PR='<PR number or URL>' +gh pr view "$PR" --json number,title,body,labels,state +gh pr diff "$PR" +gh pr view "$PR" --comments +``` + +## Step 2: PR title and description + +Check: +- Title is clear and describes the change +- Description speaks to end-user impact, not implementation internals +- If fixing an issue, `Fixes #NNN` or a link to the issue is present +- Tone is level-headed and analytical +- For public API changes (the `LineSender` / `QwpSender` interfaces, exported `With*` options, a new or renamed config key, a new ILP column type, an `*_integration_test.go` behavior change visible to users), the description calls out the API/behavior change explicitly + +## Step 2.5: Map the change surface + +Before launching review agents, produce a structured change surface map. This step is mandatory and must use Grep/Glob — do not reason about callsites from memory. The output of this step is required input for every agent in Step 3. + +### 2.5a Semantic delta per changed symbol + +For every modified or added function, method, interface method, struct field, or exported constant/var, write: + +- **Symbol:** fully-qualified name (e.g., `(*qwpLineSender).Flush`, `httpLineSenderV2.column`, `LineSenderFromConf`) +- **Before:** signature, return type, error behavior (returned `error` vs latched `*SenderError` vs HALT), panic behavior, receiver mutation (which fields mutated; pointer vs value receiver), ordering/idempotency/replay guarantees, allocation behavior (hot path vs setup path), goroutine/channel interaction, context handling, lock acquisition +- **After:** same fields +- **Delta:** one line stating what semantically changed + +"Refactored", "cleaned up", "improved", "simplified" are not acceptable deltas. 
State the actual behavioral difference. If nothing semantically changed, write "no behavioral change" — but only after checking, not as a default. + +### 2.5b Callsite inventory + +For every changed symbol that is exported, re-exported via `export_test.go`, an interface method on `LineSender`/`QwpSender`, a config key, or part of the ILP/QWP wire encoders, run Grep across the entire repository to find every callsite, implementation, or reference outside the diff. + +Produce a list grouped by file. The repository is a flat `package questdb` at the root (`*.go`), plus `examples/`, `bench/`, and `test/`. Search at minimum: + +- **Production + test callers (root package):** `grep -rn 'SymbolName' *.go` +- **Interface implementations:** every changed `LineSender`/`QwpSender` method must be checked against *all* implementations — the six ILP structs `httpLineSender{,V2,V3}` (`http_sender.go`), `tcpLineSender{,V2,V3}` (`tcp_sender.go`), and the QWP `qwpLineSender` (`qwp_sender.go`) +- **The six-struct + switch-helper invariant:** for a new/changed ILP column type or buffer behavior, `grep -n` the `Messages` / `MsgCount` / `BufLen` / `ProtocolVersion` switches in `export_test.go` and confirm they stay exhaustive over all six structs +- **Config keys:** `grep -n 'keyname' conf_parse.go` — `conf_parse.go` is the single source of truth for supported keys +- **Black-box test surface:** `grep -rn 'SymbolName' export_test.go` (re-exports into `package questdb` for `questdb_test`) +- **Examples and benchmarks (questdb.io renders these):** `grep -rn 'SymbolName' examples/ bench/` +- **Interop conformance:** `grep -rn 'SymbolName' interop_test.go test/interop/` + +A changed exported / interface / config-key symbol with zero recorded Grep calls in the trace is a skill violation. The model is not allowed to assert "this is only used here" without showing the search. 
+ +### 2.5c Implicit contract list + +For each changed symbol, walk this checklist and write one line per item, stating before vs after: + +- Panics on which inputs, and whether the panic site runs on a background goroutine (QWP send loop `qwpSfSendLoop`, background drainers, auto-flush) — a panic there crashes the host process with no caller `recover` +- Which `error` values / `*SenderError` categories are returned, and which call-chains propagate vs swallow them; whether the error latches per the fluent-API convention vs surfaces immediately +- Flush ordering, idempotency, replay safety — for QWP, whether cursor frames remain self-sufficient (full schema + full symbol dictionary from id 0 every flush) so reconnect/replay from `engineAckedFsn()+1` and orphan adoption stay safe +- Re-entrancy: calling `Flush`/`Close` from inside a `WithErrorHandler` callback; auto-flush firing mid-fluent-call +- Lock acquisition order and which mutexes are held on return; which channels are read/written and who owns closing them; goroutine spawn/join/leak on every path including error returns +- Context cancellation / deadline propagation (the `ctx` threaded through `NewLineSender`, `Flush`, `engineAppendBlocking`) +- Allocation on the hot path (`Table`→`Symbol`→`*Column`→`At` build path, flush, QWP encode) vs setup path (construction, conf parsing) — the hot path is pinned at 0 allocs/op +- Buffer state on error: does a failed call leave the buffer half-written? Does the sender require close+rebuild after a HALT (matches Java; the sender does not auto-resume)? 
+- Error-policy resolution precedence (highest first): `WithErrorPolicyResolver` → `WithErrorPolicy(category, …)` → connect-string `on_*_error` → `on_server_error` → spec defaults; `PROTOCOL_VIOLATION` and `UNKNOWN` are never user-configurable (always HALT) +- Wire format: any change to the ILP bytes produced (per protocol version V1/V2/V3) or the QWP frame structure/codec accepted by the server +- `LineSenderPool` is HTTP-only by design — does the change wrongly let a TCP/QWP config through, or break `errHttpOnlySender`? + +### 2.5d Cross-context exposure list + +End this step with an explicit list of "places this change is visible from but the diff does not touch". This is the highest-priority input for the bug-hunting agents in Step 3. + +Group the callsites from 2.5b by execution context. Typical contexts in this codebase: + +- **`LineSender` interface surface:** all six ILP structs + `qwpLineSender` — any interface-method change must be correct in all seven +- **`QwpSender` superset:** code that type-asserts `LineSender` to `QwpSender` for QWP-only column types +- **Buffer build hot path:** `Table`, `Symbol`, the `*Column` methods, `At`/`AtNow` and their callers (0-alloc pinned) +- **Flush path:** `Flush`, `FlushAndGetSequence`, `AwaitAckedFsn` +- **Auto-flush path:** the non-blocking `enqueueCursor` path and whatever triggers it +- **QWP cursor engine + send loop:** `qwpSfCursorEngine`, `engineAppendBlocking`, `qwpSfSendLoop`, reconnect/replay, ACK parsing, `engineAckedFsn`/`enginePublishedFsn` (`qwp_sf_*.go`) +- **Background drainer goroutines:** orphan-slot adoption (`qwp_sf_orphan.go`, `qwp_sf_drainer.go`, `qwp_sf_round_walk.go`), visible via `QwpSender.BackgroundDrainers()` +- **Disk-backed segments:** `sf_dir` set → `<sf_dir>/<sender_id>/*.sfa` (the per-sender directory is itself the slot), on-disk-compatible with the Java client's `MmapSegment.java` +- **Configuration parsing:** `LineSenderFromConf`, `conf_parse.go` +- **Authentication / TLS:** TLS config, basic/token 
auth on HTTP/TCP, QWP handshake +- **Error callback:** `WithErrorHandler` async path, plus producer-side `errors.As` after `Flush`/`FlushAndGetSequence` +- **Connection pool:** `sender_pool.go` (`LineSenderPool`), HTTP-only +- **Examples & benchmarks:** `examples/{from-conf,http,qwp,tcp}`, `bench/` — referenced by `examples.manifest.yaml` +- **Interop conformance:** `interop_test.go` + the `test/interop/questdb-client-test` submodule (ILP vectors shared across QuestDB clients) + +Every entry on this list must be reviewed in Step 3. + +## Step 3: Parallel review + +Every agent receives: +1. The PR diff +2. The full change surface map from Step 2.5 (semantic deltas, callsite inventory, implicit contracts, cross-context exposure list) + +### Anti-anchoring directive (applies to all agents) + +- **Bugs at callsites outside the diff outrank bugs inside the diff.** A confirmed bug in a file the PR did not touch but that calls a changed symbol is a P0 finding. +- **"Looks correct in isolation" is not a valid conclusion.** Before clearing a changed symbol, the agent must walk the callsite inventory from 2.5b and explicitly state, per callsite, whether the new behavior is still correct there. +- **The diff is the entry point, not the scope.** If the change surface map shows the symbol is reachable from N other files, the review covers N+1 files. +- A single finding of the form "in `tcp_sender.go` the new behavior of `buffer.column` causes Y in `tcpLineSenderV3`" is worth more than five findings inside the diff. + +### Agents + +Launch the following agents in parallel. + +**Agent 1 — Correctness & bugs:** nil handling at API boundaries, edge cases, logic errors, off-by-one, operator precedence, error paths, integer overflow/truncation (buffer length math, FSN/sequence arithmetic, varint/length-prefix encoding), wrong wire bytes. Verify ILP encoding per protocol version (V1 text-only, V2 binary float64 + n-dim float arrays, V3 decimals) and QWP frame/codec correctness. 
Cross-reference every changed symbol against its callsite inventory and verify the new behavior is correct at each callsite. + +**Agent 2 — Panic & crash surface:** A panic on a background goroutine aborts the host process with no recovery. Flag every reachable instance of: + +- **Panic sources:** nil pointer / nil receiver dereference, slice/array index or slice expression out of bounds, write to a nil map, `, ok`-less type assertion (especially `LineSender`→`QwpSender`), integer divide-by-zero, `make` with a negative or untrusted-huge size, string→int conversions assumed infallible. +- **Channel misuse:** send on a closed channel, close of a closed/nil channel, close from the wrong side, double close — especially around `qwpSfSendLoop`, drainers, and shutdown/`Close()`. +- **Goroutine-crash propagation:** a panic in `qwpSfSendLoop`, a background drainer, an auto-flush goroutine, or any goroutine spawned by the client crashes the *whole application*. This is the Go analog of "a panic crossing the FFI boundary" — there is no caller-side `recover`. Verify such goroutines either cannot panic on contract-honoring input or have a deliberate top-level `recover` that converts the panic into a latched error / error-handler call. +- **Panic-in-`defer` during unwind:** a `panic` inside a deferred function while another panic is in flight is unrecoverable. Flag deferred functions that can panic (index, nil-map write, failed type assertion). +- **`unsafe` / unaligned access:** any use of `unsafe`, `reflect`, or pointer arithmetic — verify alignment, lifetime, and that no Go pointer escapes its backing array. +- **Resource-exhaustion crash:** an allocation, slice grow, or `make` sized by an untrusted length parameter (e.g., a server-supplied frame length) — validate the bound before allocating. +- **Unbounded recursion / stack overflow** on attacker- or server-controlled depth (decoders, nested arrays). + +Every fallible operation must return `error`, not swallow it. 
Every client-spawned goroutine must have a defined crash story. + +**Agent 3 — Public API & interface conformance:** Verify every changed `LineSender`/`QwpSender` method is implemented correctly and consistently across **all seven implementations** (`httpLineSender{,V2,V3}`, `tcpLineSender{,V2,V3}`, `qwpLineSender`). For a new/changed ILP column type or buffer behavior, verify all six concrete structs *and* the `Messages`/`MsgCount`/`BufLen`/`ProtocolVersion` switches in `export_test.go` were updated and remain exhaustive. For a new/changed config key, verify `conf_parse.go` (the single source of truth) accepts it for the right schemas (`http`,`https`,`tcp`,`tcps`,`ws`,`wss`) and that `NewLineSender`'s `With*` option path stays in sync. Verify HTTP auto-negotiates the protocol version while TCP still requires `WithProtocolVersion`/`protocol_version`. Verify exported identifiers carry doc comments and the QuestDB Apache-2.0 license banner heads any new file. + +**Agent 4 — Concurrency & data races:** race conditions on `qwpLineSender` / sender fields, missing synchronization, the producer vs `qwpSfSendLoop` handoff, drainer goroutines vs engine state, `engineAppendBlocking` deadline/backpressure correctness, `sync.Mutex`/`RWMutex` ordering and double-unlock, channel direction/ownership/close discipline, context cancellation racing in-flight flush, `Close()` racing a concurrent `Flush`. Confirm whether `go test -race` would cover the changed paths. For every callsite from 2.5b, check whether the symbol is now reachable from a goroutine/context where the previous synchronization assumptions don't hold. 
+ +**Agent 5 — Resource management & leaks:** goroutine leaks on every path (including early `error` returns and HALT) — every spawned goroutine must have a join/cancel/exit story; connection/socket cleanup on error and reconnect; `Close()` idempotency and that it drains/stops drainers and the send loop; channel close discipline (no leaked blocked senders/receivers); disk-backed segment-file (`*.sfa`) creation/cleanup/locking under `sf_dir` on error paths; context-cancellation propagation freeing resources; buffer/scratch lifecycle. Walk every callsite from 2.5b that constructs or owns a changed type and verify cleanup on all paths (success, `error` early return, panic-unwind, `Close`). + +**Agent 6 — Performance & allocations:** unnecessary allocations on the hot path (`Table`/`Symbol`/`*Column`/`At*` build, flush, QWP encode), excessive copying, inefficient serialization, redundant syscalls, buffer growth strategy. **The `Table`→`Symbol`→`Column`→`At` pipeline is pinned at 0 allocs/op by `BenchmarkQwpSenderSteadyState` / `TestQwpSenderSteadyStateZeroAllocs`** — any new hot-path allocation must move to a reusable scratch buffer on `qwpLineSender` (see the `encodeInfoBuf` pattern). For each new loop on the data path, analyze scaling at realistic volume (millions of rows per flush, hundreds of columns, thousands of symbols); flag any O(n²). Setup-path allocations (construction, conf parsing) are acceptable; data-path allocations are not. + +**Agent 7 — Test review & coverage:** coverage gaps, error-path tests, nil/edge-case tests, boundary conditions, regression tests, test quality. 
Check: +- Unit tests (`*_test.go`, pure ILP tests; QWP unit tests use the `httptest.Server` stand-in `newQwpTestServer` in `qwp_sender_test.go`) +- Integration tests (`*_integration_test.go`) — these need Docker via testcontainers-go; note the live-server vs testcontainer distinction (QWP integration suites can hit a live `localhost:9000`; `TestIntegrationSuite` and the HTTP/TCP suites spin up a real container) +- testify suites dispatch via the top-level `Test*Suite` entry point plus the method name +- Interop conformance: `interop_test.go` + the `test/interop/questdb-client-test` submodule +- `export_test.go` extended (not production code made public) when tests need new internals +- `BenchmarkQwpSenderSteadyState` still asserts 0 allocs/op if the hot path changed +- `examples/` + `bench/` still build and stay consistent with `examples.manifest.yaml` + +Cross-reference 2.5d: every cross-context exposure should have a test exercising the changed symbol from that context. Missing tests for cross-context callsites are high-priority findings. + +**Agent 8 — Code quality & API design:** exported API ergonomics and consistency, backward compatibility of the `LineSender`/`QwpSender` interfaces and config keys (breaking changes must be intentional and called out in the PR body), naming consistent with the codebase, dead code, unused imports, doc comments on every exported identifier, the Apache-2.0 license banner on new files, the fluent-API error-latching convention preserved on any new method, `go vet ./...` and `staticcheck ./...` clean, `examples.manifest.yaml` paths/filenames stable. + +**Agent 9 — Cross-context caller impact:** Walk the callsite inventory from 2.5b. For every callsite, fetch the surrounding code (the calling function plus its callers up two levels) and answer: + +- Does this caller pass inputs the new behavior handles incorrectly? +- Does this caller depend on a contract from the implicit contract list (2.5c) the change broke? 
+- Is this caller in a context (the send-loop or drainer goroutine, auto-flush, holding a mutex, an `error`/HALT path, a hot loop, a `WithErrorHandler` callback, TLS handshake, `Close()`, panic-unwind, the conf parser) where the new behavior misbehaves even with valid inputs? +- For changed interface methods: do all seven `LineSender` implementations still satisfy the new contract? Does the `export_test.go` switch stay exhaustive? +- For changed config keys: does `conf_parse.go` stay the single source of truth, and does the `With*` option path agree? +- For changed buffer/sender/cursor state machines: do all callers respect the new state transitions (buffer cleared after error before reuse; sender rebuilt after HALT; cursor frame still self-sufficient for replay)? + +This agent's output is structured per callsite, not per failure mode. Each callsite gets a verdict: SAFE / BROKEN / NEEDS VERIFICATION. Every BROKEN entry is a P0 finding regardless of whether the file is in the diff. Not optional even when the diff is small — small diffs to widely-used symbols (`buffer.column*`, `Flush`, interface methods, the cursor engine) have the largest blast radius. + +**Agent 10 — Fresh-context adversarial:** Dispatched separately from agents 1-9 to escape checklist anchoring. Different rules: + +- It receives ONLY the PR diff and the names of the changed files. It does NOT receive the change surface map, the implicit contract list, the cross-context exposure list, or any checklist below. +- Its sole instruction: "find ways this code is wrong". No category list, no failure-mode taxonomy, no QuestDB-Go style guide. +- It is free to use Read, Grep, and Glob to explore the repository however it wants. +- Findings are not pre-classified. Each states: what's wrong, why it's wrong, and the code path that demonstrates it. + +A finding here that none of agents 1-9 produced is high signal. A finding that overlaps is corroboration. Run in parallel with agents 1-9. 
Mandatory regardless of diff size. + +Combine all agent findings into a single deduplicated **draft** report. Do NOT present this draft to the user yet — it goes straight into verification. + +## Step 3b: Verify every finding against source code + +The parallel agents work from the diff plus the change surface map and frequently produce false positives — especially around the error-latching convention, goroutine lifecycle, channel ownership, and Go control-flow guarantees. Every finding MUST be verified before it is reported. + +For each finding in the draft report: + +1. **Read the actual source code** at the exact lines cited. Do not rely on the agent's description alone. +2. **Trace the full code path:** follow callers and interface dispatch. A method called on a `LineSender` value may dispatch to any of the seven implementations — check the one(s) actually reachable. +3. **Check the right implementation(s):** if a finding involves an interface method, confirm it against every implementation the callsite can dispatch to, not just one. +4. **For leak claims:** trace every goroutine to its exit, every connection/file to its close, every channel to its close, on ALL paths (success, `error` early return, HALT, panic-unwind, `Close()`). Before claiming a leak between acquisition and cleanup, verify the intervening code can actually fail. +5. **For panic claims:** verify the panic site is actually reachable. Trace control flow backwards — a preceding validation guard (including name-validation rejecting the disallowed-character set), `switch` case, or early return may make it unreachable. +6. **For goroutine-crash claims:** confirm the panic is reachable on a *client-spawned* goroutine with no top-level `recover`, from contract-honoring input. If a documented validation guard upstream rejects the triggering input, drop it; if the goroutine is the validation boundary, it IS reachable — flag it. +7. 
**For numeric overflow claims:** check reachability at realistic scale — buffers up to a few hundred MB, millions of rows per flush, columns in the tens to low hundreds, symbol cardinality in the thousands, FSNs growing monotonically over a long-lived sender. If overflow needs values beyond that scale, drop it. +8. **For `unsafe` / race claims:** verify the invariant is actually violated. For races, confirm the two access paths can run concurrently (different goroutines, no intervening happens-before) and whether `go test -race` exercises it. +9. **For error-latching claims:** confirm whether the method follows the intentional fluent-API latching convention (latch, surface on next `At`/`AtNow`/`Flush`). If it does and the error is not lost, it is a FALSE POSITIVE. Only confirm if the error is dropped or never surfaces. +10. **For performance claims:** check whether the cost is measurable on a realistic workload. Downgrade to a nit if negligible relative to surrounding I/O. Exception: any allocation on the pinned 0-alloc hot path is always worth flagging, even a single one — verify against `BenchmarkQwpSenderSteadyState`. +11. **For cross-context findings (Agent 9):** re-read the callsite in full including callers up two levels, and confirm the broken behavior is reachable from production or user-exercised test paths. High-value but easy to overstate — verify carefully. + +**Classify each finding** as: +- **CONFIRMED in-diff** — the bug is real and inside the diff +- **CONFIRMED at out-of-diff callsite** — the bug is in an unchanged file because the changed symbol is used there in a now-broken way (cite the file and the 2.5c contract violated) +- **FALSE POSITIVE** — the code is actually correct (explain why) +- **CONFIRMED with nuance** — the issue exists but is less severe than stated (explain) + +**Move false positives to a separate "Downgraded" section** at the end. For each, give a one-line explanation of why it was dismissed. 
This lets the PR author verify the reasoning and catch verification mistakes. + +Launch verification agents in parallel where findings are independent. Each should read surrounding source files, not just the diff. + +## Review checklists + +Review the diff for: + +### Correctness & bugs +- nil handling at API boundaries (nil receiver, nil slice/map, nil context, nil channel) +- Edge cases and error paths +- Logic errors, off-by-one, incorrect bounds, wrong operator precedence +- Integer overflow/truncation (buffer size math, length prefixes/varints, FSN/sequence arithmetic) +- Correct ILP wire format per protocol version (V1 text-only, V2 binary float64 + n-dim arrays, V3 decimals) and correct QWP frame/codec bytes +- **Reachability expansion:** for each changed symbol, list the goroutines, error/HALT paths, mutex-held states, and transports it can now appear in but didn't before. Verify it works in each. + +### Panic & crash surface +A panic on a client-spawned goroutine aborts the host process. Check for: +- nil deref, out-of-bounds slice/index, nil-map write, `, ok`-less type assertion, divide-by-zero, `make`/slice-grow sized by an untrusted length +- Channel misuse: send-on-closed, double-close, close-from-wrong-side (send loop, drainers, `Close()`) +- Panics in `qwpSfSendLoop` / drainers / auto-flush / any client goroutine with no top-level `recover` — the Go analog of a panic crossing FFI +- Panic-in-`defer` during unwind +- `unsafe`/`reflect`/pointer-arithmetic soundness and alignment +- Unbounded recursion on server/attacker-controlled depth + +### Concurrency +- Data races on sender/engine state (would `go test -race` catch it?) 
+- Producer vs send-loop handoff; drainer vs engine; `engineAppendBlocking` backpressure/deadline correctness +- Mutex ordering, double-unlock, lock held across a blocking channel op or I/O +- Channel direction/ownership/close discipline; no leaked blocked goroutines +- Context cancellation/deadline racing in-flight flush; `Close()` racing `Flush` +- For every changed symbol, whether it is now reachable from a goroutine/context where prior synchronization assumptions don't hold + +### Public API & interface conformance +- Every changed `LineSender`/`QwpSender` method correct across all seven implementations +- New/changed ILP column type updates all six structs **and** the exhaustive `export_test.go` switches +- New/changed config key added to `conf_parse.go` (single source of truth) for the right schemas, with `With*` option parity +- HTTP still auto-negotiates protocol version; TCP still requires explicit selection +- Backward compatibility of interfaces/config keys; breaking changes intentional and called out +- Exported identifiers documented; Apache-2.0 banner on new files; fluent-API error-latching preserved on new methods +- `LineSenderPool` stays HTTP-only (`errHttpOnlySender` intact) + +### QWP protocol & error semantics +- Cursor frames remain self-sufficient (full schema + symbol dictionary from id 0 every flush) so reconnect/replay from `engineAckedFsn()+1` and orphan adoption stay safe +- `Flush` blocking contract preserved (blocks until `engineAckedFsn` catches `enginePublishedFsn`); auto-flush stays non-blocking via `enqueueCursor`; `FlushAndGetSequence` returns the published FSN upper bound +- Error-policy precedence intact: `WithErrorPolicyResolver` → `WithErrorPolicy` → `on_*_error` → `on_server_error` → defaults; `PROTOCOL_VIOLATION`/`UNKNOWN` always HALT +- HALT latches on the I/O loop and surfaces on the next producer call; no auto-resume (close+rebuild is the only recovery) +- Disk-backed segment files under `sf_dir` stay on-disk-compatible 
with the Java `MmapSegment.java` layout + +### Performance +- No new allocations on the pinned 0-alloc hot path (`Table`/`Symbol`/`*Column`/`At*`) — verify against `BenchmarkQwpSenderSteadyState`; new hot-path scratch must reuse a buffer on `qwpLineSender` +- No regressions on flush/encode paths; minimal copying; sane buffer growth; batched syscalls +- No O(n²) on any data path at realistic scale (millions of rows, hundreds of columns) +- Setup-path allocations (construction, conf parsing) acceptable; data-path allocations not + +### Resource management +- Every client-spawned goroutine has a join/cancel/exit story on all paths +- Connections/sockets/TLS sessions and `*.sfa` segment files cleaned up on error and reconnect +- `Close()` idempotent; stops the send loop and drainers; drains or fails cleanly +- No leaked channels or blocked goroutines; context cancellation frees resources + +### Test review +- **Coverage gaps:** every new/changed code path has a test; flag "missing test for X" explicitly +- **Cross-context coverage:** every 2.5d entry exercised by a test from that context (missing = high priority) +- **Error-path coverage:** connection drop, partial write, TLS/auth failure, server `*SenderError`, reconnect/replay, HALT, context cancellation — not just the happy path +- **Edge-case tests:** nil inputs, empty buffers, zero-length strings, max-length/disallowed-character names, NaN/Inf, boundary integers +- **Integration tests:** protocol-level changes covered (Docker/testcontainers; mind live-server vs container); interop vectors in `interop_test.go` + submodule still pass +- **Test quality:** assertions check the right thing; no trivially-passing tests; `export_test.go` extended rather than production code made public; benchmark 0-alloc assertion preserved +- **Regression tests:** a bug fix ships a test that fails without the fix + +### Unresolved TODOs and FIXMEs +- Scan the diff for `TODO`, `FIXME`, `HACK`, `XXX`, `WORKAROUND`. 
For each: + - Pre-existing (moved/reformatted) or newly introduced in this PR? + - If new: unfinished work that should block merge, or an acceptable known limitation? Flag deferred bugs / incomplete implementations. + - If it references a ticket/issue, verify the reference exists. + +### Commit messages +- Plain English titles, under 50 chars +- Active voice, naming the acting subject + +## Step 4: Output + +Present ONLY verified findings (false positives are excluded from Critical/Moderate/Minor). Structure as: + +### Critical +Issues that must be fixed before merge. Each must include: +- Exact file path and line numbers (including out-of-diff files) +- Whether the finding is **in-diff** or **out-of-diff** +- Code path trace showing why the bug is real +- For out-of-diff findings: the contract from 2.5c that was violated and the callsite that triggers it +- Suggested fix + +### Moderate +Issues worth addressing but not blocking. + +### Minor +Style nits and suggestions. + +### Downgraded (false positives) +Findings from the initial review dismissed after source verification. For each: +- The original claim (one line) +- Why it was dismissed (one line, citing the specific code that disproves it) + +### Summary +- One-line verdict: approve, request changes, or needs discussion +- Highlight any regressions or tradeoffs +- State how many draft findings were verified vs dropped as false positives (e.g., "8 findings verified, 4 false positives removed") +- State the in-diff vs out-of-diff split (e.g., "5 findings in-diff, 3 findings out-of-diff"). If the diff is non-trivial and out-of-diff is zero, the cross-context pass likely underran — re-invoke Agent 9 with a wider grep before finalizing. 
diff --git a/.github/workflows/binary-check.yml b/.github/workflows/binary-check.yml new file mode 100644 index 00000000..061a326f --- /dev/null +++ b/.github/workflows/binary-check.yml @@ -0,0 +1,60 @@ +name: Binary check + +on: + pull_request: + push: + branches: + - main + +jobs: + reject-binaries: + name: Reject committed executables + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Scan for executable binaries + run: | + set -euo pipefail + if [ "${{ github.event_name }}" = "pull_request" ]; then + base="${{ github.event.pull_request.base.sha }}" + head="${{ github.event.pull_request.head.sha }}" + list_cmd=(git diff --name-only --diff-filter=AMRC "$base...$head") + scope="PR diff ($base..$head)" + else + list_cmd=(git ls-files) + scope="full tree" + fi + count=0 + violations=() + while IFS= read -r f; do + count=$((count + 1)) + [ -f "$f" ] || continue + mime=$(file --brief --mime-type -- "$f" 2>/dev/null || true) + case "$mime" in + *application/x-mach-binary*|\ + *application/x-executable*|\ + *application/x-pie-executable*|\ + *application/x-sharedlib*|\ + *application/x-dosexec*) + summary=$(printf '%s\n' "$mime" | head -n1) + violations+=("$f — $summary") + ;; + esac + done < <("${list_cmd[@]}") + echo "Scanned $scope: $count file(s)" + if [ ${#violations[@]} -gt 0 ]; then + echo "::error::Committed executable binaries detected:" + for v in "${violations[@]}"; do + echo " - $v" + done + echo + echo "Build artifacts must not be committed. For example dirs, use" + echo "'go run .' (no artifact) or 'go install' with GOBIN pointing" + echo "at a gitignored directory." + exit 1 + fi + echo "OK: no committed executables." 
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index eb09a773..73b3714b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -2,6 +2,88 @@ name: build on: [push] jobs: + # Cross-repo trigger: fire the questdb-enterprise + # build-and-test-e2e-go-client pipeline with this build's SHA so + # the go_client-marked failover tests run against an Enterprise + # primary. The Enterprise pipeline posts a status check back on the + # PR (`enterprise-e2e-go-client` context); this job fire-and-forgets. + dispatch-enterprise-e2e: + runs-on: ubuntu-latest + if: ${{ !github.event.repository.fork }} + steps: + - name: Queue enterprise go_client e2e + env: + ENT_DISPATCH_PAT: ${{ secrets.ENT_DISPATCH_PAT }} + ENT_ORG_URL: ${{ secrets.ENT_ORG_URL }} + run: | + set -euo pipefail + + if [ -z "${ENT_DISPATCH_PAT:-}" ] || [ -z "${ENT_ORG_URL:-}" ]; then + echo "ENT_DISPATCH_PAT or ENT_ORG_URL not set; skipping enterprise e2e dispatch." + exit 0 + fi + + PROJECT="questdb-enterprise" + PIPELINE_NAME="build-and-test-e2e-go-client" + + echo "Looking up '${PIPELINE_NAME}' in '${PROJECT}'..." + PIPELINES=$(curl -fsS -u ":${ENT_DISPATCH_PAT}" \ + "${ENT_ORG_URL}${PROJECT}/_apis/pipelines?api-version=7.0") + PIPELINE_ID=$(echo "$PIPELINES" | jq -r --arg n "$PIPELINE_NAME" \ + '.value[] | select(.name == $n) | .id' | head -1) + + if [ -z "$PIPELINE_ID" ] || [ "$PIPELINE_ID" = "null" ]; then + echo "Pipeline '${PIPELINE_NAME}' not found in '${PROJECT}'; skipping." + exit 0 + fi + + CLIENT_BRANCH="${GITHUB_REF_NAME}" + CLIENT_PR_NUMBER="" + + # Azure DevOps rejects empty-string template parameters with + # "The 'X' parameter is not a valid String." even when the + # pipeline YAML declares `type: string, default: ''`. Omitting + # the key entirely lets the YAML default kick in; the + # enterprise pipeline already handles empty goClientPrNumber + # by skipping the GitHub PR status post. 
+ BODY=$(jq -n \ + --arg commit "${GITHUB_SHA}" \ + --arg pr "${CLIENT_PR_NUMBER}" \ + --arg branch "${CLIENT_BRANCH}" \ + '{ + templateParameters: ({ + goClientCommit: $commit, + goClientPrNumber: $pr, + clientBranch: $branch + } | with_entries(select(.value != ""))), + resources: { repositories: { self: { refName: "refs/heads/main" } } } + }') + + echo "Dispatching enterprise e2e: commit=${GITHUB_SHA} branch=${CLIENT_BRANCH}" + + # Capture status and body separately so we can surface Azure + # DevOps's error message on 4xx. With -fsS the body is dropped + # and only "curl: (22) ... error: 400" reaches the log, which + # is useless for diagnosing parameter / YAML mismatches. + RESPONSE_FILE=$(mktemp) + HTTP_STATUS=$(curl -sS -u ":${ENT_DISPATCH_PAT}" \ + -H "Content-Type: application/json" \ + -X POST \ + -d "$BODY" \ + -o "$RESPONSE_FILE" \ + -w '%{http_code}' \ + "${ENT_ORG_URL}${PROJECT}/_apis/pipelines/${PIPELINE_ID}/runs?api-version=7.0") + + if [ "$HTTP_STATUS" -lt 200 ] || [ "$HTTP_STATUS" -ge 300 ]; then + echo "Dispatch failed with HTTP ${HTTP_STATUS}. Response body:" + cat "$RESPONSE_FILE" + echo "" + exit 1 + fi + + RUN_URL=$(jq -r '._links.web.href // ""' "$RESPONSE_FILE") + echo "Enterprise build queued: ${RUN_URL}" + test: runs-on: ubuntu-latest strategy: @@ -21,10 +103,57 @@ jobs: cache: true - name: Run vet + # Pin to the matrix-installed Go. Without this, a stray + # `toolchain` line re-added to go.mod by `go mod tidy` on a + # newer box would silently switch this job off the matrix + # version; `local` makes that fail loudly instead. + env: + GOTOOLCHAIN: local run: go vet ./... - name: Run Staticcheck run: go run honnef.co/go/tools/cmd/staticcheck@v0.7.0 ./... + - name: Build bench modules + # bench/* are separate Go modules with their own go.mod, so + # the root `go vet`/`go test ./...` above never touches them. + # Project convention is that bench/ builds. 
`go mod tidy + # -diff` fails (with an actionable diff) on go.mod/go.sum + # drift — e.g. an indirect dep pulled transitively from the + # root via `replace => ../..` but missing from the bench + # go.mod; `go build ./...` then proves they compile. The loop + # globs bench/*/go.mod so new bench modules are auto-gated. + # GOTOOLCHAIN=local pins to the matrix Go (see "Run vet"); the + # bench go.mod `go` directive is 1.23, satisfied by both + # matrix versions. + env: + GOTOOLCHAIN: local + run: | + set -euo pipefail + for mod in bench/*/go.mod; do + dir=$(dirname "$mod") + echo "::group::$dir" + ( cd "$dir" && go mod tidy -diff && go build ./... ) + echo "::endgroup::" + done + + - name: Run zero-alloc invariant (non-race) + # The QWP hot-path 0-allocs/op pins (the Test*ZeroAllocs cases in + # qwp_bench_test.go) self-skip under -race: race instrumentation + # forces stack-allocatable values to escape and inflates + # allocs/op. The "Run tests" step below is -race only, so without + # this dedicated non-race run the headline allocation-free + # guarantee would never actually be exercised in CI. Cheap — a + # handful of testing.AllocsPerRun loops, no network or Docker. + # GOTOOLCHAIN=local pins to the matrix Go (see "Run vet"). + env: + GOTOOLCHAIN: local + run: go test -run ZeroAllocs -count=1 . + - name: Run tests - run: go test -v ./... + # Pin to the matrix-installed Go (see "Run vet"). The + # Staticcheck step deliberately omits this: staticcheck@v0.7.0 + # needs go1.25 to build and must stay on GOTOOLCHAIN=auto. + env: + GOTOOLCHAIN: local + run: go test -race -v ./... diff --git a/.github/workflows/qwp-fuzz.yml b/.github/workflows/qwp-fuzz.yml new file mode 100644 index 00000000..c28319a7 --- /dev/null +++ b/.github/workflows/qwp-fuzz.yml @@ -0,0 +1,127 @@ +# Builds QuestDB from source and runs the Go QWP fuzz suite against it. 
+# +# Modelled on c-questdb-client's ci/run_fuzz_pipeline.yaml (clone questdb, +# build with Maven, point the test fixture at the built repo). The fixture +# (qwp_fuzz_fixture_test.go) then launches the SNAPSHOT jar exactly as +# system_test/fixture.py does. +# +# This job MUST actually build, start the server, and run the fuzz tests — +# never silently skip. Two guards enforce that: +# * QDB_FUZZ_STRICT=1 turns an unresolved/unstartable server from a +# green t.Skip into a red t.Fatal (see fuzzStrict in the fixture). +# * the explicit "Verify QuestDB jar" step fails the job with an +# actionable message if the build produced no server jar. +# The regular build.yml workflow is unaffected: it sets neither QDB_REPO +# nor QDB_FUZZ_STRICT, so the fuzz tests skip cleanly there. +# +# Runs on every pull request (and on demand). The QuestDB build plus +# first-time dependency download takes a few minutes; the ~/.m2 cache +# below keeps repeat runs short. +name: qwp-fuzz + +on: + workflow_dispatch: + pull_request: + +jobs: + qwp-fuzz: + name: QWP fuzz vs QuestDB master + runs-on: ubuntu-latest + timeout-minutes: 60 + steps: + - name: Checkout go-questdb-client + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Install Go + uses: actions/setup-go@v5 + with: + go-version: "1.24.x" + cache: true + + # QuestDB's maven-enforcer-plugin pins java.enforce.version=25 + # (core/pom.xml); building with anything else fails the build. + # c-questdb-client uses JDK 25 for the same reason. + - name: Install JDK 25 + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: "25" + + - name: Clone QuestDB + # Shallow clone of master. The C submodules ship prebuilt, so no + # blanket submodule init is needed; the java-questdb-client + # submodule is fetched on demand by the detect step below. 
+ run: git clone --depth 1 https://github.com/questdb/questdb.git + + - name: Detect local client profile + # core/pom.xml has a test-scoped dependency on + # org.questdb:questdb-client, and `package -DskipTests` still + # compiles tests, so Maven must resolve it. A release version + # resolves from Maven Central; a -SNAPSHOT version (server and + # client evolving in lockstep) exists only as the + # java-questdb-client submodule, built in-reactor via the + # local-client profile. Mirrors the server repo's + # .github/actions/detect-local-client composite action. + id: client + run: | + set -euo pipefail + version="$(sed -n 's/.*<questdb.client.version>\(.*\)<\/questdb.client.version>.*/\1/p' questdb/core/pom.xml | head -1)" + echo "questdb.client.version=${version:-}" + if [[ "$version" == *-SNAPSHOT ]]; then + echo "SNAPSHOT client version — building the java-questdb-client submodule via -Plocal-client" + git -C questdb submodule update --init java-questdb-client + echo "client_profile=-Plocal-client" >> "$GITHUB_OUTPUT" + else + echo "Release client version — resolving from Maven Central" + echo "client_profile=" >> "$GITHUB_OUTPUT" + fi + + # QuestDB pulls a large, slow-moving dependency set; cache ~/.m2 so + # repeat runs skip the multi-minute first-time download. Key + # rotates when this workflow changes. + - name: Cache Maven repository + uses: actions/cache@v4 + with: + path: ~/.m2/repository + key: m2-questdb-${{ runner.os }}-${{ hashFiles('.github/workflows/qwp-fuzz.yml') }} + restore-keys: | + m2-questdb-${{ runner.os }}- + + - name: Build QuestDB server jar + # Minimal, verified build: produces core/target/questdb- + # <version>-SNAPSHOT.jar + core/target/classes/.../site/conf in ~30s + # (warm .m2). With local-client active, -am additionally builds + # the java-questdb-client module ahead of core (~10s). No + # -Pbuild-web-console: the embedded console UI is irrelevant to + # QWP / /exec / /ping, and skipping it removes the Node-download + # failure surface.
JAVA_HOME is exported by setup-java; the + # enforcer verifies it is JDK 25. + run: mvn -B -ntp -DskipTests -pl core -am package -f questdb/pom.xml ${{ steps.client.outputs.client_profile }} + + - name: Verify QuestDB jar exists + # Defense in depth: if Maven "succeeded" but emitted no server + # jar (wrong module, layout change), fail HERE with a precise + # message instead of letting the fuzz step skip/fatal opaquely. + run: | + set -euo pipefail + jar="$(ls questdb/core/target/questdb-*-SNAPSHOT.jar 2>/dev/null \ + | grep -v -- '-tests.jar' | head -n1 || true)" + if [ -z "$jar" ]; then + echo "::error::No QuestDB server jar under questdb/core/target — the build did not produce it." + ls -la questdb/core/target/ || true + exit 1 + fi + echo "Found server jar: $jar" + + - name: Run QWP fuzz + integration tests + env: + GOTOOLCHAIN: local + QDB_REPO: ${{ github.workspace }}/questdb + # Make a missing/unstartable server a hard failure, not a skip. + # Applies to both ^TestQwpFuzz and ^TestQwpIntegration: the + # integration suite now boots the same shared fuzz fixture + # instead of probing an absent localhost:9000 server. + QDB_FUZZ_STRICT: "1" + run: go test -count=1 -timeout 30m -run '^TestQwp(Fuzz|Integration)' -v . diff --git a/.gitignore b/.gitignore index f5eee09d..51c7c6cd 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,28 @@ # Test binary, built with `go test -c` *.test +# Go build artifacts from the `package main` dirs (each `go build` +# produces a binary named after its directory). binary-check.yml is the +# CI backstop; these keep `git status` clean locally. 
+/bench/qwp-egress-read/qwp-egress-read +/bench/qwp-egress-read-wide/qwp-egress-read-wide +/examples/from-conf/from-conf +/examples/tcp/basic/basic +/examples/tcp/auth/auth +/examples/tcp/auth-and-tls/auth-and-tls +/examples/http/basic/basic +/examples/http/auth/auth +/examples/http/auth-and-tls/auth-and-tls +/examples/qwp/basic/basic +/examples/qwp/basic-query/basic-query +/examples/qwp/sf/sf +/system_test/enterprise_e2e/sidecar/sidecar + +# Python caches from the system_test/enterprise_e2e fixtures +__pycache__/ +.pytest_cache/ +.venv/ + # Output of the go coverage tool, specifically when used with LiteIDE *.out diff --git a/CLAUDE.md b/CLAUDE.md index 19d95786..d1afb824 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,249 +1,217 @@ # CLAUDE.md -This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. +This file provides guidance to Claude Code (claude.ai/code) when working with +code in this repository. It captures invariants and "where to look" pointers — +for specifics (file contents, constants, config-key catalog, error categories) +read the code, which is authoritative. ## Project -Go client library for QuestDB ingestion. Three transports are supported: +Go client library for QuestDB ingestion. Three transports: -- **HTTP / HTTPS** — InfluxDB Line Protocol (ILP), recommended for most workloads. -- **TCP / TCPS** — ILP over raw TCP, kept for low-overhead deployments. -- **WS / WSS (QWP)** — QuestDB's binary **columnar** wire protocol over - WebSocket. Higher throughput than ILP for wide rows, and the only - transport that exposes the full QuestDB type system (int8/int16/int32, - float32, char, date, timestamp-nanos, uuid, varchar, geohash, int64 - arrays). +- **HTTP / HTTPS** and **TCP / TCPS** — the legacy InfluxDB Line Protocol (ILP). +- **WS / WSS (QWP)** — QuestDB's binary columnar wire protocol over WebSocket. 
+ The only transport exposing the full type system (int8/16/32, float32, char, + date, timestamp-nanos, uuid, varchar, geohash, int64 arrays). **QWP is not a + version of ILP** — distinct framing, codecs, and server handshake. -Module path: `github.com/questdb/go-questdb-client/v4` — the `/v4` segment -is load-bearing; keep the suffix when importing within this repo. Minimum -Go version: **1.23** (go.mod pins `go 1.23` with a `1.24.4` toolchain -directive). +Module path: `github.com/questdb/go-questdb-client/v4` — the `/v4` segment is +load-bearing when importing within this repo. Minimum Go: 1.23 (go.mod pins +`go 1.23` with a `1.24.4` toolchain). ## Commands ```bash -# Fetch the interop test vectors (required for interop_test.go). +# Required for interop_test.go. git submodule update --init --recursive # Static analysis (run by CI). go vet ./... go run honnef.co/go/tools/cmd/staticcheck@v0.7.0 ./... -# Full test suite. Integration tests (both ILP and QWP) spin up QuestDB -# containers via testcontainers-go, so Docker must be running locally. +# Tests. Integration suites spin up QuestDB containers via +# testcontainers-go, so Docker must be running for those. go test -v ./... -# Run a single test or suite. testify suites are dispatched via the -# top-level Test*Suite entry point plus the method name. +# Single suite — testify suites dispatch via the top-level +# Test*Suite entry point plus the method name. go test -v -run TestIntegrationSuite/TestE2EValidWrites . go test -v -run TestQwpIntegrationSuite . -go test -v -run TestHttpHappyCasesFromConf . -# Benchmarks — the QWP hot path is allocation-tracked. +# Allocation-tracked benchmark on the QWP hot path. go test -v -bench BenchmarkQwpSenderSteadyState -benchmem -run ^$ . ``` -There is no Makefile or build step — consumers import the package -directly. 
The `examples/` tree (`from-conf`, `http/...`, `tcp/...`) holds -compilable sample `main.go` files referenced by `examples.manifest.yaml`, -which questdb.io uses to render docs, so keep paths and filenames stable -when editing examples. +`examples/` ships compilable `main.go` files referenced by +`examples.manifest.yaml`, which questdb.io uses to render docs — keep paths and +filenames stable. ## Architecture -The public surface is the `LineSender` interface defined in `sender.go`. -All fluent builder methods (`Table`, `Symbol`, `*Column`, `At`, `AtNow`, -`Flush`, `Close`) are declared there; every transport implementation must -satisfy it. QWP adds a **superset** interface `QwpSender` (in -`qwp_sender.go`) with the extra column types listed above — callers that -want QWP-only columns must type-assert the returned sender to -`QwpSender`. - -### Transports and protocol versions - -Two factories are the only entry points: - -- `LineSenderFromConf(ctx, "schema::addr=...;key=value;...")` — parses - the config string in `conf_parse.go`. Supported schemas: `http`, - `https`, `tcp`, `tcps`, `ws`, `wss`. -- `NewLineSender(ctx, opts...)` — functional options. One of `WithHttp`, - `WithTcp`, or `WithQwp` is required; a missing sender type returns - *"sender type is not specified: use WithHttp, WithTcp, or WithQwp"*. - `NewLineSender` makes two passes over the options: the first discovers - the transport so per-transport defaults can be applied, the second - applies every option against the seeded config. - -Both funnel through `lineSenderConfig` and `newLineSender` in -`sender.go`, which dispatches to per-transport sanitizers -(`sanitizeHttpConf`, `sanitizeTcpConf`, `sanitizeQwpConf`) and -constructors (`newHttpLineSender`, `newTcpLineSender`, -`newQwpLineSenderFromConf`). 
- -**ILP protocol versions.** HTTP and TCP transports each have three -concrete structs, one per ILP protocol version: V1 is text-only, V2 adds -binary `float64` and n-dimensional `float64` arrays, V3 adds decimals. - -- `httpLineSender`, `httpLineSenderV2`, `httpLineSenderV3` — `http_sender.go` -- `tcpLineSender`, `tcpLineSenderV2`, `tcpLineSenderV3` — `tcp_sender.go` - -HTTP auto-negotiates the protocol version with the server; TCP requires -`WithProtocolVersion(ProtocolVersion2|3)` or -`protocol_version=2|3` in the config string. When adding a new column -type or ILP feature, expect to touch all six ILP structs, the -`LineSender` interface, `buffer.go` (raw ILP encoding), and the -`Messages` / `MsgCount` / `BufLen` / `ProtocolVersion` switch helpers in -`export_test.go`. +Public surface: `LineSender` interface in `sender.go`. Every transport satisfies +it. `QwpSender` (in `qwp_sender.go`) is a superset for QWP-only column types — +callers wanting them must type-assert. + +Two entry points: `LineSenderFromConf(ctx, "schema::addr=...;key=value")` +(parser in `conf_parse.go`; schemas: `http`, `https`, `tcp`, `tcps`, `ws`, +`wss`) and `NewLineSender(ctx, opts...)` (requires one of `WithHttp`, `WithTcp`, +`WithQwp`). Both funnel through `lineSenderConfig` and `newLineSender` in +`sender.go`. **`conf_parse.go` is the single source of truth for supported +config keys.** + +### ILP (HTTP / TCP) + +Three protocol versions: V1 text-only, V2 adds binary `float64` and +n-dimensional float arrays, V3 adds decimals. Each transport has three concrete +structs — `httpLineSender{,V2,V3}` in `http_sender.go`, `tcpLineSender{,V2,V3}` +in `tcp_sender.go`. + +HTTP auto-negotiates the version; TCP requires `WithProtocolVersion(...)` or +`protocol_version=2|3`. 
**Adding a new ILP column type or feature touches all +six structs**, the `LineSender` interface, `buffer.go` (raw ILP encoding), and +the `Messages` / `MsgCount` / `BufLen` / `ProtocolVersion` switch helpers in +`export_test.go`. Keep those switches exhaustive. ### QWP (WebSocket columnar protocol) -QWP is not a version of ILP — it is a distinct binary protocol with its -own framing, codecs, and server handshake. Everything QWP lives in -`qwp_*.go`: - -- `qwp_constants.go` — magic (`"QWP1"`), header flags (Gorilla timestamp - encoding, delta symbol dictionary), type codes, and ACK status codes. -- `qwp_wire.go` — low-level wire primitives; little-endian fixed-width - writers and unsigned LEB128 varint encoding. -- `qwp_buffer.go` — `qwpColumnBuffer` (per-type columnar storage, - bit-packed booleans, offset+data for strings, separate null bitmap) - and `qwpTableBuffer` (gap-fill, row cancel, per-table schema id). - This replaces the ILP text buffer for QWP senders; the same hot-path - discipline applies but the data is stored in columnar form until the - encoder serializes a batch. Null-handling strategy mirrors the Java - client: wide types (INT, LONG, FLOAT, DOUBLE, TIMESTAMP, - TIMESTAMP_NANOS, DATE, STRING, VARCHAR, SYMBOL, UUID, LONG256, - DECIMAL*, DOUBLE_ARRAY, LONG_ARRAY) use the null bitmap path - (`nullable=true`); narrow types (BOOLEAN, BYTE, SHORT, CHAR) plus - GEOHASH use a type-specific sentinel and emit `null_flag=0`. The - bitmap is grown lazily only when a null is marked, so - `len(nullBitmap)` may be less than `ceil(rowCount/8)` when trailing - rows are non-null. -- `qwp_encoder.go` — builds a multi-table QWP message from a set of - table buffers in one flush. -- `qwp_gorilla.go` — delta-of-delta timestamp compression. Encoder - emits a 1-byte encoding flag (`0x00` uncompressed, `0x01` Gorilla) - only when `FLAG_GORILLA` is set on the message header. Falls back - to uncompressed when the column has ≤ 2 non-null values or any DoD - exceeds int32. 
-- `qwp_transport.go` — WebSocket transport built on - `github.com/coder/websocket` (the only non-stdlib runtime dependency - for QWP). Performs the `/write/v4` HTTP upgrade with QWP version - negotiation headers (`X-QWP-Max-Version`, `X-QWP-Client-Id`). Reads - 9-byte ACK frames (1-byte status + 8-byte cumulative sequence - number). Supports an optional dump writer that records all outgoing - bytes including the HTTP upgrade handshake. -- `qwp_errors.go` — `QwpError` with typed status codes parsed from ACKs. -- `qwp_sender.go` — `qwpLineSender` (implements both `LineSender` and - `QwpSender`), with *double-buffered* encoders so async mode can encode - batch N+1 while batch N is flying. Sync mode uses only `encoders[0]`. - Schema IDs are small integers allocated sequentially from - `nextSchemaId` and stored on each `qwpTableBuffer`; a batch uses - *reference mode* when the table's `schemaId <= maxSentSchemaId`, - otherwise *full mode*. A column-set change resets the table's - `schemaId` to `-1` so a fresh ID is allocated. -- `qwp_sender_async.go` — `qwpAsyncState`, the dedicated I/O goroutine - (`ioLoop`), and the non-blocking-enqueue / blocking-drain split. - Cancellable via context; `Close()` waits up to `closeTimeout` - (default 5s) before force-cancelling. - -Async mode is the default: the QWP sender is seeded with -`qwpDefaultInFlightWindow = 128`. Override with `WithInFlightWindow(n)` -or `in_flight_window=n` in the config. `WithInFlightWindow(1)` forces -synchronous mode — each `Flush` blocks until the ACK arrives. - -Delta symbol dictionaries send only new symbols since the last cache -advance. Cache-advancement timing differs by mode and mirrors the Java -client: - -- **Sync mode**: `maxSentSchemaId` / `maxSentSymbolId` advance only - after the server ACKs the batch. A failed flush leaves the caches - untouched, so a retry re-sends the full schema and the symbol delta. 
-- **Async mode**: caches advance immediately after a successful - *enqueue*, not after the ACK. Safety comes from the sender being - terminal on I/O error — if any in-flight batch fails, `asyncState.ioErr` - is set and every subsequent user-facing call returns that error, so - stale cache state can never reach the wire. - -### Config string reference - -`conf_parse.go` is the single source of truth for supported keys. -Non-obvious behaviors: - -- `username`, `password`: Basic auth for HTTP **and QWP**; for TCP, - `username` is the ECDSA key ID and `token` is the secret (`D`) value. -- `token`: Bearer token for HTTP and QWP; ECDSA secret for TCP. -- `in_flight_window`, `close_timeout` (ms): QWP-only. -- `protocol_version=auto|1|2|3`: ILP-only. -- `tls_roots`, `tls_roots_password`: explicitly rejected — the Go - client uses the system cert pool via `crypto/tls` defaults. +Everything QWP lives in `qwp_*.go`. The buffer (`qwp_buffer.go`), encoder +(`qwp_encoder.go`), wire primitives (`qwp_wire.go`), and transport +(`qwp_transport.go`) form the columnar codec stack. The sender (`qwp_sender.go` ++ `qwp_sender_cursor.go`) implements `LineSender` and `QwpSender` on top of it. + +**All wire I/O — memory-backed *and* disk-backed — goes through the cursor +engine + send loop** in `qwp_sf_*.go`. `sf_dir` empty selects memory-backed +segments; set selects disk-backed under `<sf_dir>/<sender_id>/*.sfa` (that +per-sender directory is itself the slot — there is no extra slot level), +on-disk-compatible with the Java client's `MmapSegment.java`. The producer +encodes a batch into `qwpSfCursorEngine` via `engineAppendBlocking`; the +`qwpSfSendLoop` goroutine drains it to the WebSocket, parses ACKs, advances +`engineAckedFsn`, and owns reconnect + replay from `engineAckedFsn() + 1`. + +**Cursor frames are self-sufficient** — full schema definitions plus the full +symbol dictionary from id 0, every flush. This is what makes +reconnect/replay/orphan-adoption safe across a fresh server connection.
+ +**The wire carries no schema id and no schema mode byte.** A table block is +`table_name, row_count, col_count, inline columns, column data`; the inline +column definitions are the authoritative schema, repeated on every frame. There +is no `nextSchemaId` accumulator on the sender, no per-table `schemaId` field on +the table buffer, no schema-change detection, and no reference mode. (QWP once +carried a mode byte + schema id plus a schema-reference optimisation; it was +removed across the server and all clients.) On egress, the decoder parses the +schema from the first `RESULT_BATCH` of a query (`batch_seq == 0`) into +`qwpQueryDecoder.querySchema` and reuses it for that query's continuation +batches; `qwpEgressIO.dispatcherRun` calls `resetQuerySchema` at the start of +every query so a schema never leaks across query boundaries. + +Symbol-dict tracking (`maxSentSymbolId`, `batchMaxSymbolId`) is still in place, +and both fields are load-bearing. The encoder always passes `-1` as the +`maxSentId` arg of `encodeMultiTableWithDeltaDict` to force "full dict from id +0", but `batchMaxSymbolId` is the separate `batchMaxId` arg and bounds the dict +actually written: `writeDeltaDict` emits `globalDict[0..batchMaxSymbolId]`, so +dropping it would silently truncate the symbol dict. `maxSentSymbolId` is the +cross-flush high-water mark that `resetAfterFlush` rewinds `batchMaxSymbolId` to +(never to `-1`), so a later batch reusing only earlier symbols still writes the +full dict its rows reference. Both are also read by tests and external +observers, but that is incidental to their wire role. + +`WithInFlightWindow(n)` / `in_flight_window=n` is **retained but a no-op** in +the cursor architecture — backpressure is governed by the engine's segment-ring ++ `engineAppendBlocking` deadline. 
+ +### Java-parity QWP knobs (not in connect-string.md) + +These connect-string keys are recognised by the Java client +(`Sender.java`) but are not listed in the +[native-client spec](https://github.com/questdb/questdb-enterprise/blob/main/questdb/docs/qwp/connect-string.md). +We accept them for Java-parity portability — a connect string that +works on the Java client must work here. None should ever be +considered for removal without a matching change in Java: + +- `gorilla=on|off` — gates the Gorilla timestamp encoding in + `qwp_encoder.go` (FLAG_GORILLA). Default `on`. +- `in_flight_window=N` — see the "retained but a no-op" note above. + +`close_timeout=N` (millisecond integer) was a v4.0–v4.5 Go-only key +for the memory-mode close path. The cursor architecture unified +memory and SF onto `close_flush_timeout_millis`, which the spec +also defines. The parser now rejects `close_timeout=` with a +migration hint pointing at `close_flush_timeout_millis`. +`WithCloseTimeout(d)` is retained as a deprecated alias that routes +positive durations through `close_flush_timeout_millis`; new code +should use `WithCloseFlushTimeout` directly. + +Flush semantics: `Flush` / `FlushAndGetSequence` **never wait for the server +ACK** — they return once the batch is published into the cursor engine (in-RAM +for memory mode, on-disk for SF) and the send loop delivers + replays it in the +background. This matches the Java spec (`design/qwp-cursor-durability.md` +decision #1: "flush() never waits for ACK; ACKs are async") and is uniform +across both the pending-rows and zero-pending branches and auto-flush — all +route through `enqueueCursor`; explicit `Flush` only additionally surfaces a +latched send-loop error eagerly. (`Flush` was an ACK barrier +through v4.2.0; that contract was dropped when the cursor/SF architecture made +local persistence, not the ACK, the durability guarantee.) 
`FlushAndGetSequence` returns the +published FSN — the upper bound of any `SenderError.ToFsn` for that batch; +**pair it with `AwaitAckedFsn` for server-ACK confirmation** (the dedicated +primitive now that `Flush` no longer blocks on ACKs). + +Orphan-slot adoption (SF mode, `drain_orphans=on`) is implemented in +`qwp_sf_orphan.go` + `qwp_sf_drainer.go` + `qwp_sf_round_walk.go`; drainers run +in dedicated goroutines and are visible via `QwpSender.BackgroundDrainers()`. + +### Error handling + +QWP server rejections surface as `*SenderError` (`sender_error.go` is canonical +for categories + policy enum). Two paths: async callback registered via +`WithErrorHandler`, and producer-side typed error via `errors.As` after `Flush` +/ `FlushAndGetSequence`. + +Policy resolution precedence (highest first): `WithErrorPolicyResolver` → +`WithErrorPolicy(category, ...)` → connect-string `on_*_error` → +`on_server_error` → spec defaults. `PROTOCOL_VIOLATION` and `UNKNOWN` are never +user-configurable — always HALT. + +A HALT latches the typed error on the I/O loop; `sendLoopCheckError()` surfaces +it on the next producer call. The sender does not auto-resume — close + rebuild +is the supported recovery (matches Java). ### Connection pooling -`sender_pool.go` provides `LineSenderPool` (`PoolFromConf`, -`NewLineSenderPool`). It is HTTP-only by design — non-HTTP configs -(TCP/TCPS and WS/WSS) are rejected with `errHttpOnlySender`. QWP has -its own in-flight-window concurrency model and does not participate in -the pool. The HTTP transport itself is shared across all -`httpLineSender*` instances via the `globalTransport` singleton, which -closes idle connections when the last sender is released. - -### Value types - -- `decimal.go` — QuestDB's arbitrary-precision `Decimal`, the - `ShopspringDecimal` adapter, and `NewDecimalFromString` / - `NewDecimalFromFloat` constructors. 
Used by both ILP V3 - (`DecimalColumn*` methods) and QWP (which transmits the fixed-width - Decimal64/128/256 wire forms). -- `ndarray.go` — generic `NdArray[T]` used by `Float64ArrayNDColumn`. - 1D/2D/3D convenience methods wrap it. `MaxArrayElements` (`(1 << 28) - - 1`) caps total element count. QWP additionally supports - `Int64Array{1,2,3}DColumn` via the same columnar buffer machinery. - -## Testing layout - -- `buffer_test.go`, `conf_test.go`, `tcp_sender_test.go`, - `http_sender_test.go`, `sender_pool_test.go`, `ndarray_test.go`, - `qwp_buffer_test.go`, `qwp_encoder_test.go`, `qwp_sender_test.go`, - `qwp_sender_async_test.go`, `qwp_wire_test.go`, - `qwp_errors_test.go`, `qwp_transport_test.go` — pure unit tests, no - Docker required. QWP unit tests use `httptest.Server` to stand in for - the QuestDB WebSocket endpoint (`newQwpTestServer` in - `qwp_sender_test.go`). -- `integration_test.go`, `http_integration_test.go`, - `tcp_integration_test.go`, `qwp_integration_test.go` — boot real - QuestDB via testcontainers-go (HTTP/TCP suites sometimes also launch - haproxy via `test/haproxy.cfg`). These require Docker and pull - images on first run. -- `interop_test.go` + `test/interop/questdb-client-test` (git submodule) - — cross-language ILP conformance vectors shared across all QuestDB - client libraries. -- `qwp_bench_test.go` — `BenchmarkQwpSenderSteadyState` asserts **0 - allocs/op** on the Table→Symbol→Column→At pipeline after warmup. - Preserve this invariant: any new allocation in that hot path should - be moved to a reusable scratch buffer on `qwpLineSender` (see - `encodeInfoBuf`, `pendingSchemaKeysBuf` for the pattern). -- `export_test.go` re-exports unexported identifiers (including - `QwpSenderType`) into the `questdb` package for black-box tests in - package `questdb_test`. When adding internals tests must reach, - extend this file rather than making production code public. 
The - `Messages` / `MsgCount` / `BufLen` / `ProtocolVersion` helpers switch - across all concrete sender types — keep them exhaustive. +`sender_pool.go` (`LineSenderPool`) is **HTTP-only by design** — TCP/QWP configs +are rejected with `errHttpOnlySender`. QWP has its own concurrency model and +doesn't participate. + +## Testing + +QWP unit tests use `httptest.Server` to stand in for the QuestDB WebSocket +endpoint (`newQwpTestServer` in `qwp_sender_test.go`). ILP unit tests are pure. + +`*_integration_test.go` files need Docker — they spin up real QuestDB via +testcontainers-go; HTTP/TCP suites sometimes launch haproxy via +`test/haproxy.cfg`. + +Cross-language conformance: `interop_test.go` + +`test/interop/questdb-client-test` (submodule) — ILP vectors shared across +QuestDB client libraries. + +`BenchmarkQwpSenderSteadyState` in `qwp_bench_test.go` asserts **0 allocs/op** +on the Table→Symbol→Column→At pipeline after warmup (pinned in +`TestQwpSenderSteadyStateZeroAllocs`). Preserve this: any new allocation in that +hot path moves to a reusable scratch buffer on `qwpLineSender` (see +`encodeInfoBuf` for the pattern). + +`export_test.go` re-exports unexported identifiers (including `QwpSenderType`) +into the `questdb` package for black-box tests in package `questdb_test`. When +adding internals tests must reach, extend this file rather than making +production code public. ## Conventions -- Every `.go` file starts with the QuestDB Apache-2.0 license banner; - preserve it when creating new files. -- Column/table/symbol names have an explicit disallowed-character set - documented on each `LineSender` method. ILP validation lives in - `buffer.go`; QWP validation lives in `qwp_buffer.go`. -- Errors returned from ILP methods are **latched on the buffer** — the - fluent API keeps returning the same sender, and the error surfaces on - the next `At`/`AtNow`/`Flush`. QWP follows the same pattern on its - per-row builder. Preserve this when adding methods. 
-- QWP schema/symbol cache advancement differs by mode. In sync mode - (`flushSync`), advance `maxSentSchemaId` / `maxSentSymbolId` only - after a successful ACK. In async mode (`flushAsync`, `enqueueFlush`), - advance them immediately after a successful enqueue — the sender is - terminal on I/O error (`asyncState.ioErr` poisons every subsequent - call), so stale cache state cannot reach the wire on a live - connection. Both behaviors match the Java client. +- Every `.go` file starts with the QuestDB Apache-2.0 license banner; preserve + it when creating new files. +- Column/table/symbol name validation: ILP in `buffer.go`, QWP in + `qwp_buffer.go`. The disallowed-character set is documented on each + `LineSender` method. +- **Errors on the fluent API latch** — `Table` / `Symbol` / `*Column` keep + returning the sender; the latched error surfaces on the next `At` / `AtNow` / + `Flush`. Preserve this when adding methods. diff --git a/README.md b/README.md index fd2d8d9d..ce40977f 100644 --- a/README.md +++ b/README.md @@ -6,17 +6,26 @@ Golang client for QuestDB's [Influx Line Protocol](https://questdb.io/docs/refer (ILP) over HTTP and TCP. This library makes it easy to insert data into [QuestDB](https://questdb.io). -The library requires Go 1.19 or newer. +The library requires Go 1.23 or newer. Features: * [Context](https://www.digitalocean.com/community/tutorials/how-to-use-contexts-in-go)-aware API. * Optimized for batch writes. -* Supports TLS encryption and ILP authentication. -* Automatic write retries and connection reuse for ILP over HTTP. -* Tested against QuestDB 7.3.10 and newer versions. +* Three transports: ILP over HTTP and TCP, plus QWP (QuestDB's binary + columnar protocol) over WebSocket. +* Supports TLS encryption and authentication. +* Automatic write retries and connection reuse for ILP over HTTP; + store-and-forward, reconnect, and multi-host failover for QWP. 
New in v4: -* Supports n-dimensional arrays of doubles for QuestDB servers 9.0.0 and up +* QWP WebSocket transport exposing the full QuestDB type system, with a + typed server-error API and multi-host failover. +* N-dimensional arrays of doubles (QuestDB server 9.0.0 and up). +* Fixed-width decimal columns (QuestDB server 9.2.0 and up). + +ILP over HTTP/TCP is compatible with QuestDB 7.3.10 and newer. The QWP +transport, arrays, and decimals require the newer server versions noted +above. Documentation is available [here](https://pkg.go.dev/github.com/questdb/go-questdb-client/v4). @@ -99,6 +108,272 @@ HTTP is the recommended transport to use. To connect via TCP, set the configurat // ... ``` +## QuestDB Wire Protocol (QWP) over WebSocket + +QWP is QuestDB's binary *columnar* wire protocol. Compared to ILP, it +offers higher throughput for wide rows and exposes the full QuestDB type +system — including `byte`, `short`, `int`, `float`, `char`, `date`, +nanosecond timestamps, `uuid`, `geohash`, `int64` arrays, and +fixed-width decimals. + +Switch the Quickstart to QWP by changing the schema to `ws` (plain) or +`wss` (TLS): + +```go +sender, err := qdb.LineSenderFromConf(ctx, "ws::addr=localhost:9000;") +``` + +The full fluent API shown in the Quickstart (`Table`, `Symbol`, +`Float64Column`, `Int64Column`, `At`, `AtNow`, `Flush`, `Close`) works +unchanged, as do the array and decimal methods shown below. QWP is a +distinct binary protocol rather than a version of ILP, so the +`protocol_version` configuration key does not apply. + +### QWP-only column types + +To access types that ILP does not expose, type-assert the sender to +`qdb.QwpSender`: + +```go +sender, err := qdb.LineSenderFromConf(ctx, "ws::addr=localhost:9000;") +if err != nil { + log.Fatal(err) +} +defer sender.Close(ctx) +qwp := sender.(qdb.QwpSender) + +err = qwp. + Table("sensors"). + Symbol("site", "roof"). + ByteColumn("status_code", 3). + ShortColumn("battery", 4812). 
+ Int32Column("sample_count", 120_000). + Float32Column("temperature", 21.7). + CharColumn("grade", 'A'). + DateColumn("calibrated", time.Now()). + TimestampNanosColumn("captured", time.Now()). + UuidColumn("device_id", 0x0123456789abcdef, 0xfedcba9876543210). + GeohashColumn("location", 0x1fb9, 15). + Int64Array1DColumn("raw_counts", []int64{10, 20, 30}). + Decimal64Column("voltage", qdb.NewDecimalFromInt64(12345, 4)). + AtNano(ctx, time.Now()) +``` + +`QwpSender` adds: `ByteColumn`, `ShortColumn`, `Int32Column`, +`Float32Column`, `CharColumn`, `DateColumn`, `TimestampNanosColumn`, +`UuidColumn`, `GeohashColumn`, `Int64Array1DColumn`, +`Int64Array2DColumn`, `Int64Array3DColumn`, `Decimal64Column`, +`Decimal128Column`, `Decimal256Column`, and `AtNano` (nanosecond- +resolution designated timestamp; `At` uses microseconds). + +### Flush semantics and backpressure + +The QWP sender always pipelines encoding with transmission: a dedicated +I/O goroutine drains a cursor engine to the WebSocket and owns reconnect +and replay. You do not configure a pipeline depth — backpressure is +governed by the engine's segment ring and the append deadline +(`sf_append_deadline_millis` in store-and-forward mode), not by a +fixed in-flight count. + +`in_flight_window` / `qdb.WithInFlightWindow(n)` is **retained for +backward compatibility but is a no-op** in this architecture. Connect +strings carrying it still parse; the value is ignored. + +`Flush` and `FlushAndGetSequence` **never wait for the server ACK**. +They return once the batch is published into the cursor engine — in +RAM for memory mode, on disk for store-and-forward — after which the +I/O goroutine delivers and replays it in the background. A returned +`Flush` therefore means the batch is durably *published*, not that the +server has confirmed it: in memory mode, a process exit before the +background send completes can still lose unacked rows. 
Auto-flush +(triggered by row/byte/interval thresholds) follows the same +publish-only path. For server-ACK confirmation, `FlushAndGetSequence` +returns the published FSN (the upper bound of any `SenderError.ToFsn` +for that batch); pair it with `AwaitAckedFsn` to wait for the server +to confirm that FSN. + +### Authentication + +Basic auth and bearer tokens work the same way as for HTTP: + +```go +qdb.LineSenderFromConf(ctx, "wss::addr=host:9000;username=admin;password=secret;") +qdb.LineSenderFromConf(ctx, "wss::addr=host:9000;token=<token>;") +``` + +`LineSenderPool` is HTTP-only and cannot be used with QWP — QWP's +cursor engine already pipelines transmission from a single sender. + +### Error handling + +When the server rejects a published QWP batch, the rejection surfaces +as a `*qdb.SenderError` carrying a stable `Category` +(`SCHEMA_MISMATCH`, `PARSE_ERROR`, `INTERNAL_ERROR`, `SECURITY_ERROR`, +`WRITE_ERROR`, `PROTOCOL_VIOLATION`, `UNKNOWN`), the server message, +and the `[FromFsn, ToFsn]` span — join that span against the value +returned by `FlushAndGetSequence` to identify exactly which rows were +rejected. + +There are two delivery paths, both carrying the same payload: + +```go +sender, err := qdb.NewLineSender(ctx, + qdb.WithQwp(), + qdb.WithAddress("localhost:9000"), + // Async: dead-letter channel for DROP_AND_CONTINUE batches. + qdb.WithErrorHandler(func(e *qdb.SenderError) { + log.Printf("rejected fsn=[%d,%d] %s: %s", + e.FromFsn, e.ToFsn, e.Category, e.ServerMessage) + }), +) +// ... + +// Sync: after a HALT, the typed error surfaces on the next +// producer-thread call (At / AtNow / Flush). +if err := sender.Flush(ctx); err != nil { + var se *qdb.SenderError + if errors.As(err, &se) { + // inspect se.Category, se.ServerMessage, se.FromFsn, ... 
+ } +} +``` + +Each `Category` resolves to a `Policy` — `HALT` (latch the error; +the sender does not drain further until you close and rebuild it) or +`DROP_AND_CONTINUE` (drop the rejected span from the store and keep +going; recover the data via the async handler). Resolution precedence, +highest first: `WithErrorPolicyResolver` → `WithErrorPolicy(category, +policy)` → connect-string `on_<category>_error` → connect-string +`on_server_error` → spec defaults. `PROTOCOL_VIOLATION` and `UNKNOWN` +are always `HALT` and cannot be overridden. + +The connect-string equivalents take `halt` / `drop` (and `auto` for +the global key): + +```go +qdb.LineSenderFromConf(ctx, + "ws::addr=localhost:9000;"+ + "on_server_error=halt;"+ // global default + "on_schema_error=drop;"+ // per-category override + "on_write_error=drop;") +``` + +Per-category keys are `on_schema_error`, `on_parse_error`, +`on_internal_error`, `on_security_error`, and `on_write_error`. + +### Multi-host failover + +`addr` accepts a comma-separated list (or repeated `addr=` keys) for +transparent failover. The client walks the list in priority order on +connect and reconnect; it does not shuffle or load-balance — that is +the server-side coordinator's job. + +```go +qdb.LineSenderFromConf(ctx, + "ws::addr=node-a:9000,node-b:9000,node-c:9000;") +``` + +`target` constrains which endpoints are acceptable by replicated-cluster +role: `any` (default), `primary` (writers only — also accepts +standalone OSS servers), or `replica`. `zone` is an opaque, +case-insensitive locality identifier (e.g. `eu-west-1a`); when set, the +client prefers same-zone endpoints. Both `target` and `zone` are +effective on the query side; for ingestion they are silently accepted +but have no effect — the ingestion path does not route by server role +or zone (role/zone-aware endpoint selection is a query-side feature). 
+ +```go +qdb.LineSenderFromConf(ctx, + "ws::addr=node-a:9000,node-b:9000;target=primary;zone=eu-west-1a;") +``` + +The reconnect budget and backoff that govern how long failover persists +through an outage are the `reconnect_*` and `initial_connect_retry` +knobs documented under [QWP store-and-forward](#qwp-store-and-forward-sf) +— they apply whether or not `sf_dir` is set. + +### Querying with `QwpQueryClient` + +QWP also supports the query side: streaming columnar result batches +from the server back to the client over the same WebSocket protocol. +Use `QwpQueryClient` to run SELECT and DML statements: + +```go +client, err := qdb.NewQwpQueryClient(ctx, + qdb.WithQwpQueryAddress("localhost:9000"), +) +if err != nil { + log.Fatal(err) +} +defer client.Close(ctx) + +// Non-SELECT statements use Exec. +if _, err := client.Exec(ctx, + "CREATE TABLE example (ts TIMESTAMP, v LONG) TIMESTAMP(ts)"); err != nil { + log.Fatal(err) +} + +// SELECT returns a *QwpQuery; range over its Batches iterator. +q := client.Query(ctx, "SELECT ts, v FROM example") +defer q.Close() + +var sum int64 +for batch, err := range q.Batches() { + if err != nil { + log.Fatal(err) + } + vCol := batch.Column(1) // column 1 is `v` (LONG) + for r := 0; r < vCol.RowCount(); r++ { + sum += vCol.Int64(r) + } +} +``` + +For tight column sweeps you can decode a row range into a caller-owned +slice in one shot. On a no-null column this lowers to a single +`memmove`, after which the inner loop is branch-free and vectorizable: + +```go +buf := make([]int64, 0, 1024) +for batch, err := range q.Batches() { + if err != nil { + log.Fatal(err) + } + buf = batch.Column(1).Int64Range(0, batch.RowCount(), buf[:0]) + for _, v := range buf { + sum += v + } +} +``` + +Bind parameters are passed via `qdb.WithQwpQueryBinds` and use `$1`, `$2`, +... placeholders. 
Setters take 0-based indexes and must be called in +ascending order: + +```go +q := client.Query(ctx, + "SELECT ts, v FROM example WHERE v > $1", + qdb.WithQwpQueryBinds(func(b *qdb.QwpBinds) { + b.LongBind(0, 100) + }), +) +``` + +Configuration via a config string is also supported: + +```go +client, err := qdb.QwpQueryClientFromConf(ctx, + "ws::addr=localhost:9000;username=admin;password=secret;") +``` + +`QwpQueryClient` is **not** safe for concurrent `Query` or `Exec` calls — +open one client per query-issuing goroutine. `Cancel` (on `*QwpQuery`) +and `Close` (on the client) are safe to call from any goroutine, +including from within an in-flight iterator. + +A complete runnable example is at +[`examples/qwp/basic-query/main.go`](examples/qwp/basic-query/main.go). + ## N-dimensional arrays QuestDB server version 9.0.0 and newer supports n-dimensional arrays of double precision floating point numbers. @@ -272,6 +547,74 @@ func main() { } ``` +## QWP store-and-forward (SF) + +QuestDB's WebSocket transport (`ws::` / `wss::`, see Java client docs) +supports an opt-in **store-and-forward** mode: outgoing batches are +persisted to mmap'd disk segments before they go on the wire, and the +I/O loop replays from disk on transient disconnects or process +restarts. User code does not see brief outages; an unrecoverable +failure surfaces on the next `At` / `AtNow` / `Flush` call. + +Activate SF by setting `sf_dir` (the parent directory under which the +sender's slot is created) on a `ws::` / `wss::` connection string: + +```go +sender, err := qdb.LineSenderFromConf(ctx, + "ws::addr=localhost:9000;"+ + "sf_dir=/var/lib/questdb-sf;"+ + "sender_id=my-app;"+ + "close_flush_timeout_millis=5000;") +``` + +The slot lives at `<sf_dir>/<sender_id>/`. An advisory exclusive +`flock` on `<sf_dir>/<sender_id>/.lock` prevents two senders from sharing a slot; +the lock releases automatically when the process exits. 
+ +### Connect-string knobs (QWP only) + +| Key | Default | Effect | +|---|---|---| +| `sf_dir` | unset | Group root. Setting it activates SF. | +| `sender_id` | `default` | Per-sender slot name; ASCII letters / digits / `-_.` only. | +| `sf_max_bytes` | 4 MiB | Per-segment file size. | +| `sf_max_total_bytes` | 10 GiB | Total cap; producer is backpressured when reached. | +| `sf_durability` | `memory` | Reserved; `flush` / `append` are deferred follow-ups. | +| `sf_append_deadline_millis` | 30000 | How long `At` / `AtNow` block on backpressure before failing. | +| `reconnect_max_duration_millis` | 300000 | Per-outage cap on reconnect retries. | +| `reconnect_initial_backoff_millis` | 100 | Initial backoff with jitter. | +| `reconnect_max_backoff_millis` | 5000 | Backoff cap. | +| `initial_connect_retry` | `off` | `off`/`false` = terminal on first failure; `on`/`true`/`sync` = same retry loop as reconnect, blocking the constructor; `async` = same retry loop on the I/O goroutine, constructor returns immediately and producers experience backpressure until the wire comes up. | +| `close_flush_timeout_millis` | 5000 | `Close` waits this long for ACKs; `0` / `-1` skips the drain. | +| `drain_orphans` | `off` | When `on`, scan `<sf_dir>/*` and adopt sibling slots that hold unacked data. | +| `max_background_drainers` | 4 | Cap on concurrent orphan drainers. | + +The same options are available programmatically: +`WithSfDir`, `WithSenderId`, `WithSfMaxBytes`, `WithSfMaxTotalBytes`, +`WithReconnectPolicy`, `WithInitialConnectRetry`, +`WithInitialConnectMode`, `WithCloseFlushTimeout`. + +### Failure semantics + +- **Transient disconnect**: caught by the I/O loop, transparent to user code. +- **Auth rejection (HTTP 401/403)** on connect or reconnect: terminal — surfaced on the next user-thread call. +- **Server rejected a frame** (e.g. schema mismatch): terminal; replay would just rebound, so the loop stops and reports the rejection. Bytes stay on disk for inspection. 
+- **Reconnect cap exhausted**: terminal; restart the process to resume from disk. +- **Disk cap full**: `At` / `AtNow` block up to `sf_append_deadline_millis`, then fail with a "wire path is not draining" error. + +### Crash recovery + +On startup with the same `sf_dir` + `sender_id`, the sender opens +existing segment files, validates per-frame CRC32C, recovers any torn +tail at the active segment's last good frame, and resumes sending +where the prior session left off. + +If a previous sender process crashed and left its slot dir behind, +turning on `drain_orphans=on` will scan sibling slots under `sf_dir` +and adopt them on a separate connection: the foreground sender is +unaffected, and a `.failed` sentinel is dropped if a drainer can't +make progress (auth rejection, exhausted reconnect cap, etc.). + ## Community If you need help, have additional questions or want to provide feedback, you diff --git a/bench/qwp-egress-read-wide/go.mod b/bench/qwp-egress-read-wide/go.mod new file mode 100644 index 00000000..5a8da9ff --- /dev/null +++ b/bench/qwp-egress-read-wide/go.mod @@ -0,0 +1,20 @@ +module github.com/questdb/go-questdb-client/v4/bench/qwp-egress-read-wide + +go 1.23 + +require ( + github.com/jackc/pgx/v5 v5.7.1 + github.com/questdb/go-questdb-client/v4 v4.0.0 +) + +require ( + github.com/coder/websocket v1.8.14 // indirect + github.com/jackc/pgpassfile v1.0.0 // indirect + github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect + github.com/klauspost/compress v1.18.4 // indirect + golang.org/x/crypto v0.27.0 // indirect + golang.org/x/sys v0.25.0 // indirect + golang.org/x/text v0.18.0 // indirect +) + +replace github.com/questdb/go-questdb-client/v4 => ../.. 
diff --git a/bench/qwp-egress-read-wide/go.sum b/bench/qwp-egress-read-wide/go.sum new file mode 100644 index 00000000..da3cc2e8 --- /dev/null +++ b/bench/qwp-egress-read-wide/go.sum @@ -0,0 +1,114 @@ +dario.cat/mergo v1.0.0 h1:AGCNq9Evsj31mOgNPcLyXc+4PNABt905YmuqPYYpBWk= +dario.cat/mergo v1.0.0/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= +github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0= +github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= +github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow= +github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM= +github.com/Microsoft/hcsshim v0.11.4 h1:68vKo2VN8DE9AdN4tnkWnmdhqdbpUFM8OF3Airm7fz8= +github.com/Microsoft/hcsshim v0.11.4/go.mod h1:smjE4dvqPX9Zldna+t5FG3rnoHhaB7QYxPRqGcpAD9w= +github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqylYbM= +github.com/cenkalti/backoff/v4 v4.2.1/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/coder/websocket v1.8.14 h1:9L0p0iKiNOibykf283eHkKUHHrpG7f65OE3BhhO7v9g= +github.com/coder/websocket v1.8.14/go.mod h1:NX3SzP+inril6yawo5CQXx8+fk145lPDC6pumgx0mVg= +github.com/containerd/containerd v1.7.12 h1:+KQsnv4VnzyxWcfO9mlxxELaoztsDEjOuCMPAuPqgU0= +github.com/containerd/containerd v1.7.12/go.mod h1:/5OMpE1p0ylxtEUGY8kuCYkDRzJm9NO1TFMWjUpdevk= +github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= +github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= +github.com/cpuguy83/dockercfg v0.3.1 h1:/FpZ+JaygUR/lZP2NlFI2DVfrOEMAIKP5wWEJdoYe9E= +github.com/cpuguy83/dockercfg v0.3.1/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHfjj5/jFyUJc= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 
h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/docker/distribution v2.8.2+incompatible h1:T3de5rq0dB1j30rp0sA2rER+m322EBzniBPB6ZIzuh8= +github.com/docker/distribution v2.8.2+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w= +github.com/docker/docker v24.0.9+incompatible h1:HPGzNmwfLZWdxHqK9/II92pyi1EpYKsAqcl4G0Of9v0= +github.com/docker/docker v24.0.9+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c= +github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= +github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= +github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/go-ole/go-ole v1.3.0 h1:Dt6ye7+vXGIKZ7Xtk4s6/xVdGDQynvom7xCFEdWr6uE= +github.com/go-ole/go-ole v1.3.0/go.mod h1:5LS6F96DhAwUc7C+1HLexzMXY1xGRSryjyPPKW6zv78= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= +github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= +github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg= +github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo= +github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM= 
+github.com/jackc/pgx/v5 v5.7.1 h1:x7SYsPBYDkHDksogeSmZZ5xzThcTgRz++I5E+ePFUcs= +github.com/jackc/pgx/v5 v5.7.1/go.mod h1:e7O26IywZZ+naJtWWos6i6fvWK+29etgITqrqHLfoZA= +github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo= +github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= +github.com/klauspost/compress v1.18.4 h1:RPhnKRAQ4Fh8zU2FY/6ZFDwTVTxgJ/EMydqSTzE9a2c= +github.com/klauspost/compress v1.18.4/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4= +github.com/lufia/plan9stats v0.0.0-20230326075908-cb1d2100619a h1:N9zuLhTvBSRt0gWSiJswwQ2HqDmtX/ZCDJURnKUt1Ik= +github.com/lufia/plan9stats v0.0.0-20230326075908-cb1d2100619a/go.mod h1:JKx41uQRwqlTZabZc+kILPrO/3jlKnQ2Z8b7YiVw5cE= +github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= +github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= +github.com/moby/patternmatcher v0.6.0 h1:GmP9lR19aU5GqSSFko+5pRqHi+Ohk1O69aFiKkVGiPk= +github.com/moby/patternmatcher v0.6.0/go.mod h1:hDPoyOpDY7OrrMDLaYoY3hf52gNCR/YOUYxkhApJIxc= +github.com/moby/sys/sequential v0.5.0 h1:OPvI35Lzn9K04PBbCLW0g4LcFAJgHsvXsRyewg5lXtc= +github.com/moby/sys/sequential v0.5.0/go.mod h1:tH2cOOs5V9MlPiXcQzRC+eEyab644PWKGRYaaV5ZZlo= +github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= +github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= +github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= +github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc= +github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= +github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= +github.com/opencontainers/image-spec v1.1.0-rc5 h1:Ygwkfw9bpDvs+c9E34SdgGOj41dX/cbdlwvlWt0pnFI= +github.com/opencontainers/image-spec v1.1.0-rc5/go.mod 
h1:X4pATf0uXsnn3g5aiGIsVnJBR4mxhKzfwmvK/B2NTm8= +github.com/opencontainers/runc v1.1.5 h1:L44KXEpKmfWDcS02aeGm8QNTFXTo2D+8MYGDIJ/GDEs= +github.com/opencontainers/runc v1.1.5/go.mod h1:1J5XiS+vdZ3wCyZybsuxXZWGrgSr8fFJHLXuG2PsnNg= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/power-devops/perfstat v0.0.0-20221212215047-62379fc7944b h1:0LFwY6Q3gMACTjAbMZBjXAqTOzOwFaj2Ld6cjeQ7Rig= +github.com/power-devops/perfstat v0.0.0-20221212215047-62379fc7944b/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= +github.com/shirou/gopsutil/v3 v3.23.12 h1:z90NtUkp3bMtmICZKpC4+WaknU1eXtp5vtbQ11DgpE4= +github.com/shirou/gopsutil/v3 v3.23.12/go.mod h1:1FrWgea594Jp7qmjHUUPlJDTPgcsb9mGnXDxavtikzM= +github.com/shoenig/go-m1cpu v0.1.6 h1:nxdKQNcEB6vzgA2E2bvzKIYRuNj7XNJ4S/aRSwKzFtM= +github.com/shoenig/go-m1cpu v0.1.6/go.mod h1:1JJMcUBvfNwpq05QDQVAnx3gUHr9IYF7GNg9SUEw2VQ= +github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= +github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/testcontainers/testcontainers-go v0.26.0 h1:uqcYdoOHBy1ca7gKODfBd9uTHVK3a7UL848z09MVZ0c= +github.com/testcontainers/testcontainers-go v0.26.0/go.mod h1:ICriE9bLX5CLxL9OFQ2N+2N+f+803LNJ1utJb1+Inx0= 
+github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU= +github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI= +github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk= +github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY= +github.com/yusufpapurcu/wmi v1.2.3 h1:E1ctvB7uKFMOJw3fdOW32DwGE9I7t++CRUEMKvFoFiw= +github.com/yusufpapurcu/wmi v1.2.3/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= +golang.org/x/crypto v0.27.0 h1:GXm2NjJrPaiv/h1tb2UH8QfgC/hOf/+z0p6PT8o1w7A= +golang.org/x/crypto v0.27.0/go.mod h1:1Xngt8kV6Dvbssa53Ziq6Eqn0HqbZi5Z6R0ZpwQzt70= +golang.org/x/exp v0.0.0-20231005195138-3e424a577f31 h1:9k5exFQKQglLo+RoP+4zMjOFE14P6+vyR0baDAi0Rcs= +golang.org/x/exp v0.0.0-20231005195138-3e424a577f31/go.mod h1:S2oDrQGGwySpoQPVqRShND87VCbxmc6bL1Yd2oYrm6k= +golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34= +golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.18.0 h1:XvMDiNzPAl0jr17s6W9lcaIhGUfUORdGCNsuLmPG224= +golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +google.golang.org/genproto/googleapis/rpc v0.0.0-20231002182017-d307bd883b97 h1:6GQBEOdGkX6MMTLT9V+TjtIRZCw9VPD5Z+yHY9wMgS0= +google.golang.org/genproto/googleapis/rpc v0.0.0-20231002182017-d307bd883b97/go.mod 
h1:v7nGkzlmW8P3n/bKmWBn2WpBjpOEx8Q6gMueudAmKfY= +google.golang.org/grpc v1.58.3 h1:BjnpXut1btbtgN/6sp+brB2Kbm2LjNXnidYujAVbSoQ= +google.golang.org/grpc v1.58.3/go.mod h1:tgX3ZQDlNJGU96V6yHh1T/JeoBQ2TXdr43YbYSsCJk0= +google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= +google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/bench/qwp-egress-read-wide/main.go b/bench/qwp-egress-read-wide/main.go new file mode 100644 index 00000000..87525cf3 --- /dev/null +++ b/bench/qwp-egress-read-wide/main.go @@ -0,0 +1,518 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ******************************************************************************/ + +// Wide variant of the QWP egress benchmark. Compares SELECT throughput +// from a locally running QuestDB instance over three wire protocols on +// a 15-column row: +// +// - QWP egress (WebSocket, binary columnar) +// - PostgreSQL wire (binary transfer) +// - HTTP /exec (JSON) +// +// Schema: designated TIMESTAMP, one LONG, one DOUBLE, six SYMBOLs (one +// low-cardinality with 8 distinct values, five high-cardinality with +// 100k distinct values each), one VARCHAR, and five additional DOUBLEs. +// Mirrors QwpEgressReadBenchmarkWide.java in benchmarks/. +// +// Prerequisites: +// - A QuestDB server listening on 9000 (HTTP/WS) and 8812 (PG wire). +// +// Tune the workload via flags: +// +// -rows N row count to ingest (default 10_000_000) +// -skip-populate re-use the existing table (default false) +package main + +import ( + "context" + "errors" + "flag" + "fmt" + "io" + "log" + "math" + "net/http" + "net/url" + "strconv" + "time" + + "github.com/jackc/pgx/v5" + qdb "github.com/questdb/go-questdb-client/v4" +) + +const ( + host = "localhost" + httpPort = 9000 + pgPort = 8812 + progressEvery = 1_000_000 + tableName = "egress_bench_wide" + // highCard is the distinct value count for each of s1..s5. Sized + // large enough to stress the SYMBOL dict path: 100k unique values + // per column means the connection-scoped delta dict grows for most + // of the batch sequence rather than settling into a cached state. 
+ highCard = 100_000 +) + +var ( + rowCount int64 + skipPopulate bool +) + +type result struct { + elapsed time.Duration + rows int64 + bytes int64 +} + +func main() { + flag.Int64Var(&rowCount, "rows", 10_000_000, "row count") + flag.BoolVar(&skipPopulate, "skip-populate", false, "skip table create + ingest, re-use existing data") + flag.Parse() + + ctx := context.Background() + + if !skipPopulate { + mustOK(recreateTable(ctx)) + mustOK(ingestRows(ctx)) + } else { + fmt.Printf("skip-populate=true, re-using existing %s\n", tableName) + } + + fmt.Println() + fmt.Println("=== Cold warm-up (runs discarded) ===") + if _, err := runQwp(ctx, true); err != nil { + log.Fatalf("QWP warmup: %v", err) + } + if _, err := runPgWire(ctx, true); err != nil { + log.Fatalf("PG warmup: %v", err) + } + if _, err := runHTTPExec(ctx, true); err != nil { + log.Fatalf("HTTP warmup: %v", err) + } + + fmt.Println() + fmt.Println("=== Measurement ===") + qwp, err := runQwp(ctx, false) + if err != nil { + log.Fatalf("QWP: %v", err) + } + pg, err := runPgWire(ctx, false) + if err != nil { + log.Fatalf("PG: %v", err) + } + httpRes, err := runHTTPExec(ctx, false) + if err != nil { + log.Fatalf("HTTP: %v", err) + } + + fmt.Println() + fmt.Println("=== Comparison ===") + fmt.Printf("%-20s %12s %12s %12s\n", "Protocol", "time(ms)", "rows/sec", "MiB/sec") + fmt.Printf("%-20s %12s %12s %12s\n", "--------", "--------", "--------", "-------") + printRow("QWP egress (WS)", qwp) + printRow("PostgreSQL wire", pg) + printRow("HTTP /exec JSON", httpRes) +} + +func mustOK(err error) { + if err != nil { + log.Fatal(err) + } +} + +func printRow(label string, r result) { + secs := r.elapsed.Seconds() + rowsPerSec := float64(r.rows) / secs + mibPerSec := float64(r.bytes) / secs / (1024.0 * 1024.0) + fmt.Printf("%-20s %12d %12.0f %12.2f\n", + label, r.elapsed.Milliseconds(), rowsPerSec, mibPerSec) +} + +// ------------------------------------------------------------------ +// Workload +// 
------------------------------------------------------------------ + +func pgConnString() string { + return fmt.Sprintf("postgres://admin:quest@%s:%d/qdb?sslmode=disable", host, pgPort) +} + +// selectColumns is the comma-separated SELECT list shared by every +// runner. Kept in one place so adding/removing a column needs a single +// edit, and the QWP column-index mapping in runQwp stays trivially +// auditable against this list. +const selectColumns = "ts, id, price, sym, note," + + " d1, d2, d3, d4, d5," + + " s1, s2, s3, s4, s5" + +func recreateTable(ctx context.Context) error { + c, err := qdb.NewQwpQueryClient(ctx, qdb.WithQwpQueryAddress(fmt.Sprintf("%s:%d", host, httpPort))) + if err != nil { + return fmt.Errorf("recreateTable: connect: %w", err) + } + defer c.Close(ctx) + + if _, err := c.Exec(ctx, "DROP TABLE IF EXISTS '"+tableName+"'"); err != nil { + return fmt.Errorf("recreateTable: drop: %w", err) + } + // Wide schema: low-cardinality sym + five high-cardinality SYMBOLs + // (capacity 200000 to fit the 100k distinct values per column + // with comfortable slack) + five extra DOUBLEs. Representative of + // a realistic analytics row with mixed numerics and several + // categorical dimensions of differing cardinality. 
+ createSQL := "CREATE TABLE '" + tableName + "' (" + + "ts TIMESTAMP, id LONG, price DOUBLE, sym SYMBOL, note VARCHAR," + + " d1 DOUBLE, d2 DOUBLE, d3 DOUBLE, d4 DOUBLE, d5 DOUBLE," + + " s1 SYMBOL capacity 200000, s2 SYMBOL capacity 200000," + + " s3 SYMBOL capacity 200000, s4 SYMBOL capacity 200000," + + " s5 SYMBOL capacity 200000" + + ") TIMESTAMP(ts) PARTITION BY HOUR WAL" + if _, err := c.Exec(ctx, createSQL); err != nil { + return fmt.Errorf("recreateTable: create: %w", err) + } + return nil +} + +func ingestRows(ctx context.Context) error { + fmt.Printf("Ingesting %d rows over QWP/WebSocket...\n", rowCount) + start := time.Now() + symbols := []string{"AAPL", "MSFT", "GOOG", "AMZN", "META", "TSLA", "NVDA", "NFLX"} + // Pre-generate the 100k unique values per high-cardinality column + // so the ingest loop reuses interned strings instead of allocating + // fresh ones per row. Rotating s1..s5 through different offsets + // makes any cross-column correlation coincidental. + s1Pool := buildSymbolPool("s1_") + s2Pool := buildSymbolPool("s2_") + s3Pool := buildSymbolPool("s3_") + s4Pool := buildSymbolPool("s4_") + s5Pool := buildSymbolPool("s5_") + + // auto_flush_rows sized so each ILP frame stays under the server's + // 2 MiB WebSocket buffer given the 15-column row layout (~130 + // bytes/row encoded). + conf := fmt.Sprintf("ws::addr=%s:%d;auto_flush_rows=10000;", host, httpPort) + sender, err := qdb.LineSenderFromConf(ctx, conf) + if err != nil { + return fmt.Errorf("ingest: open sender: %w", err) + } + defer sender.Close(ctx) + + for i := int64(1); i <= rowCount; i++ { + h1 := i % highCard + h2 := (i + 20_000) % highCard + h3 := (i + 40_000) % highCard + h4 := (i + 60_000) % highCard + h5 := (i + 80_000) % highCard + // ILP requires all Symbol calls before any non-symbol column setters. + if err := sender.Table(tableName). + Symbol("sym", symbols[i%int64(len(symbols))]). + Symbol("s1", s1Pool[h1]). + Symbol("s2", s2Pool[h2]). + Symbol("s3", s3Pool[h3]). 
+ Symbol("s4", s4Pool[h4]). + Symbol("s5", s5Pool[h5]). + Int64Column("id", i). + Float64Column("price", float64(i)*1.5). + Float64Column("d1", float64(i)*0.25). + Float64Column("d2", float64(i)*0.5). + Float64Column("d3", float64(i)*0.75). + Float64Column("d4", float64(i)*1.25). + Float64Column("d5", float64(i)*1.75). + StringColumn("note", "n"+strconv.FormatInt(i&0xFFF, 10)). + At(ctx, time.UnixMicro(i*10_000)); err != nil { + return fmt.Errorf("ingest: At row %d: %w", i, err) + } + if i%progressEvery == 0 { + fmt.Printf(" %d / %d rows (%d ms)\n", i, rowCount, time.Since(start).Milliseconds()) + } + } + if err := sender.Flush(ctx); err != nil { + return fmt.Errorf("ingest: flush: %w", err) + } + + fmt.Println("Waiting for WAL apply to complete...") + return waitForWalApply(ctx) +} + +func buildSymbolPool(prefix string) []string { + pool := make([]string, highCard) + for i := 0; i < highCard; i++ { + pool[i] = prefix + strconv.Itoa(i) + } + return pool +} + +func waitForWalApply(ctx context.Context) error { + c, err := qdb.NewQwpQueryClient(ctx, qdb.WithQwpQueryAddress(fmt.Sprintf("%s:%d", host, httpPort))) + if err != nil { + return fmt.Errorf("wait: connect: %w", err) + } + defer c.Close(ctx) + + deadline := time.Now().Add(5 * time.Minute) + for time.Now().Before(deadline) { + count, err := selectCount(ctx, c) + if err != nil { + return fmt.Errorf("wait: count: %w", err) + } + if count == rowCount { + fmt.Printf(" applied %d rows\n", count) + return nil + } + time.Sleep(500 * time.Millisecond) + } + return errors.New("timed out waiting for WAL apply") +} + +func selectCount(ctx context.Context, c *qdb.QwpQueryClient) (int64, error) { + q := c.Query(ctx, "SELECT count() FROM "+tableName) + defer q.Close() + var count int64 + for batch, err := range q.Batches() { + if err != nil { + return 0, err + } + if batch.RowCount() > 0 { + count = batch.Column(0).Int64(0) + } + } + return count, nil +} + +// ------------------------------------------------------------------ 
+// QWP egress +// ------------------------------------------------------------------ + +func runQwp(ctx context.Context, warmup bool) (result, error) { + var rowsSeen, bytesSeen, checksum int64 + start := time.Now() + + c, err := qdb.NewQwpQueryClient(ctx, + qdb.WithQwpQueryAddress(fmt.Sprintf("%s:%d", host, httpPort)), + qdb.WithQwpQueryClientID("qwp-egress-bench-wide/1.0"), + qdb.WithQwpQueryCompression("raw"), + ) + if err != nil { + return result{}, err + } + defer c.Close(ctx) + + q := c.Query(ctx, "SELECT "+selectColumns+" FROM "+tableName) + defer q.Close() + for batch, err := range q.Batches() { + if err != nil { + return result{}, err + } + n := batch.RowCount() + // Cache the per-column handles once per batch so each cell + // access skips re-deriving the layout pointer — same idiom as + // the Java path that grabs valuesAddr / nonNullIndex up front. + // Read 8-byte fixed-width columns (ts, id, price, d1..d5) as + // raw int64 bits and XOR them straight in; matches Java's + // Unsafe.getLong on the DOUBLE column bases. 
+ tsCol := batch.Column(0) + idCol := batch.Column(1) + priceCol := batch.Column(2) + symCol := batch.Column(3) + noteCol := batch.Column(4) + d1Col := batch.Column(5) + d2Col := batch.Column(6) + d3Col := batch.Column(7) + d4Col := batch.Column(8) + d5Col := batch.Column(9) + s1Col := batch.Column(10) + s2Col := batch.Column(11) + s3Col := batch.Column(12) + s4Col := batch.Column(13) + s5Col := batch.Column(14) + for r := 0; r < n; r++ { + ts := tsCol.Int64(r) + id := idCol.Int64(r) + priceBits := priceCol.Int64(r) + d1 := d1Col.Int64(r) + d2 := d2Col.Int64(r) + d3 := d3Col.Int64(r) + d4 := d4Col.Int64(r) + d5 := d5Col.Int64(r) + sym := symCol.Str(r) + note := noteCol.Str(r) + s1 := s1Col.Str(r) + s2 := s2Col.Str(r) + s3 := s3Col.Str(r) + s4 := s4Col.Str(r) + s5 := s5Col.Str(r) + checksum ^= ts ^ id ^ priceBits ^ d1 ^ d2 ^ d3 ^ d4 ^ d5 ^ + int64(len(sym)) ^ int64(len(note)) ^ + int64(len(s1)) ^ int64(len(s2)) ^ int64(len(s3)) ^ + int64(len(s4)) ^ int64(len(s5)) + } + rowsSeen += int64(n) + // Sum the actual QWP message bytes delivered in this frame + // plus a 10-byte WebSocket-header approximation, matching the + // calculation in the Java benchmark, so the bytes/sec column + // is comparable. 
+ bytesSeen += int64(len(batch.Payload())) + 10 + } + elapsed := time.Since(start) + logRun("QWP", warmup, elapsed, rowsSeen, fmt.Sprintf("0x%x", uint64(checksum))) + return result{elapsed: elapsed, rows: rowsSeen, bytes: bytesSeen}, nil +} + +// ------------------------------------------------------------------ +// PostgreSQL wire +// ------------------------------------------------------------------ + +func runPgWire(ctx context.Context, warmup bool) (result, error) { + var rows, checksum, bytes int64 + start := time.Now() + + cfg, err := pgx.ParseConfig(pgConnString()) + if err != nil { + return result{}, err + } + conn, err := pgx.ConnectConfig(ctx, cfg) + if err != nil { + return result{}, err + } + defer conn.Close(ctx) + + qrows, err := conn.Query(ctx, "SELECT "+selectColumns+" FROM "+tableName) + if err != nil { + return result{}, err + } + defer qrows.Close() + + for qrows.Next() { + var ts time.Time + var id int64 + var price, d1, d2, d3, d4, d5 float64 + var sym, note, s1, s2, s3, s4, s5 string + if err := qrows.Scan( + &ts, &id, &price, &sym, &note, + &d1, &d2, &d3, &d4, &d5, + &s1, &s2, &s3, &s4, &s5, + ); err != nil { + return result{}, err + } + // Normalise to epoch microseconds so the checksum matches the + // QWP path. Java's getTimestamp().getTime()*1000 truncates to + // ms*1000; QuestDB's micros are 10ms-aligned in this dataset + // so both forms agree. + tsMicros := ts.UnixMicro() + checksum ^= tsMicros ^ id ^ + int64(math.Float64bits(price)) ^ + int64(math.Float64bits(d1)) ^ int64(math.Float64bits(d2)) ^ + int64(math.Float64bits(d3)) ^ int64(math.Float64bits(d4)) ^ + int64(math.Float64bits(d5)) ^ + int64(len(sym)) ^ int64(len(note)) ^ + int64(len(s1)) ^ int64(len(s2)) ^ int64(len(s3)) ^ + int64(len(s4)) ^ int64(len(s5)) + // PG DataRow wire size per row in binary mode: 1 byte 'D' msg + // tag, 4 bytes msg length, 2 bytes col count, then a 4-byte + // length prefix + value for each of the 15 columns. 
8 fixed- + // width 8-byte cols (ts, id, price, d1..d5), 7 variable-length + // cols (sym, note, s1..s5). + bytes += 7 + 15*4 + 8*8 + + int64(len(sym)) + int64(len(note)) + + int64(len(s1)) + int64(len(s2)) + int64(len(s3)) + + int64(len(s4)) + int64(len(s5)) + rows++ + } + if err := qrows.Err(); err != nil { + return result{}, err + } + elapsed := time.Since(start) + logRun("PG", warmup, elapsed, rows, fmt.Sprintf("0x%x", uint64(checksum))) + return result{elapsed: elapsed, rows: rows, bytes: bytes}, nil +} + +// ------------------------------------------------------------------ +// HTTP /exec JSON +// ------------------------------------------------------------------ + +func runHTTPExec(ctx context.Context, warmup bool) (result, error) { + var bytes int64 + start := time.Now() + + sql := "SELECT " + selectColumns + " FROM " + tableName + u := fmt.Sprintf("http://%s:%d/exec?query=%s&count=true", + host, httpPort, url.QueryEscape(sql)) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) + if err != nil { + return result{}, err + } + req.Header.Set("Accept-Encoding", "identity") + resp, err := http.DefaultClient.Do(req) + if err != nil { + return result{}, err + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return result{}, fmt.Errorf("HTTP /exec: status %s", resp.Status) + } + + // JSON response is one line with {"columns":[...],"dataset":[[...],...]}. + // Scan for '[' to count rows — same approximation as the Java path. + var brackets int64 + buf := make([]byte, 16*1024) + for { + n, err := resp.Body.Read(buf) + if n > 0 { + bytes += int64(n) + for i := 0; i < n; i++ { + if buf[i] == '[' { + brackets++ + } + } + } + if err == io.EOF { + break + } + if err != nil { + return result{}, err + } + } + // Brackets counter incremented for every '[' including the outer + // "columns" wrapper and the "dataset" wrapper; subtract those two. 
+ var rows int64 + if brackets > 1 { + rows = brackets - 2 + } + elapsed := time.Since(start) + logRun("HTTP", warmup, elapsed, rows, strconv.FormatInt(bytes, 10)) + return result{elapsed: elapsed, rows: rows, bytes: bytes}, nil +} + +// ------------------------------------------------------------------ +// Helpers +// ------------------------------------------------------------------ + +func logRun(label string, warmup bool, elapsed time.Duration, rows int64, suffix string) { + phase := "[measure]" + if warmup { + phase = "[warmup]" + } + fmt.Printf("%s %s : %d rows in %d ms (checksum/bytes=%s)\n", + phase, label, rows, elapsed.Milliseconds(), suffix) +} + diff --git a/bench/qwp-egress-read/go.mod b/bench/qwp-egress-read/go.mod new file mode 100644 index 00000000..be13c89d --- /dev/null +++ b/bench/qwp-egress-read/go.mod @@ -0,0 +1,20 @@ +module github.com/questdb/go-questdb-client/v4/bench/qwp-egress-read + +go 1.23 + +require ( + github.com/jackc/pgx/v5 v5.7.1 + github.com/questdb/go-questdb-client/v4 v4.0.0 +) + +require ( + github.com/coder/websocket v1.8.14 // indirect + github.com/jackc/pgpassfile v1.0.0 // indirect + github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect + github.com/klauspost/compress v1.18.4 // indirect + golang.org/x/crypto v0.27.0 // indirect + golang.org/x/sys v0.25.0 // indirect + golang.org/x/text v0.18.0 // indirect +) + +replace github.com/questdb/go-questdb-client/v4 => ../.. 
diff --git a/bench/qwp-egress-read/go.sum b/bench/qwp-egress-read/go.sum new file mode 100644 index 00000000..da3cc2e8 --- /dev/null +++ b/bench/qwp-egress-read/go.sum @@ -0,0 +1,114 @@ +dario.cat/mergo v1.0.0 h1:AGCNq9Evsj31mOgNPcLyXc+4PNABt905YmuqPYYpBWk= +dario.cat/mergo v1.0.0/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= +github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0= +github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= +github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow= +github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM= +github.com/Microsoft/hcsshim v0.11.4 h1:68vKo2VN8DE9AdN4tnkWnmdhqdbpUFM8OF3Airm7fz8= +github.com/Microsoft/hcsshim v0.11.4/go.mod h1:smjE4dvqPX9Zldna+t5FG3rnoHhaB7QYxPRqGcpAD9w= +github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqylYbM= +github.com/cenkalti/backoff/v4 v4.2.1/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/coder/websocket v1.8.14 h1:9L0p0iKiNOibykf283eHkKUHHrpG7f65OE3BhhO7v9g= +github.com/coder/websocket v1.8.14/go.mod h1:NX3SzP+inril6yawo5CQXx8+fk145lPDC6pumgx0mVg= +github.com/containerd/containerd v1.7.12 h1:+KQsnv4VnzyxWcfO9mlxxELaoztsDEjOuCMPAuPqgU0= +github.com/containerd/containerd v1.7.12/go.mod h1:/5OMpE1p0ylxtEUGY8kuCYkDRzJm9NO1TFMWjUpdevk= +github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= +github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= +github.com/cpuguy83/dockercfg v0.3.1 h1:/FpZ+JaygUR/lZP2NlFI2DVfrOEMAIKP5wWEJdoYe9E= +github.com/cpuguy83/dockercfg v0.3.1/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHfjj5/jFyUJc= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/docker/distribution v2.8.2+incompatible h1:T3de5rq0dB1j30rp0sA2rER+m322EBzniBPB6ZIzuh8= +github.com/docker/distribution v2.8.2+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w= +github.com/docker/docker v24.0.9+incompatible h1:HPGzNmwfLZWdxHqK9/II92pyi1EpYKsAqcl4G0Of9v0= +github.com/docker/docker v24.0.9+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c= +github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= +github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= +github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/go-ole/go-ole v1.3.0 h1:Dt6ye7+vXGIKZ7Xtk4s6/xVdGDQynvom7xCFEdWr6uE= +github.com/go-ole/go-ole v1.3.0/go.mod h1:5LS6F96DhAwUc7C+1HLexzMXY1xGRSryjyPPKW6zv78= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= +github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= +github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg= +github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo= +github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM= +github.com/jackc/pgx/v5 v5.7.1 
h1:x7SYsPBYDkHDksogeSmZZ5xzThcTgRz++I5E+ePFUcs= +github.com/jackc/pgx/v5 v5.7.1/go.mod h1:e7O26IywZZ+naJtWWos6i6fvWK+29etgITqrqHLfoZA= +github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo= +github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= +github.com/klauspost/compress v1.18.4 h1:RPhnKRAQ4Fh8zU2FY/6ZFDwTVTxgJ/EMydqSTzE9a2c= +github.com/klauspost/compress v1.18.4/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4= +github.com/lufia/plan9stats v0.0.0-20230326075908-cb1d2100619a h1:N9zuLhTvBSRt0gWSiJswwQ2HqDmtX/ZCDJURnKUt1Ik= +github.com/lufia/plan9stats v0.0.0-20230326075908-cb1d2100619a/go.mod h1:JKx41uQRwqlTZabZc+kILPrO/3jlKnQ2Z8b7YiVw5cE= +github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= +github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= +github.com/moby/patternmatcher v0.6.0 h1:GmP9lR19aU5GqSSFko+5pRqHi+Ohk1O69aFiKkVGiPk= +github.com/moby/patternmatcher v0.6.0/go.mod h1:hDPoyOpDY7OrrMDLaYoY3hf52gNCR/YOUYxkhApJIxc= +github.com/moby/sys/sequential v0.5.0 h1:OPvI35Lzn9K04PBbCLW0g4LcFAJgHsvXsRyewg5lXtc= +github.com/moby/sys/sequential v0.5.0/go.mod h1:tH2cOOs5V9MlPiXcQzRC+eEyab644PWKGRYaaV5ZZlo= +github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= +github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= +github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= +github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc= +github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= +github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= +github.com/opencontainers/image-spec v1.1.0-rc5 h1:Ygwkfw9bpDvs+c9E34SdgGOj41dX/cbdlwvlWt0pnFI= +github.com/opencontainers/image-spec v1.1.0-rc5/go.mod h1:X4pATf0uXsnn3g5aiGIsVnJBR4mxhKzfwmvK/B2NTm8= 
+github.com/opencontainers/runc v1.1.5 h1:L44KXEpKmfWDcS02aeGm8QNTFXTo2D+8MYGDIJ/GDEs= +github.com/opencontainers/runc v1.1.5/go.mod h1:1J5XiS+vdZ3wCyZybsuxXZWGrgSr8fFJHLXuG2PsnNg= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/power-devops/perfstat v0.0.0-20221212215047-62379fc7944b h1:0LFwY6Q3gMACTjAbMZBjXAqTOzOwFaj2Ld6cjeQ7Rig= +github.com/power-devops/perfstat v0.0.0-20221212215047-62379fc7944b/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= +github.com/shirou/gopsutil/v3 v3.23.12 h1:z90NtUkp3bMtmICZKpC4+WaknU1eXtp5vtbQ11DgpE4= +github.com/shirou/gopsutil/v3 v3.23.12/go.mod h1:1FrWgea594Jp7qmjHUUPlJDTPgcsb9mGnXDxavtikzM= +github.com/shoenig/go-m1cpu v0.1.6 h1:nxdKQNcEB6vzgA2E2bvzKIYRuNj7XNJ4S/aRSwKzFtM= +github.com/shoenig/go-m1cpu v0.1.6/go.mod h1:1JJMcUBvfNwpq05QDQVAnx3gUHr9IYF7GNg9SUEw2VQ= +github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= +github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/testcontainers/testcontainers-go v0.26.0 h1:uqcYdoOHBy1ca7gKODfBd9uTHVK3a7UL848z09MVZ0c= +github.com/testcontainers/testcontainers-go v0.26.0/go.mod h1:ICriE9bLX5CLxL9OFQ2N+2N+f+803LNJ1utJb1+Inx0= +github.com/tklauser/go-sysconf v0.3.12 
h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU= +github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI= +github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk= +github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY= +github.com/yusufpapurcu/wmi v1.2.3 h1:E1ctvB7uKFMOJw3fdOW32DwGE9I7t++CRUEMKvFoFiw= +github.com/yusufpapurcu/wmi v1.2.3/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= +golang.org/x/crypto v0.27.0 h1:GXm2NjJrPaiv/h1tb2UH8QfgC/hOf/+z0p6PT8o1w7A= +golang.org/x/crypto v0.27.0/go.mod h1:1Xngt8kV6Dvbssa53Ziq6Eqn0HqbZi5Z6R0ZpwQzt70= +golang.org/x/exp v0.0.0-20231005195138-3e424a577f31 h1:9k5exFQKQglLo+RoP+4zMjOFE14P6+vyR0baDAi0Rcs= +golang.org/x/exp v0.0.0-20231005195138-3e424a577f31/go.mod h1:S2oDrQGGwySpoQPVqRShND87VCbxmc6bL1Yd2oYrm6k= +golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34= +golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.18.0 h1:XvMDiNzPAl0jr17s6W9lcaIhGUfUORdGCNsuLmPG224= +golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +google.golang.org/genproto/googleapis/rpc v0.0.0-20231002182017-d307bd883b97 h1:6GQBEOdGkX6MMTLT9V+TjtIRZCw9VPD5Z+yHY9wMgS0= +google.golang.org/genproto/googleapis/rpc v0.0.0-20231002182017-d307bd883b97/go.mod h1:v7nGkzlmW8P3n/bKmWBn2WpBjpOEx8Q6gMueudAmKfY= +google.golang.org/grpc v1.58.3 
h1:BjnpXut1btbtgN/6sp+brB2Kbm2LjNXnidYujAVbSoQ= +google.golang.org/grpc v1.58.3/go.mod h1:tgX3ZQDlNJGU96V6yHh1T/JeoBQ2TXdr43YbYSsCJk0= +google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= +google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/bench/qwp-egress-read/main.go b/bench/qwp-egress-read/main.go new file mode 100644 index 00000000..ba9c2eaa --- /dev/null +++ b/bench/qwp-egress-read/main.go @@ -0,0 +1,427 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ******************************************************************************/ + +// Application-style benchmark that measures SELECT throughput from a +// locally running QuestDB instance over three wire protocols and prints +// a comparison: +// +// - QWP egress (WebSocket, binary columnar) +// - PostgreSQL wire (binary transfer) +// - HTTP /exec (JSON) +// +// Narrow variant: five columns (designated timestamp, one LONG, one +// DOUBLE, one low-cardinality SYMBOL, one VARCHAR). Mirrors the Java +// QwpEgressReadBenchmark.java in benchmarks/. +// +// Prerequisites: +// - A QuestDB server listening on 9000 (HTTP/WS) and 8812 (PG wire). +// +// Tune the workload via flags: +// -rows N row count to ingest (default 10_000_000) +// -skip-populate re-use the existing table (default false) +package main + +import ( + "context" + "errors" + "flag" + "fmt" + "io" + "log" + "math" + "net/http" + "net/url" + "strconv" + "time" + + "github.com/jackc/pgx/v5" + qdb "github.com/questdb/go-questdb-client/v4" +) + +const ( + host = "localhost" + httpPort = 9000 + pgPort = 8812 + progressEvery = 1_000_000 + tableName = "egress_bench" +) + +var ( + rowCount int64 + skipPopulate bool +) + +type result struct { + elapsed time.Duration + rows int64 + bytes int64 +} + +func main() { + flag.Int64Var(&rowCount, "rows", 10_000_000, "row count") + flag.BoolVar(&skipPopulate, "skip-populate", false, "skip table create + ingest, re-use existing data") + flag.Parse() + + ctx := context.Background() + + if !skipPopulate { + mustOK(recreateTable(ctx)) + mustOK(ingestRows(ctx)) + } else { + fmt.Printf("skip-populate=true, re-using existing %s\n", tableName) + } + + fmt.Println() + fmt.Println("=== Cold warm-up (runs discarded) ===") + if _, err := runQwp(ctx, true); err != nil { + log.Fatalf("QWP warmup: %v", err) + } + // Java has these commented out — the JVM JIT warmup for QWP is + // the only thing that matters in the original. 
The Go runtime + // has no JIT to warm, but warming the server-side buffer cache + // and TCP windows is still useful, so leave the calls available + // for callers who want symmetrical warmups. + // + // if _, err := runPgWire(ctx, true); err != nil { + // log.Fatalf("PG warmup: %v", err) + // } + // if _, err := runHTTPExec(ctx, true); err != nil { + // log.Fatalf("HTTP warmup: %v", err) + // } + + fmt.Println() + fmt.Println("=== Measurement ===") + qwp, err := runQwp(ctx, false) + if err != nil { + log.Fatalf("QWP: %v", err) + } + pg, err := runPgWire(ctx, false) + if err != nil { + log.Fatalf("PG: %v", err) + } + httpRes, err := runHTTPExec(ctx, false) + if err != nil { + log.Fatalf("HTTP: %v", err) + } + + fmt.Println() + fmt.Println("=== Comparison ===") + fmt.Printf("%-20s %12s %12s %12s\n", "Protocol", "time(ms)", "rows/sec", "MiB/sec") + fmt.Printf("%-20s %12s %12s %12s\n", "--------", "--------", "--------", "-------") + printRow("QWP egress (WS)", qwp) + printRow("PostgreSQL wire", pg) + printRow("HTTP /exec JSON", httpRes) +} + +func mustOK(err error) { + if err != nil { + log.Fatal(err) + } +} + +func printRow(label string, r result) { + secs := r.elapsed.Seconds() + rowsPerSec := float64(r.rows) / secs + mibPerSec := float64(r.bytes) / secs / (1024.0 * 1024.0) + fmt.Printf("%-20s %12d %12.0f %12.2f\n", + label, r.elapsed.Milliseconds(), rowsPerSec, mibPerSec) +} + +// ------------------------------------------------------------------ +// Workload +// ------------------------------------------------------------------ + +func pgConnString() string { + return fmt.Sprintf("postgres://admin:quest@%s:%d/qdb?sslmode=disable", host, pgPort) +} + +func recreateTable(ctx context.Context) error { + // DDL goes through the QWP query channel (Exec) so the bench does + // not need a working PG connection just to set up the table — the + // PG run later will fail loudly if the wire is unreachable, but + // schema management does not have to. 
+ c, err := qdb.NewQwpQueryClient(ctx, qdb.WithQwpQueryAddress(fmt.Sprintf("%s:%d", host, httpPort))) + if err != nil { + return fmt.Errorf("recreateTable: connect: %w", err) + } + defer c.Close(ctx) + + if _, err := c.Exec(ctx, "DROP TABLE IF EXISTS '"+tableName+"'"); err != nil { + return fmt.Errorf("recreateTable: drop: %w", err) + } + createSQL := "CREATE TABLE '" + tableName + "' (" + + "ts TIMESTAMP, id LONG, price DOUBLE, sym SYMBOL, note VARCHAR" + + ") TIMESTAMP(ts) PARTITION BY HOUR WAL" + if _, err := c.Exec(ctx, createSQL); err != nil { + return fmt.Errorf("recreateTable: create: %w", err) + } + return nil +} + +func ingestRows(ctx context.Context) error { + fmt.Printf("Ingesting %d rows over QWP/WebSocket...\n", rowCount) + start := time.Now() + symbols := []string{"AAPL", "MSFT", "GOOG", "AMZN", "META", "TSLA", "NVDA", "NFLX"} + + conf := fmt.Sprintf("ws::addr=%s:%d;auto_flush_rows=50000;", host, httpPort) + sender, err := qdb.LineSenderFromConf(ctx, conf) + if err != nil { + return fmt.Errorf("ingest: open sender: %w", err) + } + defer sender.Close(ctx) + + for i := int64(1); i <= rowCount; i++ { + // ILP requires all Symbol calls before any non-symbol column setters. + if err := sender.Table(tableName). + Symbol("sym", symbols[i%int64(len(symbols))]). + Int64Column("id", i). + Float64Column("price", float64(i)*1.5). + StringColumn("note", "n"+strconv.FormatInt(i&0xFFF, 10)). 
+ At(ctx, time.UnixMicro(i*10_000)); err != nil { + return fmt.Errorf("ingest: At row %d: %w", i, err) + } + if i%progressEvery == 0 { + fmt.Printf(" %d / %d rows (%d ms)\n", i, rowCount, time.Since(start).Milliseconds()) + } + } + if err := sender.Flush(ctx); err != nil { + return fmt.Errorf("ingest: flush: %w", err) + } + + fmt.Println("Waiting for WAL apply to complete...") + return waitForWalApply(ctx) +} + +func waitForWalApply(ctx context.Context) error { + c, err := qdb.NewQwpQueryClient(ctx, qdb.WithQwpQueryAddress(fmt.Sprintf("%s:%d", host, httpPort))) + if err != nil { + return fmt.Errorf("wait: connect: %w", err) + } + defer c.Close(ctx) + + deadline := time.Now().Add(5 * time.Minute) + for time.Now().Before(deadline) { + count, err := selectCount(ctx, c) + if err != nil { + return fmt.Errorf("wait: count: %w", err) + } + if count == rowCount { + fmt.Printf(" applied %d rows\n", count) + return nil + } + time.Sleep(500 * time.Millisecond) + } + return errors.New("timed out waiting for WAL apply") +} + +func selectCount(ctx context.Context, c *qdb.QwpQueryClient) (int64, error) { + q := c.Query(ctx, "SELECT count() FROM "+tableName) + defer q.Close() + var count int64 + for batch, err := range q.Batches() { + if err != nil { + return 0, err + } + if batch.RowCount() > 0 { + count = batch.Column(0).Int64(0) + } + } + return count, nil +} + +// ------------------------------------------------------------------ +// QWP egress +// ------------------------------------------------------------------ + +func runQwp(ctx context.Context, warmup bool) (result, error) { + var rowsSeen, bytesSeen, checksum int64 + start := time.Now() + + c, err := qdb.NewQwpQueryClient(ctx, + qdb.WithQwpQueryAddress(fmt.Sprintf("%s:%d", host, httpPort)), + qdb.WithQwpQueryClientID("qwp-egress-bench/1.0"), + qdb.WithQwpQueryCompression("raw"), + ) + if err != nil { + return result{}, err + } + defer c.Close(ctx) + + q := c.Query(ctx, "SELECT ts, id, price, sym, note FROM "+tableName) + 
defer q.Close() + for batch, err := range q.Batches() { + if err != nil { + return result{}, err + } + n := batch.RowCount() + // Cache the per-column handles once per batch so each cell + // access skips re-deriving the layout pointer — same idiom as + // the Java path that grabs valuesAddr / nonNullIndex up front. + tsCol := batch.Column(0) + idCol := batch.Column(1) + priceCol := batch.Column(2) + symCol := batch.Column(3) + noteCol := batch.Column(4) + for r := 0; r < n; r++ { + ts := tsCol.Int64(r) + id := idCol.Int64(r) + priceBits := int64(math.Float64bits(priceCol.Float64(r))) + sym := symCol.Str(r) + note := noteCol.Str(r) + checksum ^= ts ^ id ^ priceBits ^ int64(len(sym)) ^ int64(len(note)) + } + rowsSeen += int64(n) + // Sum the actual QWP message bytes delivered in this frame + // plus a 10-byte WebSocket-header approximation, matching the + // calculation in Java benchmark, so the bytes/sec column is comparable. + bytesSeen += int64(len(batch.Payload())) + 10 + } + elapsed := time.Since(start) + logRun("QWP", warmup, elapsed, rowsSeen, fmt.Sprintf("0x%x", uint64(checksum))) + return result{elapsed: elapsed, rows: rowsSeen, bytes: bytesSeen}, nil +} + +// ------------------------------------------------------------------ +// PostgreSQL wire +// ------------------------------------------------------------------ + +func runPgWire(ctx context.Context, warmup bool) (result, error) { + var rows, checksum, bytes int64 + start := time.Now() + + cfg, err := pgx.ParseConfig(pgConnString()) + if err != nil { + return result{}, err + } + conn, err := pgx.ConnectConfig(ctx, cfg) + if err != nil { + return result{}, err + } + defer conn.Close(ctx) + + qrows, err := conn.Query(ctx, "SELECT ts, id, price, sym, note FROM "+tableName) + if err != nil { + return result{}, err + } + defer qrows.Close() + + for qrows.Next() { + var ts time.Time + var id int64 + var price float64 + var sym, note string + if err := qrows.Scan(&ts, &id, &price, &sym, ¬e); err != nil { + 
return result{}, err + } + // Normalise to epoch microseconds so the checksum matches the + // QWP path. Java's getTimestamp().getTime()*1000 truncates to + // ms*1000; QuestDB's micros are 10ms-aligned in this dataset + // so both forms agree. + tsMicros := ts.UnixMicro() + priceBits := int64(math.Float64bits(price)) + checksum ^= tsMicros ^ id ^ priceBits ^ int64(len(sym)) ^ int64(len(note)) + // PG DataRow wire size per row in binary mode: 1 byte 'D' msg + // tag, 4 bytes msg length, 2 bytes col count, then a 4-byte + // length prefix + value for each of the 5 columns. ts/id/price + // are 8 bytes each. + bytes += 7 + 5*4 + 8*3 + int64(len(sym)) + int64(len(note)) + rows++ + } + if err := qrows.Err(); err != nil { + return result{}, err + } + elapsed := time.Since(start) + logRun("PG", warmup, elapsed, rows, fmt.Sprintf("0x%x", uint64(checksum))) + return result{elapsed: elapsed, rows: rows, bytes: bytes}, nil +} + +// ------------------------------------------------------------------ +// HTTP /exec JSON +// ------------------------------------------------------------------ + +func runHTTPExec(ctx context.Context, warmup bool) (result, error) { + var bytes int64 + start := time.Now() + + sql := "SELECT ts,id,price,sym,note FROM " + tableName + u := fmt.Sprintf("http://%s:%d/exec?query=%s&count=true", + host, httpPort, url.QueryEscape(sql)) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) + if err != nil { + return result{}, err + } + req.Header.Set("Accept-Encoding", "identity") + resp, err := http.DefaultClient.Do(req) + if err != nil { + return result{}, err + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return result{}, fmt.Errorf("HTTP /exec: status %s", resp.Status) + } + + // JSON response is one line with {"columns":[...],"dataset":[[...],...]}. + // Scan for '[' to count rows — same approximation as the Java path. 
+ var brackets int64 + buf := make([]byte, 16*1024) + for { + n, err := resp.Body.Read(buf) + if n > 0 { + bytes += int64(n) + for i := 0; i < n; i++ { + if buf[i] == '[' { + brackets++ + } + } + } + if err == io.EOF { + break + } + if err != nil { + return result{}, err + } + } + // Brackets counter incremented for every '[' including the outer + // "columns" wrapper and the "dataset" wrapper; subtract those two. + var rows int64 + if brackets > 1 { + rows = brackets - 2 + } + elapsed := time.Since(start) + logRun("HTTP", warmup, elapsed, rows, strconv.FormatInt(bytes, 10)) + return result{elapsed: elapsed, rows: rows, bytes: bytes}, nil +} + +// ------------------------------------------------------------------ +// Helpers +// ------------------------------------------------------------------ + +func logRun(label string, warmup bool, elapsed time.Duration, rows int64, suffix string) { + phase := "[measure]" + if warmup { + phase = "[warmup]" + } + fmt.Printf("%s %s : %d rows in %d ms (checksum/bytes=%s)\n", + phase, label, rows, elapsed.Milliseconds(), suffix) +} diff --git a/conf_audit_test.go b/conf_audit_test.go new file mode 100644 index 00000000..80a83181 --- /dev/null +++ b/conf_audit_test.go @@ -0,0 +1,490 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "context" + "strings" + "testing" + "time" +) + +// TestConfQwpwsAlias pins the qwpws / qwpwss long-form schema aliases +// from connect-string.md §Protocols and transports: "qwpws / qwpwss +// are accepted as long-form aliases for ws / wss." Same TLS mode and +// transport selection as the short forms. +func TestConfQwpwsAlias(t *testing.T) { + cases := []struct { + schema string + wantTLS tlsMode + }{ + {"ws", tlsDisabled}, + {"qwpws", tlsDisabled}, + {"wss", tlsEnabled}, + {"qwpwss", tlsEnabled}, + } + for _, tc := range cases { + t.Run(tc.schema, func(t *testing.T) { + // Ingest parser. + c, err := confFromStr(tc.schema + "::addr=localhost:9000;") + if err != nil { + t.Fatalf("ingest %s parse: %v", tc.schema, err) + } + if c.senderType != qwpSenderType { + t.Errorf("ingest %s senderType=%v, want qwpSenderType", tc.schema, c.senderType) + } + if c.tlsMode != tc.wantTLS { + t.Errorf("ingest %s tlsMode=%v, want %v", tc.schema, c.tlsMode, tc.wantTLS) + } + + // Egress parser. 
+ qc, err := parseQwpQueryConf(tc.schema + "::addr=localhost:9000;") + if err != nil { + t.Fatalf("egress %s parse: %v", tc.schema, err) + } + if qc.tlsMode != tc.wantTLS { + t.Errorf("egress %s tlsMode=%v, want %v", tc.schema, qc.tlsMode, tc.wantTLS) + } + }) + } +} + +// TestConfQwpwsAliasUnknownSchemaErrorMentionsAliases pins the +// improved error message — a typo like "wsq::" should mention all four +// accepted schemas on the egress side so the user knows the long form +// is also valid. +func TestConfQwpwsAliasUnknownSchemaErrorMentionsAliases(t *testing.T) { + _, err := parseQwpQueryConf("wsq::addr=a:1;") + if err == nil { + t.Fatal("expected error for wsq::") + } + msg := err.Error() + for _, want := range []string{"ws", "wss", "qwpws", "qwpwss"} { + if !strings.Contains(msg, want) { + t.Errorf("error %q does not contain %q", msg, want) + } + } +} + +// TestConfSizeSuffix pins the JVM-style 1024-based size-suffix grammar +// from connect-string.md §Size suffixes. Suffixes are case- +// insensitive and the long forms (kb/mb/gb/tb) match the short forms +// (k/m/g/t). +func TestConfSizeSuffix(t *testing.T) { + cases := []struct { + input string + want int64 + }{ + {"0", 0}, + {"1024", 1024}, + {"1k", 1 << 10}, + {"1K", 1 << 10}, + {"1kb", 1 << 10}, + {"1KB", 1 << 10}, + {"4m", 4 << 20}, + {"4M", 4 << 20}, + {"4mb", 4 << 20}, + {"8m", 8 << 20}, + {"1g", 1 << 30}, + {"1G", 1 << 30}, + {"1gb", 1 << 30}, + {"10g", 10 << 30}, + {"1t", 1 << 40}, + {"1tb", 1 << 40}, + } + for _, tc := range cases { + t.Run(tc.input, func(t *testing.T) { + got, err := parseSizeBytes(tc.input) + if err != nil { + t.Fatalf("parseSizeBytes(%q): %v", tc.input, err) + } + if got != tc.want { + t.Errorf("parseSizeBytes(%q) = %d, want %d", tc.input, got, tc.want) + } + }) + } +} + +// TestConfSizeSuffixRejected covers shapes that must error. 
+func TestConfSizeSuffixRejected(t *testing.T) { + cases := []string{ + "", + "k", // suffix without a number + "abc", // non-numeric + "1.5m", // floats not supported + "-1", // negative bare + "-1m", // negative with suffix + "1xb", // unknown suffix + "1024kb extra", // trailing garbage + } + for _, in := range cases { + t.Run(in, func(t *testing.T) { + if _, err := parseSizeBytes(in); err == nil { + t.Errorf("parseSizeBytes(%q) expected error", in) + } + }) + } +} + +// TestConfSizeSuffixAppliedToKeys verifies the suffix grammar is +// wired into the size-typed connect-string keys end-to-end. +func TestConfSizeSuffixAppliedToKeys(t *testing.T) { + c, err := confFromStr("ws::addr=localhost:9000;" + + "init_buf_size=128k;" + + "max_buf_size=10m;" + + "auto_flush_bytes=4m;" + + "sf_dir=/tmp/sf;sender_id=t;" + + "sf_max_bytes=4m;" + + "sf_max_total_bytes=1g;") + if err != nil { + t.Fatalf("parse: %v", err) + } + if c.initBufSize != 128<<10 { + t.Errorf("initBufSize=%d, want %d", c.initBufSize, 128<<10) + } + if c.maxBufSize != 10<<20 { + t.Errorf("maxBufSize=%d, want %d", c.maxBufSize, 10<<20) + } + if c.autoFlushBytes != 4<<20 { + t.Errorf("autoFlushBytes=%d, want %d", c.autoFlushBytes, 4<<20) + } + if c.sfMaxBytes != int64(4<<20) { + t.Errorf("sfMaxBytes=%d, want %d", c.sfMaxBytes, 4<<20) + } + if c.sfMaxTotalBytes != int64(1<<30) { + t.Errorf("sfMaxTotalBytes=%d, want %d", c.sfMaxTotalBytes, 1<<30) + } +} + +// TestConfSenderIdRejectsDot pins the spec + Java charset: sender_id +// must not contain '.'. Sender.java validateSenderId allows only +// letters / digits / '_' / '-'. The legacy permissive validator +// allowed '.', which deviated from the spec's "no path separators, +// no '.', no spaces" rule. 
+func TestConfSenderIdRejectsDot(t *testing.T) { + _, err := confFromStr("ws::addr=localhost:9000;sf_dir=/tmp/sf;sender_id=foo.bar;") + if err == nil { + t.Fatal("expected error for sender_id with '.'") + } + msg := err.Error() + if !strings.Contains(msg, "sender_id") { + t.Errorf("error %q does not name sender_id", msg) + } + if !strings.Contains(msg, ".") { + t.Errorf("error %q does not show the offending char", msg) + } +} + +// TestConfSenderIdAccepted pins the allowed character set so future +// changes don't accidentally regress. +func TestConfSenderIdAccepted(t *testing.T) { + for _, id := range []string{"a", "Z", "0", "abc-DEF_123", "a_b-c"} { + t.Run(id, func(t *testing.T) { + _, err := confFromStr("ws::addr=localhost:9000;sf_dir=/tmp/sf;sender_id=" + id + ";") + if err != nil { + t.Fatalf("unexpected error for %q: %v", id, err) + } + }) + } +} + +// TestConfQwpAutoFlushBytesDefault pins the spec default of 8 MiB. +// connect-string.md §Auto-flushing: "Default where supported: `8m` +// (8 MiB)." Without this, byte-triggered auto-flush is silently +// disabled on the Go client. +func TestConfQwpAutoFlushBytesDefault(t *testing.T) { + c, err := confFromStr("ws::addr=localhost:9000;") + if err != nil { + t.Fatalf("parse: %v", err) + } + if c.autoFlushBytes != qwpDefaultAutoFlushBytes { + t.Errorf("autoFlushBytes=%d, want %d (8 MiB)", + c.autoFlushBytes, qwpDefaultAutoFlushBytes) + } + if qwpDefaultAutoFlushBytes != 8<<20 { + t.Errorf("qwpDefaultAutoFlushBytes=%d, want %d", qwpDefaultAutoFlushBytes, 8<<20) + } +} + +// TestConfQwpAutoFlushOffZerosBytes pins that `auto_flush=off` also +// clears the new byte default (otherwise users who disable auto-flush +// would still see byte-triggered flushes). 
+func TestConfQwpAutoFlushOffZerosBytes(t *testing.T) { + c, err := confFromStr("ws::addr=localhost:9000;auto_flush=off;") + if err != nil { + t.Fatalf("parse: %v", err) + } + if c.autoFlushBytes != 0 { + t.Errorf("auto_flush=off left autoFlushBytes=%d, want 0", c.autoFlushBytes) + } +} + +// TestConfIngestSilentlyAcceptsEgressKeys is the cross-direction +// silent-accept contract from connect-string.md §16-20 and §Query +// client keys: a ws:: / wss:: Sender must not error on egress-only +// keys, because the same connect string must be shareable with a +// QwpQueryClient. +func TestConfIngestSilentlyAcceptsEgressKeys(t *testing.T) { + // One representative value per egress-only key. Values are + // intentionally a mix of valid and "garbage-from-the-Sender's- + // perspective" forms — the spec says the Sender does not + // interpret them, so even invalid values must pass. + kvs := []string{ + "buffer_pool_size=8", + "compression=zstd", + "compression_level=22", + "failover=off", + "failover_backoff_initial_ms=10", + "failover_backoff_max_ms=2000", + "failover_max_attempts=16", + "failover_max_duration_ms=60000", + "initial_credit=262144", + "max_batch_rows=10000", + } + for _, kv := range kvs { + t.Run(kv, func(t *testing.T) { + conf := "ws::addr=localhost:9000;" + kv + ";" + if _, err := confFromStr(conf); err != nil { + t.Errorf("unexpected error parsing %q: %v", conf, err) + } + }) + } +} + +// TestConfIngestRejectsEgressKeysOnHttp pins that the silent-accept +// is QWP-only — HTTP/TCP senders do not share a connect string with +// QwpQueryClient, so egress-only keys must still error there. 
+func TestConfIngestRejectsEgressKeysOnHttp(t *testing.T) { + _, err := confFromStr("http::addr=localhost:9000;compression=zstd;") + if err == nil { + t.Fatal("expected error: compression on http:: must not be silently accepted") + } + if !strings.Contains(err.Error(), "unsupported option") { + t.Errorf("error %q does not say unsupported option", err.Error()) + } +} + +// TestConfEgressSilentlyAcceptsIngressKeys is the egress side of the +// shared-connect-string contract. A QwpQueryClient must not error on +// ingress-only keys (sf_*, reconnect_*, auto_flush_*, on_*_error, +// etc.). +func TestConfEgressSilentlyAcceptsIngressKeys(t *testing.T) { + kvs := []string{ + "auto_flush=on", + "auto_flush_bytes=8m", + "auto_flush_interval=100", + "auto_flush_rows=1000", + "close_flush_timeout_millis=5000", + "drain_orphans=off", + "durable_ack_keepalive_interval_millis=200", + "error_inbox_capacity=256", + "init_buf_size=64k", + "initial_connect_retry=off", + "max_background_drainers=4", + "max_buf_size=100m", + "max_name_len=127", + "on_internal_error=halt", + "on_parse_error=halt", + "on_schema_error=halt", + "on_security_error=halt", + "on_server_error=auto", + "on_write_error=halt", + "reconnect_initial_backoff_millis=100", + "reconnect_max_backoff_millis=5000", + "reconnect_max_duration_millis=300000", + "request_durable_ack=off", + "sender_id=ingest-1", + "sf_append_deadline_millis=30000", + "sf_dir=/tmp/sf", + "sf_durability=memory", + "sf_max_bytes=4m", + "sf_max_total_bytes=10g", + } + for _, kv := range kvs { + t.Run(kv, func(t *testing.T) { + conf := "ws::addr=localhost:9000;" + kv + ";" + if _, err := parseQwpQueryConf(conf); err != nil { + t.Errorf("unexpected error parsing %q: %v", conf, err) + } + }) + } +} + +// TestConfSharedConnectString is the end-to-end check on the +// shared-connect-string contract: one string with both ingress-only +// and egress-only keys must parse successfully on both sides. 
+func TestConfSharedConnectString(t *testing.T) { + shared := "wss::addr=db-a:9000,db-b:9000;" + + "token=my-token;" + + "target=primary;" + + "zone=eu-west-1;" + + // ingress-only: + "sf_dir=/tmp/sf;sender_id=ingest-1;auto_flush_rows=500;" + + "reconnect_max_duration_millis=120000;" + + "on_schema_error=drop;" + + // egress-only: + "compression=zstd;compression_level=3;" + + "failover_max_attempts=8;failover_max_duration_ms=30000;" + if _, err := confFromStr(shared); err != nil { + t.Errorf("ingest parser rejected the shared connect string: %v", err) + } + if _, err := parseQwpQueryConf(shared); err != nil { + t.Errorf("egress parser rejected the shared connect string: %v", err) + } +} + +// TestConfQwpRejectsRetryTimeout pins the fix for the silent-drop +// audit finding: retry_timeout is HTTP-only (legacy ILP doc) and is +// not listed in connect-string.md, and Sender.java:3412 rejects it +// on the WebSocket protocol. The Go QWP sanitizer now rejects too, +// pointing the user at the QWP analogue. +func TestConfQwpRejectsRetryTimeout(t *testing.T) { + for _, schema := range []string{"ws", "wss", "qwpws", "qwpwss"} { + t.Run(schema, func(t *testing.T) { + _, err := LineSenderFromConf(context.Background(), + schema+"::addr=localhost:9000;retry_timeout=10000;") + if err == nil { + t.Fatal("expected error: retry_timeout must not be accepted on QWP") + } + msg := err.Error() + if !strings.Contains(msg, "retry_timeout") { + t.Errorf("error %q does not name retry_timeout", msg) + } + if !strings.Contains(msg, "reconnect_max_duration_millis") { + t.Errorf("error %q does not point to the QWP analogue", msg) + } + }) + } +} + +// TestConfQwpRejectsWithRetryTimeoutOption pins the same reject on +// the functional-option path so users who reach for WithRetryTimeout +// on a QWP sender get the same error as the connect-string path. +// (The WithRetryTimeout doc comment already says "Only available for +// the HTTP sender"; this is the enforcement.) 
+func TestConfQwpRejectsWithRetryTimeoutOption(t *testing.T) { + _, err := NewLineSender(context.Background(), + WithQwp(), + WithAddress("localhost:9000"), + WithRetryTimeout(5*time.Second)) + if err == nil { + t.Fatal("expected error: WithRetryTimeout must not be accepted on QWP") + } + if !strings.Contains(err.Error(), "retry_timeout") { + t.Errorf("error %q does not name retry_timeout", err.Error()) + } +} + +// TestConfRejectsCloseTimeoutWithMigrationHint pins the removal of +// the Go-only `close_timeout` key. Java never accepted it (only +// close_flush_timeout_millis, Sender.java §3071), and the cursor +// architecture unified the memory and SF close paths onto the +// spec-aligned key. The parser rejects regardless of schema with a +// migration hint, not the generic "unsupported option" error. +func TestConfRejectsCloseTimeoutWithMigrationHint(t *testing.T) { + for _, schema := range []string{"ws", "wss", "qwpws", "qwpwss", "http", "https", "tcp", "tcps"} { + t.Run(schema, func(t *testing.T) { + _, err := confFromStr(schema + "::addr=localhost:9000;close_timeout=1000;") + if err == nil { + t.Fatal("expected error: close_timeout must not be accepted") + } + msg := err.Error() + if !strings.Contains(msg, "close_timeout") { + t.Errorf("error %q does not name close_timeout", msg) + } + if !strings.Contains(msg, "close_flush_timeout_millis") { + t.Errorf("error %q does not point at close_flush_timeout_millis", msg) + } + }) + } +} + +// TestConfMemoryModeHonoursCloseFlushTimeout pins the unification: +// memory mode (no sf_dir) now reads close_flush_timeout_millis, not +// the removed close_timeout. With the cursor architecture sharing +// the same engine across memory and SF modes, both keys map to the +// same runtime field via the spec-aligned name. 
+func TestConfMemoryModeHonoursCloseFlushTimeout(t *testing.T) { + c, err := confFromStr("ws::addr=localhost:9000;close_flush_timeout_millis=2500;") + if err != nil { + t.Fatalf("parse: %v", err) + } + if !c.closeFlushTimeoutSet { + t.Error("closeFlushTimeoutSet=false after explicit user value") + } + if c.closeFlushTimeoutMillis != 2500 { + t.Errorf("closeFlushTimeoutMillis=%d, want 2500", c.closeFlushTimeoutMillis) + } +} + +// TestWithCloseTimeoutSubMillisecondIsNoOverride pins that the +// deprecated alias honours its documented "d <= 0 is treated as no +// override" semantics for sub-millisecond positive durations too. +// Without this gate, d=500µs satisfies d > 0, truncates to 0 ms, +// sets closeFlushTimeoutSet=true, and routes into the fast-close +// branch (qwp_sender_cursor.go:167, sender.go:1493), contradicting +// the doc. Callers who actually want fast-close must opt in via +// WithCloseFlushTimeout. +func TestWithCloseTimeoutSubMillisecondIsNoOverride(t *testing.T) { + for _, d := range []time.Duration{ + 0, + -1 * time.Second, + 1 * time.Nanosecond, + 500 * time.Microsecond, + 999 * time.Microsecond, + } { + t.Run(d.String(), func(t *testing.T) { + c := newLineSenderConfig(qwpSenderType) + WithCloseTimeout(d)(c) + if c.closeFlushTimeoutSet { + t.Errorf("closeFlushTimeoutSet=true for d=%s; want no override", d) + } + if c.closeFlushTimeoutMillis != 0 { + t.Errorf("closeFlushTimeoutMillis=%d for d=%s; want 0 (untouched)", c.closeFlushTimeoutMillis, d) + } + }) + } + // Sanity: the smallest representable positive value, 1ms, must + // still override (the gate is inclusive at the ms boundary). 
+ c := newLineSenderConfig(qwpSenderType) + WithCloseTimeout(time.Millisecond)(c) + if !c.closeFlushTimeoutSet || c.closeFlushTimeoutMillis != 1 { + t.Errorf("WithCloseTimeout(1ms): set=%v millis=%d; want set=true millis=1", + c.closeFlushTimeoutSet, c.closeFlushTimeoutMillis) + } +} + +// TestConfRejectsUnknownKeyOnBothSides confirms that a genuinely +// unknown key (not in either spec set) still errors out, so the +// silent-accept is scoped. +func TestConfRejectsUnknownKeyOnBothSides(t *testing.T) { + bad := "ws::addr=localhost:9000;not_a_real_key=42;" + if _, err := confFromStr(bad); err == nil { + t.Error("ingest: expected error for not_a_real_key") + } + if _, err := parseQwpQueryConf(bad); err == nil { + t.Error("egress: expected error for not_a_real_key") + } +} diff --git a/conf_parse.go b/conf_parse.go index df2e2b76..664cb8e6 100644 --- a/conf_parse.go +++ b/conf_parse.go @@ -36,6 +36,66 @@ type configData struct { KeyValuePairs map[string]string } +// egressOnlyKeys lists connect-string keys defined by the spec for the +// QwpQueryClient (egress) only. The ingress LineSender silently +// accepts them when the schema is ws:: / wss:: so that one connect +// string can drive both Sender and QwpQueryClient — per +// connect-string.md §16-20 ("Each direction reads the keys relevant +// to it and ignores keys meant only for the other direction") and +// §Query client keys ("The Sender (ingress) silently consumes the +// same keys ... the Sender does not interpret the values"). Range, +// enum, and type checks for these keys happen on the egress side +// only. 
+var egressOnlyKeys = map[string]bool{ + "buffer_pool_size": true, + "compression": true, + "compression_level": true, + "failover": true, + "failover_backoff_initial_ms": true, + "failover_backoff_max_ms": true, + "failover_max_attempts": true, + "failover_max_duration_ms": true, + "initial_credit": true, + "max_batch_rows": true, +} + +// ingressOnlyKeys lists connect-string keys defined by the spec for +// the ingress LineSender only. The egress QwpQueryClient silently +// accepts them so a shared connect string works in both directions. +// Same SSOT as egressOnlyKeys; the lists are kept in sync with +// connect-string.md §Key index. +var ingressOnlyKeys = map[string]bool{ + "auto_flush": true, + "auto_flush_bytes": true, + "auto_flush_interval": true, + "auto_flush_rows": true, + "close_flush_timeout_millis": true, + "drain_orphans": true, + "durable_ack_keepalive_interval_millis": true, + "error_inbox_capacity": true, + "init_buf_size": true, + "initial_connect_retry": true, + "max_background_drainers": true, + "max_buf_size": true, + "max_name_len": true, + "on_internal_error": true, + "on_parse_error": true, + "on_schema_error": true, + "on_security_error": true, + "on_server_error": true, + "on_write_error": true, + "reconnect_initial_backoff_millis": true, + "reconnect_max_backoff_millis": true, + "reconnect_max_duration_millis": true, + "request_durable_ack": true, + "sender_id": true, + "sf_append_deadline_millis": true, + "sf_dir": true, + "sf_durability": true, + "sf_max_bytes": true, + "sf_max_total_bytes": true, +} + func confFromStr(conf string) (*lineSenderConfig, error) { var senderConf *lineSenderConfig @@ -55,9 +115,12 @@ func confFromStr(conf string) (*lineSenderConfig, error) { case "tcps": senderConf = newLineSenderConfig(tcpSenderType) senderConf.tlsMode = tlsEnabled - case "ws": + case "ws", "qwpws": + // connect-string.md §Protocols and transports: qwpws is a + // long-form alias for ws. 
Same TLS mode (disabled), same + // transport selection. senderConf = newLineSenderConfig(qwpSenderType) - case "wss": + case "wss", "qwpwss": senderConf = newLineSenderConfig(qwpSenderType) senderConf.tlsMode = tlsEnabled default: @@ -125,30 +188,38 @@ func confFromStr(conf string) (*lineSenderConfig, error) { } senderConf.autoFlushInterval = time.Duration(parsedVal) * time.Millisecond case "auto_flush_bytes": + senderConf.autoFlushBytesSet = true if v == "off" { senderConf.autoFlushBytes = 0 continue } - parsedVal, err := strconv.Atoi(v) + parsedVal, err := parseSizeBytes(v) if err != nil { - return nil, NewInvalidConfigStrError("invalid %s value, %q is not a valid int", k, v) + return nil, NewInvalidConfigStrError("invalid %s value, %q: %v", k, v, err) + } + senderConf.autoFlushBytes = int(parsedVal) + case "init_buf_size", "max_buf_size": + // Size-typed (connect-string.md §Size suffixes); accept + // JVM-style k/kb/m/mb/g/gb/t/tb suffixes alongside bare + // bytes. + parsedVal, err := parseSizeBytes(v) + if err != nil { + return nil, NewInvalidConfigStrError("invalid %s value, %q: %v", k, v, err) + } + if k == "init_buf_size" { + senderConf.initBufSize = int(parsedVal) + } else { + senderConf.maxBufSize = int(parsedVal) } - senderConf.autoFlushBytes = parsedVal - case "request_min_throughput", "init_buf_size", "max_buf_size": + case "request_min_throughput", "max_name_len": parsedVal, err := strconv.Atoi(v) if err != nil { return nil, NewInvalidConfigStrError("invalid %s value, %q is not a valid int", k, v) } - - switch k { - case "request_min_throughput": + if k == "request_min_throughput" { senderConf.minThroughput = parsedVal - case "init_buf_size": - senderConf.initBufSize = parsedVal - case "max_buf_size": - senderConf.maxBufSize = parsedVal - default: - panic("add a case for " + k) + } else { + senderConf.fileNameLimit = parsedVal } case "request_timeout", "retry_timeout": timeout, err := strconv.Atoi(v) @@ -200,6 +271,157 @@ func confFromStr(conf 
string) (*lineSenderConfig, error) { } senderConf.inFlightWindow = parsedVal case "close_timeout": + // Java client never accepted close_timeout — only + // close_flush_timeout_millis (Sender.java §3071). The + // legacy Go-only key was a v4.0-era memory-mode knob; + // the cursor architecture (CLAUDE.md) unified memory and + // SF paths onto close_flush_timeout_millis. Reject with + // a migration hint rather than silently dropping or + // going through the generic "unsupported option" path. + return nil, NewInvalidConfigStrError( + "close_timeout is no longer supported; use close_flush_timeout_millis instead") + case "gorilla": + if senderConf.senderType != qwpSenderType { + return nil, NewInvalidConfigStrError("%s is only supported for QWP senders", k) + } + switch v { + case "on": + senderConf.gorillaDisabled = false + case "off": + senderConf.gorillaDisabled = true + default: + return nil, NewInvalidConfigStrError("invalid gorilla value, %q is not 'on' or 'off'", v) + } + case "sf_dir": + if senderConf.senderType != qwpSenderType { + return nil, NewInvalidConfigStrError("%s is only supported for QWP senders", k) + } + senderConf.sfDir = v + case "sender_id": + if senderConf.senderType != qwpSenderType { + return nil, NewInvalidConfigStrError("%s is only supported for QWP senders", k) + } + if err := validateSenderId(v); err != nil { + return nil, err + } + senderConf.senderId = v + case "sf_max_bytes": + if senderConf.senderType != qwpSenderType { + return nil, NewInvalidConfigStrError("%s is only supported for QWP senders", k) + } + parsedVal, err := parseSizeBytes(v) + if err != nil || parsedVal <= 0 { + return nil, NewInvalidConfigStrError("invalid %s value, %q must be a positive size", k, v) + } + senderConf.sfMaxBytes = parsedVal + case "sf_max_total_bytes": + if senderConf.senderType != qwpSenderType { + return nil, NewInvalidConfigStrError("%s is only supported for QWP senders", k) + } + parsedVal, err := parseSizeBytes(v) + if err != nil || parsedVal 
<= 0 { + return nil, NewInvalidConfigStrError("invalid %s value, %q must be a positive size", k, v) + } + senderConf.sfMaxTotalBytes = parsedVal + case "sf_durability": + if senderConf.senderType != qwpSenderType { + return nil, NewInvalidConfigStrError("%s is only supported for QWP senders", k) + } + if err := validateSfDurability(v); err != nil { + return nil, err + } + senderConf.sfDurability = v + case "sf_append_deadline_millis": + if senderConf.senderType != qwpSenderType { + return nil, NewInvalidConfigStrError("%s is only supported for QWP senders", k) + } + parsedVal, err := strconv.Atoi(v) + if err != nil || parsedVal <= 0 { + return nil, NewInvalidConfigStrError("invalid %s value, %q must be a positive int (milliseconds)", k, v) + } + senderConf.sfAppendDeadlineMillis = parsedVal + case "auth_timeout_ms": + if senderConf.senderType != qwpSenderType { + return nil, NewInvalidConfigStrError("%s is only supported for QWP senders", k) + } + parsedVal, err := strconv.Atoi(v) + if err != nil || parsedVal <= 0 { + return nil, NewInvalidConfigStrError("invalid %s value, %q must be a positive int (milliseconds)", k, v) + } + senderConf.authTimeoutMs = parsedVal + case "zone": + if senderConf.senderType != qwpSenderType { + return nil, NewInvalidConfigStrError("%s is only supported for QWP senders", k) + } + // Egress consumes this via the (state, zone) priority + // lattice (failover.md §2); the ingestion path does not + // route by zone, so the value never reaches the SF tracker. + // Silently accepted on both so a single connect string works + // across ingress and egress clients without per-startup + // noise. + senderConf.zone = v + case "target": + if senderConf.senderType != qwpSenderType { + return nil, NewInvalidConfigStrError("%s is only supported for QWP senders", k) + } + // Egress consumes this as the connect-walk role filter, + // matching against the server's advertised role. 
The + // ingestion path does not route by role (role-based + // endpoint selection is egress-only), so target is accepted + // but inert on ingestion, symmetric with zone above. Parsed + // here so a malformed value is still rejected on both paths. + t, err := parseTargetFilter(v) + if err != nil { + return nil, NewInvalidConfigStrError("%v", err) + } + senderConf.target = t + case "reconnect_max_duration_millis": + if senderConf.senderType != qwpSenderType { + return nil, NewInvalidConfigStrError("%s is only supported for QWP senders", k) + } + parsedVal, err := strconv.Atoi(v) + if err != nil || parsedVal < 0 { + return nil, NewInvalidConfigStrError("invalid %s value, %q must be a non-negative int (milliseconds)", k, v) + } + senderConf.reconnectMaxDurationMillis = parsedVal + senderConf.reconnectMaxDurationMillisSet = true + case "reconnect_initial_backoff_millis": + if senderConf.senderType != qwpSenderType { + return nil, NewInvalidConfigStrError("%s is only supported for QWP senders", k) + } + parsedVal, err := strconv.Atoi(v) + if err != nil || parsedVal <= 0 { + return nil, NewInvalidConfigStrError("invalid %s value, %q must be a positive int (milliseconds)", k, v) + } + senderConf.reconnectInitialBackoffMillis = parsedVal + senderConf.reconnectInitialBackoffMillisSet = true + case "reconnect_max_backoff_millis": + if senderConf.senderType != qwpSenderType { + return nil, NewInvalidConfigStrError("%s is only supported for QWP senders", k) + } + parsedVal, err := strconv.Atoi(v) + if err != nil || parsedVal <= 0 { + return nil, NewInvalidConfigStrError("invalid %s value, %q must be a positive int (milliseconds)", k, v) + } + senderConf.reconnectMaxBackoffMillis = parsedVal + senderConf.reconnectMaxBackoffMillisSet = true + case "initial_connect_retry": + if senderConf.senderType != qwpSenderType { + return nil, NewInvalidConfigStrError("%s is only supported for QWP senders", k) + } + switch v { + case "on", "true", "sync": + senderConf.initialConnectMode = 
InitialConnectSync + case "off", "false": + senderConf.initialConnectMode = InitialConnectOff + case "async": + senderConf.initialConnectMode = InitialConnectAsync + default: + return nil, NewInvalidConfigStrError( + "invalid %s value, %q is not 'on' / 'off' / 'true' / 'false' / 'sync' / 'async'", k, v) + } + senderConf.initialConnectModeSet = true + case "close_flush_timeout_millis": if senderConf.senderType != qwpSenderType { return nil, NewInvalidConfigStrError("%s is only supported for QWP senders", k) } @@ -207,8 +429,60 @@ func confFromStr(conf string) (*lineSenderConfig, error) { if err != nil { return nil, NewInvalidConfigStrError("invalid %s value, %q is not a valid int (milliseconds)", k, v) } - senderConf.closeTimeout = time.Duration(parsedVal) * time.Millisecond - case "max_schemas_per_connection": + senderConf.closeFlushTimeoutSet = true + senderConf.closeFlushTimeoutMillis = parsedVal + case "drain_orphans": + if senderConf.senderType != qwpSenderType { + return nil, NewInvalidConfigStrError("%s is only supported for QWP senders", k) + } + switch v { + case "on", "true": + senderConf.drainOrphans = true + case "off", "false": + senderConf.drainOrphans = false + default: + return nil, NewInvalidConfigStrError( + "invalid %s value, %q is not 'on' / 'off' / 'true' / 'false'", k, v) + } + case "max_background_drainers": + if senderConf.senderType != qwpSenderType { + return nil, NewInvalidConfigStrError("%s is only supported for QWP senders", k) + } + parsedVal, err := strconv.Atoi(v) + if err != nil || parsedVal < 0 { + return nil, NewInvalidConfigStrError("invalid %s value, %q must be a non-negative int", k, v) + } + senderConf.maxBackgroundDrainers = parsedVal + case "on_server_error": + if senderConf.senderType != qwpSenderType { + return nil, NewInvalidConfigStrError("%s is only supported for QWP senders", k) + } + pol, err := parseErrorPolicyValue(k, v, true) + if err != nil { + return nil, err + } + senderConf.errorPolicyGlobal = pol + case 
"on_schema_error": + if err := setPerCategoryPolicy(senderConf, k, v, CategorySchemaMismatch); err != nil { + return nil, err + } + case "on_parse_error": + if err := setPerCategoryPolicy(senderConf, k, v, CategoryParseError); err != nil { + return nil, err + } + case "on_internal_error": + if err := setPerCategoryPolicy(senderConf, k, v, CategoryInternalError); err != nil { + return nil, err + } + case "on_security_error": + if err := setPerCategoryPolicy(senderConf, k, v, CategorySecurityError); err != nil { + return nil, err + } + case "on_write_error": + if err := setPerCategoryPolicy(senderConf, k, v, CategoryWriteError); err != nil { + return nil, err + } + case "error_inbox_capacity": if senderConf.senderType != qwpSenderType { return nil, NewInvalidConfigStrError("%s is only supported for QWP senders", k) } @@ -216,20 +490,64 @@ func confFromStr(conf string) (*lineSenderConfig, error) { if err != nil { return nil, NewInvalidConfigStrError("invalid %s value, %q is not a valid int", k, v) } - senderConf.maxSchemasPerConnection = parsedVal - case "gorilla": + if parsedVal < qwpSfMinErrorInboxCapacity { + return nil, NewInvalidConfigStrError( + "invalid %s value, %d: must be >= %d", + k, parsedVal, qwpSfMinErrorInboxCapacity) + } + senderConf.errorInboxCapacity = parsedVal + case "request_durable_ack": if senderConf.senderType != qwpSenderType { + // sf-client.md §4.6 mandates rejecting + // request_durable_ack=on on non-WebSocket transports. + // QWP (ws/wss) is the only WebSocket transport here, so + // a non-QWP sender can never honour it -- reject the key + // outright, consistent with every other SF key. return nil, NewInvalidConfigStrError("%s is only supported for QWP senders", k) } switch v { - case "on": - senderConf.gorillaDisabled = false - case "off": - senderConf.gorillaDisabled = true + case "off", "false": + // The default. Non-durable, OK-driven trim is fully + // conformant (sf-client.md §9.2 / §19); nothing to wire. 
+ case "on", "true": + // Durable-ack mode (sf-client.md §4.3 / §8.1 / §9.3 / + // §10 / §11) is a deferred opt-in, EE-only QoS feature: + // the cursor send loop OK-trims and silently ignores + // DURABLE_ACK frames (qwp_sf_send_loop.go). §19 makes + // the key normative so we accept it, but opting in is + // rejected with a clear deferred-feature message rather + // than the generic "unsupported option", mirroring + // sf_durability=flush. + return nil, NewInvalidConfigStrError( + "request_durable_ack=%s is not yet supported: durable-ack mode is not implemented in this client (deferred follow-up; use request_durable_ack=off)", v) default: - return nil, NewInvalidConfigStrError("invalid gorilla value, %q is not 'on' or 'off'", v) + return nil, NewInvalidConfigStrError( + "invalid %s value, %q is not 'on' / 'off' / 'true' / 'false'", k, v) + } + case "durable_ack_keepalive_interval_millis": + if senderConf.senderType != qwpSenderType { + return nil, NewInvalidConfigStrError("%s is only supported for QWP senders", k) + } + // Accepted for connect-string portability (sf-client.md + // §4.3 / §19) but inert: it only paces keepalive PINGs in + // durable-ack mode, which this client does not implement + // (see request_durable_ack). Validate the shape so a typo + // still errors helpfully; 0 / negative mean "disabled" per + // spec, so any int is in range. + if _, err := strconv.Atoi(v); err != nil { + return nil, NewInvalidConfigStrError( + "invalid %s value, %q is not a valid int (milliseconds)", k, v) } default: + if senderConf.senderType == qwpSenderType && egressOnlyKeys[k] { + // Silently accepted on ingress so a single ws:: / wss:: + // connect string can configure both Sender and + // QwpQueryClient. The Sender does not interpret the + // value — range/enum/type checks run on the egress side + // (qwp_query_conf.go). connect-string.md §16-20 and + // §Query client keys are the load-bearing spec text. 
+ continue + } return nil, NewInvalidConfigStrError("unsupported option %q", k) } } @@ -237,6 +555,140 @@ func confFromStr(conf string) (*lineSenderConfig, error) { return senderConf, nil } +// parseErrorPolicyValue parses a connect-string Policy value. When +// allowAuto is true, "auto" is accepted (used by the global +// on_server_error key whose default semantic is "use the per-category +// table"); per-category keys reject "auto" because the sentinel is +// only meaningful at the global layer. +func parseErrorPolicyValue(k, v string, allowAuto bool) (Policy, error) { + switch v { + case "halt": + return PolicyHalt, nil + case "drop": + return PolicyDropAndContinue, nil + case "auto": + if allowAuto { + return PolicyAuto, nil + } + } + if allowAuto { + return PolicyAuto, NewInvalidConfigStrError( + "invalid %s value, %q is not 'auto' / 'halt' / 'drop'", k, v) + } + return PolicyAuto, NewInvalidConfigStrError( + "invalid %s value, %q is not 'halt' / 'drop'", k, v) +} + +// setPerCategoryPolicy parses v as a Policy and stores it on the +// per-category override slot for c, gating to QWP and setting the +// per-category-set flag for sanitizer routing. +func setPerCategoryPolicy(conf *lineSenderConfig, k, v string, c Category) error { + if conf.senderType != qwpSenderType { + return NewInvalidConfigStrError("%s is only supported for QWP senders", k) + } + pol, err := parseErrorPolicyValue(k, v, false) + if err != nil { + return err + } + conf.errorPolicyPerCat[c] = pol + conf.errorPolicyPerCatSet = true + return nil +} + +// validateSfDurability checks an sf_durability value. The empty +// string means "unset" (defaults to memory at construction); only +// "memory" is currently honoured. "flush" / "append" are reserved +// for a deferred follow-up and rejected with a pointer to the +// supported value. 
Shared by the connect-string parser and +// sanitizeQwpConf so the WithSfDurability functional-option path +// rejects identically — single source of truth for the value space. +func validateSfDurability(v string) error { + switch v { + case "", "memory": + return nil + case "flush", "append": + return NewInvalidConfigStrError( + "sf_durability=%s is not yet supported (deferred follow-up; use sf_durability=memory)", v) + default: + return NewInvalidConfigStrError( + "invalid sf_durability value, %q is not 'memory' (other values reserved for future use)", v) + } +} + +// parseSizeBytes parses a size-typed connect-string value: a non- +// negative decimal integer optionally followed by a JVM-style 1024- +// based size suffix. connect-string.md §Size suffixes: suffixes are +// case-insensitive (k / kb / m / mb / g / gb / t / tb). Plain +// integers (no suffix) are parsed as bytes. Returns an error for +// empty input, non-numeric prefixes, unknown suffixes, negative +// values, or int64 overflow. +// +// The longest known suffix wins ("kb" before "k"), so "1kb" is 1024 +// and not 1 followed by an unparsed "kb". 
func parseSizeBytes(v string) (int64, error) {
	// Largest value an int64 can hold, spelled as a constant expression so
	// the function depends on nothing beyond fmt, strconv and strings.
	const maxInt64 = 1<<63 - 1

	if v == "" {
		return 0, fmt.Errorf("empty size value")
	}

	// Suffix matching is case-insensitive. All slicing happens on the
	// lower-cased copy; the caller's original spelling is kept in v for
	// error messages only.
	s := strings.ToLower(v)
	mult := int64(1)
	switch {
	case strings.HasSuffix(s, "kb"):
		mult, s = 1<<10, s[:len(s)-2]
	case strings.HasSuffix(s, "mb"):
		mult, s = 1<<20, s[:len(s)-2]
	case strings.HasSuffix(s, "gb"):
		mult, s = 1<<30, s[:len(s)-2]
	case strings.HasSuffix(s, "tb"):
		mult, s = 1<<40, s[:len(s)-2]
	case strings.HasSuffix(s, "k"):
		mult, s = 1<<10, s[:len(s)-1]
	case strings.HasSuffix(s, "m"):
		mult, s = 1<<20, s[:len(s)-1]
	case strings.HasSuffix(s, "g"):
		mult, s = 1<<30, s[:len(s)-1]
	case strings.HasSuffix(s, "t"):
		mult, s = 1<<40, s[:len(s)-1]
	}

	// A bare suffix such as "kb" leaves nothing to parse; report that
	// explicitly instead of surfacing strconv's empty-string error.
	if s == "" {
		return 0, fmt.Errorf("no number before size suffix in %q", v)
	}

	n, err := strconv.ParseInt(s, 10, 64)
	if err != nil {
		// %w keeps the message text identical to %v while letting callers
		// reach the underlying *strconv.NumError via errors.As.
		return 0, fmt.Errorf("invalid number %q: %w", s, err)
	}
	if n < 0 {
		return 0, fmt.Errorf("size %q must be non-negative", v)
	}

	// At this point n >= 0 and mult >= 1, so n*mult is representable
	// exactly when n <= maxInt64/mult. Bounding against the real int64
	// maximum (rather than 1<<62) avoids rejecting products that fit.
	if n > maxInt64/mult {
		return 0, fmt.Errorf("size %q overflows int64", v)
	}
	return n * mult, nil
}

// validateSenderId enforces the same character set the Java client
// allows for sender_id: ASCII letters, digits, '-', '_'. Matches
// Sender.java validateSenderId (no '.', no path separators, no
// spaces) and the connect-string spec at §Store-and-forward "Allowed
// characters: letters, digits, `_`, `-`". The value is used as a path
// segment under sf_dir; '.' is excluded to keep slot names stable
// across filesystems and avoid '..' surprises.
+func validateSenderId(id string) error { + if id == "" { + return NewInvalidConfigStrError("sender_id must not be empty") + } + for i := 0; i < len(id); i++ { + c := id[i] + ok := (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || + (c >= '0' && c <= '9') || c == '-' || c == '_' + if !ok { + return NewInvalidConfigStrError( + "sender_id contains invalid character: %q (allowed: letters, digits, _ -)", + string(c)) + } + } + return nil +} + func parseConfigStr(conf string) (configData, error) { var ( key = &strings.Builder{} @@ -296,7 +748,22 @@ func parseConfigStr(conf string) (configData, error) { return result, NewInvalidConfigStrError("empty value for key %q", key) } - result.KeyValuePairs[key.String()] = value.String() + // Reject duplicate keys (case-sensitive) for parity with Rust and + // the per-field checks in Java; otherwise dups would silently LWW. + // `addr` is the documented exception: the failover spec (§1) + // allows `addr=h1;addr=h2` as an alternative spelling of + // `addr=h1,h2`. Both forms accumulate into a single + // comma-joined value so downstream parsers see one shape. + keyStr := key.String() + if existing, exists := result.KeyValuePairs[keyStr]; exists { + if keyStr == "addr" { + result.KeyValuePairs[keyStr] = existing + "," + value.String() + } else { + return result, NewInvalidConfigStrError("duplicate key %q", keyStr) + } + } else { + result.KeyValuePairs[keyStr] = value.String() + } key.Reset() value.Reset() diff --git a/conf_test.go b/conf_test.go index 1e9c5733..803a2a5d 100644 --- a/conf_test.go +++ b/conf_test.go @@ -25,7 +25,9 @@ package questdb_test import ( + "context" "fmt" + "io" "testing" "time" @@ -265,6 +267,31 @@ func TestParserHappyCases(t *testing.T) { }, }, }, + { + // failover.md §1: `addr=h1;addr=h2` is an alternative + // spelling of `addr=h1,h2`. The parser MUST accumulate + // both forms into a single comma-joined value. 
+ name: "ws addr accumulates across repeated keys", + config: "ws::addr=h1:9000;addr=h2:9000;addr=h3:9000;", + expected: qdb.ConfigData{ + Schema: "ws", + KeyValuePairs: map[string]string{ + "addr": "h1:9000,h2:9000,h3:9000", + }, + }, + }, + { + // Comma-form already parses today; accumulator must not + // double-comma when mixing with repeated keys. + name: "ws addr accumulates mixed comma and repeated forms", + config: "ws::addr=h1:9000,h2:9000;addr=h3:9000;", + expected: qdb.ConfigData{ + Schema: "ws", + KeyValuePairs: map[string]string{ + "addr": "h1:9000,h2:9000,h3:9000", + }, + }, + }, } for _, tc := range testCases { @@ -308,6 +335,11 @@ func TestParserPathologicalCases(t *testing.T) { config: "http::addr=localhost:9000;username=test;password=pass;word", expectedErrMsgContains: "unexpected end of", }, + { + name: "duplicate on_server_error", + config: "ws::addr=localhost:9000;on_server_error=auto;on_server_error=halt;", + expectedErrMsgContains: `duplicate key \"on_server_error\"`, + }, } for _, tc := range testCases { @@ -321,6 +353,19 @@ func TestParserPathologicalCases(t *testing.T) { } } +func TestParserKeysAreCaseSensitive(t *testing.T) { + // Same key bytes in different case are distinct, matching the Rust + // client. The lowercase `addr` is recognized; the uppercase `ADDR` + // is parsed but later rejected as an unsupported option. 
+ parsed, err := qdb.ParseConfigStr("http::addr=localhost:9000;ADDR=localhost:9001;") + assert.NoError(t, err) + assert.Equal(t, "localhost:9000", parsed.KeyValuePairs["addr"]) + assert.Equal(t, "localhost:9001", parsed.KeyValuePairs["ADDR"]) + + _, err = qdb.ConfFromStr("http::addr=localhost:9000;ADDR=localhost:9001;") + assert.ErrorContains(t, err, "unsupported option") +} + type configTestCase struct { name string config string @@ -374,6 +419,15 @@ func TestHappyCasesFromConf(t *testing.T) { qdb.WithMaxBufferSize(maxBufSize), }, }, + { + name: "max_name_len", + config: fmt.Sprintf("http::addr=%s;max_name_len=64;", addr), + expectedOpts: []qdb.LineSenderOption{ + qdb.WithHttp(), + qdb.WithAddress(addr), + qdb.WithFileNameLimit(64), + }, + }, { name: "with tls", config: fmt.Sprintf("tcp::addr=%s;tls_verify=on;", @@ -514,15 +568,18 @@ func TestHappyCasesFromConf(t *testing.T) { }, }, { - name: "ws with auto_flush and retry_timeout", - config: fmt.Sprintf("ws::addr=%s;auto_flush_rows=100;auto_flush_interval=500;retry_timeout=%d;", - addr, retryTimeout.Milliseconds()), + // retry_timeout is intentionally NOT paired with ws here: the + // parser maps it to WithRetryTimeout for any schema, but the + // QWP sanitizer rejects it (see TestQwpSanitizeRejectsRetryTimeout), + // so a ws connect string carrying it never reaches a sender. 
+ name: "ws with auto_flush", + config: fmt.Sprintf("ws::addr=%s;auto_flush_rows=100;auto_flush_interval=500;", + addr), expectedOpts: []qdb.LineSenderOption{ qdb.WithQwp(), qdb.WithAddress(addr), qdb.WithAutoFlushRows(100), qdb.WithAutoFlushInterval(500 * time.Millisecond), - qdb.WithRetryTimeout(retryTimeout), }, }, { @@ -561,6 +618,57 @@ func TestHappyCasesFromConf(t *testing.T) { qdb.WithGorilla(true), }, }, + { + name: "ws with auth_timeout_ms", + config: fmt.Sprintf("ws::addr=%s;auth_timeout_ms=7000;", addr), + expectedOpts: []qdb.LineSenderOption{ + qdb.WithQwp(), + qdb.WithAddress(addr), + qdb.WithAuthTimeout(7 * time.Second), + }, + }, + { + name: "ws with target=primary", + config: fmt.Sprintf("ws::addr=%s;target=primary;", addr), + expectedOpts: []qdb.LineSenderOption{ + qdb.WithQwp(), + qdb.WithAddress(addr), + qdb.WithTarget(qdb.QwpTargetPrimary), + }, + }, + { + name: "ws with zone", + config: fmt.Sprintf("ws::addr=%s;zone=az-1;", addr), + expectedOpts: []qdb.LineSenderOption{ + qdb.WithQwp(), + qdb.WithAddress(addr), + qdb.WithZone("az-1"), + }, + }, + { + name: "ws with on_server_error", + config: fmt.Sprintf("ws::addr=%s;on_server_error=halt;", addr), + expectedOpts: []qdb.LineSenderOption{ + qdb.WithQwp(), + qdb.WithAddress(addr), + qdb.WithServerErrorPolicy(qdb.PolicyHalt), + }, + }, + { + name: "ws with sf cursor knobs", + config: fmt.Sprintf( + "ws::addr=%s;sf_dir=/tmp/sf;sf_durability=memory;sf_append_deadline_millis=20000;drain_orphans=on;max_background_drainers=2;", + addr), + expectedOpts: []qdb.LineSenderOption{ + qdb.WithQwp(), + qdb.WithAddress(addr), + qdb.WithSfDir("/tmp/sf"), + qdb.WithSfDurability("memory"), + qdb.WithSfAppendDeadline(20 * time.Second), + qdb.WithDrainOrphans(true), + qdb.WithMaxBackgroundDrainers(2), + }, + }, } for _, tc := range testCases { @@ -643,14 +751,13 @@ func TestPathologicalCasesFromConf(t *testing.T) { expectedErrMsgContains: "in_flight_window is only supported for QWP senders", }, { - name: 
"close_timeout on TCP", + // close_timeout was a Go-only legacy key; the Java client + // never accepted it. Removed in favour of the spec- + // aligned close_flush_timeout_millis. The parser now + // rejects regardless of schema, with a migration hint. + name: "close_timeout rejected with migration hint", config: "tcp::addr=localhost:1111;close_timeout=1000;", - expectedErrMsgContains: "close_timeout is only supported for QWP senders", - }, - { - name: "max_schemas_per_connection on HTTP", - config: "http::addr=localhost:1111;max_schemas_per_connection=8;", - expectedErrMsgContains: "max_schemas_per_connection is only supported for QWP senders", + expectedErrMsgContains: "close_timeout is no longer supported", }, { name: "gorilla on TCP", @@ -682,6 +789,36 @@ func TestPathologicalCasesFromConf(t *testing.T) { config: "http::addr=localhost:9000;username=;", expectedErrMsgContains: "empty value for key", }, + { + name: "auth_timeout_ms on HTTP", + config: "http::addr=localhost:9000;auth_timeout_ms=5000;", + expectedErrMsgContains: "auth_timeout_ms is only supported for QWP senders", + }, + { + name: "zone on TCP", + config: "tcp::addr=localhost:9009;zone=eu-west-1a;", + expectedErrMsgContains: "zone is only supported for QWP senders", + }, + { + name: "target on HTTP", + config: "http::addr=localhost:9000;target=primary;", + expectedErrMsgContains: "target is only supported for QWP senders", + }, + { + name: "invalid target value", + config: "ws::addr=localhost:9000;target=foo;", + expectedErrMsgContains: "invalid target", + }, + { + name: "non-positive auth_timeout_ms", + config: "ws::addr=localhost:9000;auth_timeout_ms=0;", + expectedErrMsgContains: "auth_timeout_ms", + }, + { + name: "non-numeric auth_timeout_ms", + config: "ws::addr=localhost:9000;auth_timeout_ms=fast;", + expectedErrMsgContains: "auth_timeout_ms", + }, } for _, tc := range testCases { @@ -691,3 +828,196 @@ func TestPathologicalCasesFromConf(t *testing.T) { }) } } + +// 
TestQwpFailoverSanitizeErrors covers sanitizer-level rejections for +// failover-related config: multi-host on HTTP/TCP (not yet wired up +// in those transports) and malformed QWP endpoint lists. +func TestQwpFailoverSanitizeErrors(t *testing.T) { + cases := []struct { + name string + config string + errMsg string + }{ + { + name: "multi-host addr on HTTP", + config: "http::addr=localhost:9000,localhost:9001;", + errMsg: "multi-host addr is not supported for HTTP", + }, + { + name: "multi-host addr on HTTP via repeated keys", + config: "http::addr=localhost:9000;addr=localhost:9001;", + errMsg: "multi-host addr is not supported for HTTP", + }, + { + name: "multi-host addr on TCP", + config: "tcp::addr=localhost:9009,localhost:9010;", + errMsg: "multi-host addr is not supported for TCP", + }, + { + name: "trailing comma in addr", + config: "ws::addr=localhost:9000,;", + errMsg: "empty entry in addr list", + }, + { + name: "double comma in addr", + config: "ws::addr=h1:9000,,h2:9000;", + errMsg: "empty entry in addr list", + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + c, err := qdb.ConfFromStr(tc.config) + assert.NoError(t, err) + assert.ErrorContains(t, qdb.SanitizeConf(c), tc.errMsg) + }) + } +} + +// TestQwpSanitizeRejectsRetryTimeout pins that retry_timeout, though +// the parser accepts it for any schema, is rejected by the QWP +// sanitizer: it is an HTTP-ILP retry knob with no QWP analogue +// (reconnect_max_duration_millis governs the per-outage budget +// instead). Guards the parser happy-case in TestHappyCasesFromConf, +// which deliberately omits the ws+retry_timeout pairing. 
+func TestQwpSanitizeRejectsRetryTimeout(t *testing.T) { + c, err := qdb.ConfFromStr("ws::addr=localhost:9000;retry_timeout=5000;") + assert.NoError(t, err, "parser accepts retry_timeout for any schema") + assert.ErrorContains(t, qdb.SanitizeConf(c), + "retry_timeout is not supported for QWP") +} + +// TestQwpFailoverConfKeys covers the connect-string keys mandated by +// failover.md §1 (addr multi-host, auth_timeout_ms, zone, target). +// The keys are parsed but not yet consumed by the SF reconnect loop — +// these tests pin down the parser-and-sanitizer surface so the +// downstream wire-up phases can rely on it. +func TestQwpFailoverConfKeys(t *testing.T) { + parseSanitize := func(t *testing.T, conf string) *qdb.LineSenderConfig { + t.Helper() + c, err := qdb.ConfFromStr(conf) + assert.NoError(t, err) + assert.NoError(t, qdb.SanitizeConf(c)) + return c + } + + t.Run("single host populates endpoints[0]", func(t *testing.T) { + c := parseSanitize(t, "ws::addr=questdb.local:9000;") + assert.Equal(t, []string{"questdb.local:9000"}, qdb.ConfigEndpoints(c)) + }) + + t.Run("comma-form addr produces ordered endpoints", func(t *testing.T) { + c := parseSanitize(t, "ws::addr=h1:9000,h2:9000,h3:9000;") + assert.Equal(t, + []string{"h1:9000", "h2:9000", "h3:9000"}, + qdb.ConfigEndpoints(c)) + }) + + t.Run("repeated-key addr produces ordered endpoints", func(t *testing.T) { + c := parseSanitize(t, "ws::addr=h1:9000;addr=h2:9000;addr=h3:9000;") + assert.Equal(t, + []string{"h1:9000", "h2:9000", "h3:9000"}, + qdb.ConfigEndpoints(c)) + }) + + t.Run("missing port defaults to 9000", func(t *testing.T) { + c := parseSanitize(t, "ws::addr=h1,h2:9001,h3;") + assert.Equal(t, + []string{"h1:9000", "h2:9001", "h3:9000"}, + qdb.ConfigEndpoints(c)) + }) + + t.Run("IPv6 bracketed host", func(t *testing.T) { + c := parseSanitize(t, "ws::addr=[::1]:9000,[fe80::1]:9001;") + assert.Equal(t, + []string{"[::1]:9000", "[fe80::1]:9001"}, + qdb.ConfigEndpoints(c)) + }) + + t.Run("auth_timeout_ms 
default 15s", func(t *testing.T) { + c := parseSanitize(t, "ws::addr=localhost:9000;") + assert.Equal(t, 15_000, qdb.ConfigAuthTimeoutMs(c)) + }) + + t.Run("auth_timeout_ms explicit", func(t *testing.T) { + c := parseSanitize(t, "ws::addr=localhost:9000;auth_timeout_ms=5000;") + assert.Equal(t, 5_000, qdb.ConfigAuthTimeoutMs(c)) + }) + + t.Run("zone is silently accepted on QWP ingress", func(t *testing.T) { + c := parseSanitize(t, "ws::addr=localhost:9000;zone=eu-west-1a;") + assert.Equal(t, "eu-west-1a", qdb.ConfigZone(c)) + }) + + t.Run("target=any (default)", func(t *testing.T) { + c := parseSanitize(t, "ws::addr=localhost:9000;") + assert.Equal(t, "any", qdb.ConfigTarget(c)) + }) + + t.Run("target=primary", func(t *testing.T) { + c := parseSanitize(t, "ws::addr=localhost:9000;target=primary;") + assert.Equal(t, "primary", qdb.ConfigTarget(c)) + }) + + t.Run("target=replica", func(t *testing.T) { + c := parseSanitize(t, "ws::addr=localhost:9000;target=replica;") + assert.Equal(t, "replica", qdb.ConfigTarget(c)) + }) + + t.Run("wss tls-mode preserved with multi-host", func(t *testing.T) { + c := parseSanitize(t, "wss::addr=h1:9000,h2:9000;zone=dc-a;target=primary;auth_timeout_ms=8000;") + assert.Equal(t, []string{"h1:9000", "h2:9000"}, qdb.ConfigEndpoints(c)) + assert.Equal(t, "dc-a", qdb.ConfigZone(c)) + assert.Equal(t, "primary", qdb.ConfigTarget(c)) + assert.Equal(t, 8_000, qdb.ConfigAuthTimeoutMs(c)) + }) +} + +// TestQwpOnlyOptionsRejectedOnHttpAndTcp pins parity between the +// connect-string parser (which rejects each QWP-only key on http/tcp +// schemas with ` is only supported for QWP senders`) and the +// option path. Without this gate, e.g. `WithHttp() + WithSfDir(...)` +// silently constructs an HTTP sender that ignores the setting. 
+func TestQwpOnlyOptionsRejectedOnHttpAndTcp(t *testing.T) { + cases := []struct { + name string + opt qdb.LineSenderOption + errMsg string + }{ + {"sf_dir", qdb.WithSfDir("/tmp/sf"), "sf_dir"}, + {"sender_id", qdb.WithSenderId("ingest-1"), "sender_id"}, + {"sf_max_bytes", qdb.WithSfMaxBytes(1 << 20), "sf_max_bytes"}, + {"sf_max_total_bytes", qdb.WithSfMaxTotalBytes(1 << 30), "sf_max_total_bytes"}, + {"sf_durability", qdb.WithSfDurability("memory"), "sf_durability"}, + {"sf_append_deadline", qdb.WithSfAppendDeadline(10 * time.Second), "sf_append_deadline_millis"}, + {"drain_orphans", qdb.WithDrainOrphans(true), "drain_orphans"}, + {"max_background_drainers", qdb.WithMaxBackgroundDrainers(2), "max_background_drainers"}, + {"reconnect_policy", qdb.WithReconnectPolicy(time.Minute, 100*time.Millisecond, time.Second), "reconnect_*"}, + {"initial_connect_mode", qdb.WithInitialConnectMode(qdb.InitialConnectSync), "initial_connect_retry"}, + {"initial_connect_retry", qdb.WithInitialConnectRetry(true), "initial_connect_retry"}, + {"close_flush_timeout", qdb.WithCloseFlushTimeout(5 * time.Second), "close_flush_timeout_millis"}, + {"close_timeout_alias", qdb.WithCloseTimeout(5 * time.Second), "close_flush_timeout_millis"}, + {"gorilla", qdb.WithGorilla(false), "gorilla"}, + {"in_flight_window", qdb.WithInFlightWindow(8), "in_flight_window"}, + {"auth_timeout", qdb.WithAuthTimeout(5 * time.Second), "auth_timeout_ms"}, + {"zone", qdb.WithZone("eu-west-1a"), "zone"}, + {"target", qdb.WithTarget(qdb.QwpTargetPrimary), "target"}, + {"qwp_dump_writer", qdb.WithQwpDumpWriter(io.Discard), "QWP dump writer"}, + {"error_handler", qdb.WithErrorHandler(func(*qdb.SenderError) {}), "server-error API"}, + {"error_inbox_capacity", qdb.WithErrorInboxCapacity(64), "server-error API"}, + {"server_error_policy", qdb.WithServerErrorPolicy(qdb.PolicyHalt), "server-error API"}, + } + for _, transport := range []struct { + name string + ctor qdb.LineSenderOption + }{ + {"http", qdb.WithHttp()}, + 
{"tcp", qdb.WithTcp()}, + } { + for _, tc := range cases { + t.Run(transport.name+"/"+tc.name, func(t *testing.T) { + _, err := qdb.NewLineSender(context.Background(), transport.ctor, tc.opt) + assert.ErrorContains(t, err, tc.errMsg) + }) + } + } +} diff --git a/examples.manifest.yaml b/examples.manifest.yaml index 97f713c4..7aec4f97 100644 --- a/examples.manifest.yaml +++ b/examples.manifest.yaml @@ -37,3 +37,28 @@ Go client library [docs](https://pkg.go.dev/github.com/questdb/go-questdb-client/v4) and [repo](https://github.com/questdb/go-questdb-client). conf: http::addr=localhost:9000; +- name: qwp-ingest + lang: go + path: examples/qwp/basic/main.go + header: |- + Go client library [docs](https://pkg.go.dev/github.com/questdb/go-questdb-client/v4) + and [repo](https://github.com/questdb/go-questdb-client). + addr: + host: localhost + port: 9000 +- name: qwp-query + lang: go + path: examples/qwp/basic-query/main.go + header: |- + Go client library [docs](https://pkg.go.dev/github.com/questdb/go-questdb-client/v4) + and [repo](https://github.com/questdb/go-questdb-client). + addr: + host: localhost + port: 9000 +- name: qwp-sf + lang: go + path: examples/qwp/sf/main.go + header: |- + Go client library [docs](https://pkg.go.dev/github.com/questdb/go-questdb-client/v4) + and [repo](https://github.com/questdb/go-questdb-client). 
+ conf: ws::addr=localhost:9000; diff --git a/examples/qwp/basic-query/main.go b/examples/qwp/basic-query/main.go new file mode 100644 index 00000000..78fbb60f --- /dev/null +++ b/examples/qwp/basic-query/main.go @@ -0,0 +1,148 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ******************************************************************************/ +package main + +import ( + "context" + "fmt" + "log" + "strings" + "time" + + qdb "github.com/questdb/go-questdb-client/v4" +) + +const ( + tableName = "qwp_query_example" + rowCount = 1000 +) + +func main() { + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + client, err := qdb.NewQwpQueryClient(ctx, + qdb.WithQwpQueryAddress("localhost:9000"), + ) + if err != nil { + log.Fatalf("connect: %v", err) + } + defer func() { + if err := client.Close(ctx); err != nil { + log.Printf("close: %v", err) + } + }() + + if _, err := client.Exec(ctx, fmt.Sprintf("DROP TABLE IF EXISTS '%s'", tableName)); err != nil { + log.Fatalf("drop: %v", err) + } + createSQL := fmt.Sprintf( + "CREATE TABLE '%s' (ts TIMESTAMP, v LONG) TIMESTAMP(ts)", + tableName) + if _, err := client.Exec(ctx, createSQL); err != nil { + log.Fatalf("create: %v", err) + } + + insertSQL := buildBulkInsert(tableName, rowCount) + res, err := client.Exec(ctx, insertSQL) + if err != nil { + log.Fatalf("insert: %v", err) + } + fmt.Printf("inserted %d rows\n", res.RowsAffected) + + expected := expectedSum(rowCount) + fmt.Printf("expected sum: %d\n", expected) + fmt.Printf("per-row sum: %d\n", sumPerRow(ctx, client)) + fmt.Printf("bulk sum: %d\n", sumBulk(ctx, client)) +} + +// sumPerRow demonstrates the zero-allocation, per-row idiom. +// +// QwpColumn caches the column's layout pointer once per batch, so every +// Int64(r) call reads straight out of the QWP buffer — no intermediate +// slice. Best for ad-hoc consumers and when you also need per-row +// branching (null checks, mixed-column row builders). 
+func sumPerRow(ctx context.Context, client *qdb.QwpQueryClient) int64 { + q := client.Query(ctx, fmt.Sprintf("SELECT ts, v FROM '%s'", tableName)) + defer q.Close() + + var sum int64 + for batch, err := range q.Batches() { + if err != nil { + log.Fatalf("per-row query: %v", err) + } + vCol := batch.Column(1) // column 1 is `v` (LONG) + n := vCol.RowCount() + for r := 0; r < n; r++ { + sum += vCol.Int64(r) + } + } + return sum +} + +// sumBulk demonstrates the bulk-decode idiom for a tight column sweep. +// +// Int64Range decodes a row range into a caller-owned []int64 in one +// shot. On a no-null column it lowers to a single memmove out of the +// QWP buffer, after which the inner sum is a branch-free range loop the +// compiler can vectorize. Reuse the buffer across batches with [:0] — +// allocation happens once for the whole query. +func sumBulk(ctx context.Context, client *qdb.QwpQueryClient) int64 { + q := client.Query(ctx, fmt.Sprintf("SELECT ts, v FROM '%s'", tableName)) + defer q.Close() + + var ( + sum int64 + buf = make([]int64, 0, rowCount) + ) + for batch, err := range q.Batches() { + if err != nil { + log.Fatalf("bulk query: %v", err) + } + buf = batch.Column(1).Int64Range(0, batch.RowCount(), buf[:0]) + for _, v := range buf { + sum += v + } + } + return sum +} + +func buildBulkInsert(table string, n int) string { + base := time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC) + var sb strings.Builder + fmt.Fprintf(&sb, "INSERT INTO '%s' (ts, v) VALUES ", table) + for i := 0; i < n; i++ { + if i > 0 { + sb.WriteByte(',') + } + // QuestDB TIMESTAMP literals are microseconds since epoch. 
+ ts := base.Add(time.Duration(i) * time.Second).UnixMicro() + fmt.Fprintf(&sb, "(%d,%d)", ts, int64(i)) + } + return sb.String() +} + +func expectedSum(n int) int64 { + return int64(n) * int64(n-1) / 2 +} diff --git a/examples/qwp/basic/main.go b/examples/qwp/basic/main.go new file mode 100644 index 00000000..1fec1811 --- /dev/null +++ b/examples/qwp/basic/main.go @@ -0,0 +1,94 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +// Demonstrates the minimum correct QWP (WebSocket) ingestion idiom for a +// single-host application without failover. +// +// QWP ingestion is asynchronous: the error returned by At/AtNow/Flush is the +// local, latched error (bad value, buffer state, backpressure). Server-side +// rejections (schema mismatch, parse error, ...) arrive out of band on the +// SenderErrorHandler, NOT from the Flush that sent the data. Registering a +// handler is therefore part of the baseline idiom, not an advanced option. 
+package main + +import ( + "context" + "log" + "time" + + qdb "github.com/questdb/go-questdb-client/v4" +) + +func main() { + ctx := context.TODO() + + // WithQwp() selects the QWP binary protocol over a plain WebSocket + // (use qdb.WithTls() for wss). A LineSender is not safe for + // concurrent use: create one per goroutine. + sender, err := qdb.NewLineSender(ctx, + qdb.WithQwp(), + qdb.WithAddress("localhost:9000"), + qdb.WithErrorHandler(func(e *qdb.SenderError) { + // Dead-letter / alert here. This runs on a dedicated + // goroutine, never the producer goroutine. + log.Printf("server rejected fsn=[%d,%d] table=%s category=%s: %s", + e.FromFsn, e.ToFsn, e.TableName, e.Category, e.ServerMessage) + }), + ) + if err != nil { + log.Fatal(err) + } + defer func() { + // Close flushes and drains, but a failed close can mean + // unacked data was not delivered. Always check it. + if err := sender.Close(ctx); err != nil { + log.Fatal(err) + } + }() + + tradedTs, _ := time.Parse(time.RFC3339, "2022-08-06T15:04:05.123456Z") + for i := 0; i < 1000; i++ { + // Call order is fixed: Table, then Symbol(s), then columns, + // then At/AtNow. A latched fluent error surfaces here. + err := sender. + Table("trades"). + Symbol("symbol", "ETH-USD"). + Symbol("side", "sell"). + Float64Column("price", 2615.54). + Float64Column("amount", 0.00044). + At(ctx, tradedTs) + if err != nil { + log.Fatal(err) + } + } + + // Publish everything buffered so far. Flush returns once the batch + // is published to the cursor engine; it does NOT wait for the + // server ACK (rejections arrive on the handler above). Batch many + // rows per Flush rather than flushing per row. For server-ack + // confirmation, use FlushAndGetSequence paired with AwaitAckedFsn. 
+ if err := sender.Flush(ctx); err != nil { + log.Fatal(err) + } +} diff --git a/examples/qwp/sf/main.go b/examples/qwp/sf/main.go new file mode 100644 index 00000000..4d6f28cb --- /dev/null +++ b/examples/qwp/sf/main.go @@ -0,0 +1,93 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +// Demonstrates the QWP store-and-forward (SF) durability mode. +// Outgoing batches are persisted to mmap'd disk segments before they +// leave the wire; the I/O loop replays from disk transparently on +// reconnect or process restart. +package main + +import ( + "context" + "log" + "time" + + qdb "github.com/questdb/go-questdb-client/v4" +) + +func main() { + ctx := context.TODO() + + // sf_dir is the SF group root — one or more sender instances can + // share it, each living under <sf_dir>/<sender_id>/.
+ // sender_id : per-sender slot name (default "default") + // sf_max_bytes : per-segment file size (default 4 MiB) + // sf_max_total_bytes : disk cap for THIS sender's slot (default 10 GiB) + // close_flush_timeout_millis : how long Close() waits for ACKs + // before proceeding (default 5000; + // 0 / -1 → fast close, leave on disk) + // drain_orphans : opt in to draining sibling slots left behind + // by other senders that crashed + conf := "ws::addr=localhost:9000;" + + "sf_dir=/var/lib/questdb-sf;" + + "sender_id=trades-feed;" + + "sf_max_bytes=8388608;" + + "sf_max_total_bytes=1073741824;" + // 1 GiB + "close_flush_timeout_millis=5000;" + + "drain_orphans=on;" + sender, err := qdb.LineSenderFromConf(ctx, conf) + if err != nil { + log.Fatal(err) + } + defer func() { + // Close() drains the engine (waiting up to + // close_flush_timeout_millis for the server to ACK every + // frame) and releases the slot lock. Anything still on disk + // will be replayed by the next process to start with the + // same sf_dir + sender_id. + if err := sender.Close(ctx); err != nil { + log.Fatal(err) + } + }() + + tradedTs, _ := time.Parse(time.RFC3339, "2022-08-06T15:04:05.123456Z") + for i := 0; i < 1000; i++ { + err := sender. + Table("trades"). + Symbol("symbol", "ETH-USD"). + Symbol("side", "sell"). + Float64Column("price", 2615.54). + Float64Column("amount", 0.00044). + At(ctx, tradedTs) + if err != nil { + // In SF mode, At() can block briefly on disk-full + // backpressure when sf_max_total_bytes is reached and + // the wire path hasn't drained the cap. The error here + // surfaces the deadline expiry — investigate the wire + // path (server reachability, server slow, etc.) rather + // than retrying tighter. 
+ log.Fatal(err) + } + } +} diff --git a/export_test.go b/export_test.go index ab838899..ac2115bb 100644 --- a/export_test.go +++ b/export_test.go @@ -24,6 +24,8 @@ package questdb +import "github.com/coder/websocket" + type ( Buffer = buffer ConfigData = configData @@ -32,6 +34,20 @@ type ( SenderType = senderType ) +// QwpSfClassify exposes the internal status-byte → Category mapping +// for cross-language regression tests in the questdb_test package. +func QwpSfClassify(status QwpStatusCode) Category { return qwpSfClassify(status) } + +// QwpSfDefaultPolicyFor exposes the spec-default Category → Policy +// mapping for unit tests. +func QwpSfDefaultPolicyFor(c Category) Policy { return qwpSfDefaultPolicyFor(c) } + +// QwpSfIsTerminalCloseCode exposes the WS terminal close-code +// classifier for unit tests. +func QwpSfIsTerminalCloseCode(code websocket.StatusCode) bool { + return qwpSfIsTerminalCloseCode(code) +} + var ( GlobalTransport = globalTransport NoSenderType SenderType = noSenderType @@ -50,10 +66,30 @@ func ParseConfigStr(conf string) (configData, error) { return parseConfigStr(conf) } +// ConfFromStr parses a connect string into a *LineSenderConfig. The +// returned config has NOT been sanitized — call SanitizeConf for the +// post-sanitize shape (defaults applied, endpoints back-filled from +// address, transport-specific validation run). func ConfFromStr(conf string) (*LineSenderConfig, error) { return confFromStr(conf) } +// SanitizeConf dispatches to the per-transport sanitizer. Exposed for +// tests that need to apply post-parse defaults (e.g. authTimeoutMs, +// QWP endpoints) without going through the full newLineSender path +// (which would attempt a dial). 
+func SanitizeConf(c *LineSenderConfig) error { + switch c.senderType { + case tcpSenderType: + return sanitizeTcpConf(c) + case httpSenderType: + return sanitizeHttpConf(c) + case qwpSenderType: + return sanitizeQwpConf(c) + } + return nil +} + func Messages(s LineSender) []byte { if ps, ok := s.(*pooledSender); ok { s = ps.wrapped @@ -174,6 +210,33 @@ func NewLineSenderConfig(t SenderType) *LineSenderConfig { return newLineSenderConfig(t) } +// ConfigEndpoints returns the multi-host failover list parsed by +// sanitizeQwpConf. Each entry is rendered as host:port (IPv6 hosts +// are bracketed) so tests can compare against literals. Returns nil +// for non-QWP senders. +func ConfigEndpoints(c *LineSenderConfig) []string { + if c == nil || len(c.endpoints) == 0 { + return nil + } + out := make([]string, len(c.endpoints)) + for i, e := range c.endpoints { + out[i] = e.String() + } + return out +} + +// ConfigAuthTimeoutMs returns the effective auth_timeout_ms after +// sanitization (default 15000). +func ConfigAuthTimeoutMs(c *LineSenderConfig) int { return c.authTimeoutMs } + +// ConfigZone returns the parsed zone= value (silently stored but +// unused on SF ingress). +func ConfigZone(c *LineSenderConfig) string { return c.zone } + +// ConfigTarget returns the parsed target= value as a string +// (any/primary/replica). 
+func ConfigTarget(c *LineSenderConfig) string { return c.target.String() } + func SetLittleEndian(littleEndian bool) { isLittleEndian = littleEndian } diff --git a/go.mod b/go.mod index 114fe2a7..82ea1d4c 100644 --- a/go.mod +++ b/go.mod @@ -2,11 +2,12 @@ module github.com/questdb/go-questdb-client/v4 go 1.23 -toolchain go1.24.4 - require ( + github.com/coder/websocket v1.8.14 + github.com/klauspost/compress v1.18.4 github.com/stretchr/testify v1.9.0 github.com/testcontainers/testcontainers-go v0.26.0 + golang.org/x/sys v0.16.0 ) require ( @@ -15,7 +16,6 @@ require ( github.com/Microsoft/go-winio v0.6.1 // indirect github.com/Microsoft/hcsshim v0.11.4 // indirect github.com/cenkalti/backoff/v4 v4.2.1 // indirect - github.com/coder/websocket v1.8.14 // indirect github.com/containerd/containerd v1.7.12 // indirect github.com/containerd/log v0.1.0 // indirect github.com/cpuguy83/dockercfg v0.3.1 // indirect @@ -28,7 +28,6 @@ require ( github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/protobuf v1.5.3 // indirect github.com/google/uuid v1.6.0 // indirect - github.com/klauspost/compress v1.17.0 // indirect github.com/lufia/plan9stats v0.0.0-20230326075908-cb1d2100619a // indirect github.com/magiconair/properties v1.8.7 // indirect github.com/moby/patternmatcher v0.6.0 // indirect @@ -49,7 +48,6 @@ require ( github.com/yusufpapurcu/wmi v1.2.3 // indirect golang.org/x/exp v0.0.0-20231005195138-3e424a577f31 // indirect golang.org/x/mod v0.13.0 // indirect - golang.org/x/sys v0.16.0 // indirect golang.org/x/tools v0.14.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20231002182017-d307bd883b97 // indirect google.golang.org/grpc v1.58.3 // indirect diff --git a/go.sum b/go.sum index 25e41aa7..27375ab0 100644 --- a/go.sum +++ b/go.sum @@ -1,6 +1,7 @@ dario.cat/mergo v1.0.0 h1:AGCNq9Evsj31mOgNPcLyXc+4PNABt905YmuqPYYpBWk= dario.cat/mergo v1.0.0/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= github.com/AdaLogics/go-fuzz-headers 
v0.0.0-20230811130428-ced1acdcaa24 h1:bvDV9vkmnHYOMsOr4WLk+Vo07yKIzd94sVoIqshQ4bU= +github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24/go.mod h1:8o94RPi1/7XTJvwPpRSzSUedZrtlirdB3r9Z20bi2f8= github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0= github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= @@ -24,6 +25,7 @@ github.com/cpuguy83/dockercfg v0.3.1 h1:/FpZ+JaygUR/lZP2NlFI2DVfrOEMAIKP5wWEJdoY github.com/cpuguy83/dockercfg v0.3.1/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHfjj5/jFyUJc= github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= +github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= github.com/cyphar/filepath-securejoin v0.2.3/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= @@ -58,10 +60,11 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/klauspost/compress v1.17.0 h1:Rnbp4K9EjcDuVuHtd0dgA4qNuv9yKDYKK1ulpJwgrqM= -github.com/klauspost/compress v1.17.0/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= +github.com/klauspost/compress v1.18.4 h1:RPhnKRAQ4Fh8zU2FY/6ZFDwTVTxgJ/EMydqSTzE9a2c= +github.com/klauspost/compress v1.18.4/go.mod 
h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= +github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= @@ -96,6 +99,7 @@ github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:Om github.com/power-devops/perfstat v0.0.0-20221212215047-62379fc7944b h1:0LFwY6Q3gMACTjAbMZBjXAqTOzOwFaj2Ld6cjeQ7Rig= github.com/power-devops/perfstat v0.0.0-20221212215047-62379fc7944b/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= github.com/rogpeppe/go-internal v1.8.1 h1:geMPLpDpQOgVyCg5z5GoRwLHepNdb71NXb67XFkP+Eg= +github.com/rogpeppe/go-internal v1.8.1/go.mod h1:JeRgkft04UBgHMgCIwADu4Pn6Mtm5d4nPKWu0nJ5d+o= github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= github.com/shirou/gopsutil/v3 v3.23.12 h1:z90NtUkp3bMtmICZKpC4+WaknU1eXtp5vtbQ11DgpE4= @@ -147,10 +151,12 @@ golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20201224014010-6772e930b67b/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= +golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod 
h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.4.0 h1:zxkM55ReGkDlKSM+Fu41A+zmbZuaPVbGMzvvdUPznYQ= +golang.org/x/sync v0.4.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190606203320-7fc4e5ec1444/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -176,7 +182,9 @@ golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9sn golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= +golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= @@ -198,8 +206,10 @@ google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod 
h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gotest.tools/v3 v3.5.0 h1:Ljk6PdHdOhAb5aDMWXjDLMMhph+BpztA4v1QdqEW2eY= +gotest.tools/v3 v3.5.0/go.mod h1:isy3WKz7GK6uNw/sbHzfKBLvlvXwUyV06n6brMxxopU= diff --git a/http_sender_test.go b/http_sender_test.go index 16f2993e..bd15fa94 100644 --- a/http_sender_test.go +++ b/http_sender_test.go @@ -130,14 +130,17 @@ func TestHttpPathologicalCasesFromConf(t *testing.T) { expectedErr: "both basic and token", }, { + // Size-typed keys now go through parseSizeBytes, which + // rejects negatives at parse time with a more specific + // message than the old validateConf "is negative" check. name: "negative init_buf_size", config: "http::init_buf_size=-1;", - expectedErr: "initial buffer size is negative", + expectedErr: "must be non-negative", }, { name: "negative max_buf_size", config: "http::max_buf_size=-1;", - expectedErr: "max buffer size is negative", + expectedErr: "must be non-negative", }, { name: "negative retry timeout", @@ -164,6 +167,11 @@ func TestHttpPathologicalCasesFromConf(t *testing.T) { config: "http::auto_flush_interval=-1;", expectedErr: "auto flush interval is negative", }, + { + name: "max_name_len below minimum", + config: "http::max_name_len=15;", + expectedErr: "max_name_len must be at least 16 bytes", + }, { name: "schema is case-sensitive", config: "hTtp::addr=localhost:1234;", @@ -194,9 +202,11 @@ func TestHttpPathologicalCasesFromEnv(t *testing.T) { expectedErr: "both basic and token", }, { + // See TestHttpPathologicalCasesFromConf above — size-typed + // keys now error at parse time with a different message. 
name: "negative max_buf_size", config: "http::max_buf_size=-1;", - expectedErr: "max buffer size is negative", + expectedErr: "must be non-negative", }, { name: "schema is case-sensitive", @@ -340,6 +350,7 @@ func TestRetryOn500(t *testing.T) { ctx, qdb.WithHttp(), qdb.WithAddress(srv.Addr()), + qdb.WithProtocolVersion(qdb.ProtocolVersion1), qdb.WithRequestTimeout(10*time.Millisecond), qdb.WithRetryTimeout(50*time.Millisecond), ) @@ -367,6 +378,7 @@ func TestNoRetryOn400FromProxy(t *testing.T) { ctx, qdb.WithHttp(), qdb.WithAddress(srv.Addr()), + qdb.WithProtocolVersion(qdb.ProtocolVersion1), qdb.WithRequestTimeout(10*time.Millisecond), qdb.WithRetryTimeout(50*time.Millisecond), ) @@ -394,6 +406,7 @@ func TestNoRetryOn400FromServer(t *testing.T) { ctx, qdb.WithHttp(), qdb.WithAddress(srv.Addr()), + qdb.WithProtocolVersion(qdb.ProtocolVersion1), qdb.WithRequestTimeout(10*time.Millisecond), qdb.WithRetryTimeout(50*time.Millisecond), ) diff --git a/qwp_bench_test.go b/qwp_bench_test.go index 9dec38ff..ea380d5a 100644 --- a/qwp_bench_test.go +++ b/qwp_bench_test.go @@ -73,12 +73,11 @@ func BenchmarkQwpEncode(b *testing.B) { } symList := []string{"s0", "s1", "s2", "s3", "s4"} - const schemaId = 0 var enc qwpEncoder b.ResetTimer() for i := 0; i < b.N; i++ { - enc.encodeTableWithDeltaDict(tb, symList, -1, 4, qwpSchemaModeFull, schemaId) + enc.encodeTableWithDeltaDict(tb, symList, -1, 4) } } @@ -127,7 +126,7 @@ func BenchmarkQwpFlush(b *testing.B) { tb.commitRow() } - enc.encodeTableWithDeltaDict(tb, symList, -1, 2, qwpSchemaModeFull, 0) + enc.encodeTableWithDeltaDict(tb, symList, -1, 2) tb.reset() } } @@ -144,9 +143,6 @@ func qwpSteadyStateSetup() (*qwpLineSender, func()) { globalSymbols: make(map[string]int32), maxSentSymbolId: -1, batchMaxSymbolId: -1, - nextSchemaId: 0, - maxSentSchemaId: -1, - batchMaxSchemaId: -1, } s.globalSymbols["AAPL"] = 0 @@ -165,15 +161,12 @@ func qwpSteadyStateSetup() (*qwpLineSender, func()) { } } tables, _ := s.buildTableEncodeInfo() - 
s.encoders[0].encodeMultiTableWithDeltaDict( + s.encoder.encodeMultiTableWithDeltaDict( tables, s.globalSymbolList, s.maxSentSymbolId, s.batchMaxSymbolId, ) - if s.batchMaxSchemaId > s.maxSentSchemaId { - s.maxSentSchemaId = s.batchMaxSchemaId - } s.resetAfterFlush() } @@ -199,8 +192,14 @@ func BenchmarkQwpSenderSteadyState(b *testing.B) { // TestQwpSenderSteadyStateZeroAllocs pins the 0-allocs/op // invariant programmatically so the invariant survives refactors without a -// developer having to read the benchmark output. +// developer having to read the benchmark output. Only meaningful for +// non-race builds: race instrumentation forces some stack-allocatable +// values to escape and inflates allocs/op (see +// TestQwpSenderSteadyStateNullsZeroAllocs for the variant that trips on this). func TestQwpSenderSteadyStateZeroAllocs(t *testing.T) { + if raceEnabled { + t.Skip("zero-alloc invariant does not hold under -race") + } _, iter := qwpSteadyStateSetup() if allocs := testing.AllocsPerRun(100, iter); allocs > 0 { t.Fatalf("steady-state allocs/op = %g, want 0", allocs) @@ -221,9 +220,6 @@ func qwpSteadyStateSetupWithNulls() (*qwpLineSender, func()) { globalSymbols: make(map[string]int32), maxSentSymbolId: -1, batchMaxSymbolId: -1, - nextSchemaId: 0, - maxSentSchemaId: -1, - batchMaxSchemaId: -1, } s.globalSymbols["AAPL"] = 0 @@ -246,15 +242,12 @@ func qwpSteadyStateSetupWithNulls() (*qwpLineSender, func()) { } } tables, _ := s.buildTableEncodeInfo() - s.encoders[0].encodeMultiTableWithDeltaDict( + s.encoder.encodeMultiTableWithDeltaDict( tables, s.globalSymbolList, s.maxSentSymbolId, s.batchMaxSymbolId, ) - if s.batchMaxSchemaId > s.maxSentSchemaId { - s.maxSentSchemaId = s.batchMaxSchemaId - } s.resetAfterFlush() } @@ -277,14 +270,100 @@ func BenchmarkQwpSenderSteadyStateNulls(b *testing.B) { } // TestQwpSenderSteadyStateNullsZeroAllocs pins the 0-allocs/op -// invariant for the null-mix variant. +// invariant for the null-mix variant.
See sibling test for the -race +// caveat. func TestQwpSenderSteadyStateNullsZeroAllocs(t *testing.T) { + if raceEnabled { + t.Skip("zero-alloc invariant does not hold under -race") + } _, iter := qwpSteadyStateSetupWithNulls() if allocs := testing.AllocsPerRun(100, iter); allocs > 0 { t.Fatalf("steady-state-nulls allocs/op = %g, want 0", allocs) } } +// qwpSteadyStateSetupMixedCase mirrors qwpSteadyStateSetupWithNulls but +// gives every column a mixed-case name. The column index is keyed by the +// lowercase name, so each cursor miss — which the sparse null pattern +// guarantees — reaches the map lookup. That lookup stays allocation-free +// for mixed-case writers via the ASCII-fold cursor compare, the cursor +// resync on a map hit, and the memoized casing-variant alias keys. +// Without them strings.ToLower allocates a fresh lowercase key on every +// cursor-miss column, every row. +func qwpSteadyStateSetupMixedCase() (*qwpLineSender, func()) { + ctx := context.Background() + ts := time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC) + + s := &qwpLineSender{ + tableBuffers: make(map[string]*qwpTableBuffer), + globalSymbols: make(map[string]int32), + maxSentSymbolId: -1, + batchMaxSymbolId: -1, + } + + s.globalSymbols["AAPL"] = 0 + s.globalSymbolList = append(s.globalSymbolList, "AAPL") + s.batchMaxSymbolId = 0 + + iter := func() { + for r := 0; r < 10; r++ { + b := s.Table("t").Symbol("Sym", "AAPL") + if r%3 != 0 { + b = b.Int64Column("Qty", int64(100+r)) + } + b = b.Float64Column("Price", 150.5+float64(r)) + if r%2 == 0 { + b = b.StringColumn("Note", "test") + } + b = b.BoolColumn("Active", r%2 == 0) + if err := b.At(ctx, ts.Add(time.Duration(r)*time.Microsecond)); err != nil { + panic(err) + } + } + tables, _ := s.buildTableEncodeInfo() + s.encoder.encodeMultiTableWithDeltaDict( + tables, + s.globalSymbolList, + s.maxSentSymbolId, + s.batchMaxSymbolId, + ) + s.resetAfterFlush() + } + + iter() + iter() + return s, iter +} + +// BenchmarkQwpSenderSteadyStateMixedCase 
is the mixed-case counterpart +// of BenchmarkQwpSenderSteadyStateNulls: the same sparse null pattern, +// but the column names carry uppercase letters so the run exercises the +// case-fold lookup path. +func BenchmarkQwpSenderSteadyStateMixedCase(b *testing.B) { + _, iter := qwpSteadyStateSetupMixedCase() + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + iter() + } +} + +// TestQwpSenderSteadyStateMixedCaseZeroAllocs pins the 0-allocs/op +// invariant for mixed-case column names. The all-lowercase steady-state +// pins do not cover it: strings.ToLower returns its input unchanged for +// a lowercase name, so only an uppercase letter exposes a per-column key +// allocation on the cursor-miss path. See sibling tests for the -race +// caveat. +func TestQwpSenderSteadyStateMixedCaseZeroAllocs(t *testing.T) { + if raceEnabled { + t.Skip("zero-alloc invariant does not hold under -race") + } + _, iter := qwpSteadyStateSetupMixedCase() + if allocs := testing.AllocsPerRun(100, iter); allocs > 0 { + t.Fatalf("steady-state-mixed-case allocs/op = %g, want 0", allocs) + } +} + // BenchmarkQwpColumnAdd measures per-column add throughput. func BenchmarkQwpColumnAdd(b *testing.B) { b.Run("Long", func(b *testing.B) { @@ -327,3 +406,65 @@ func BenchmarkQwpColumnAdd(b *testing.B) { } }) } + +// BenchmarkQwpGorillaDecode measures Gorilla DoD decoding throughput +// over a long timestamp column. The bit reader's hot loop issues up to +// four single-bit prefix reads plus one wide signed read per row, so +// this is the regression gate for the 8-byte LE refill optimisation in +// qwpBitReader.readBits / readBitsSlow. 
+func BenchmarkQwpGorillaDecode(b *testing.B) { + const n = 4096 + mk := func(stepFn func(i int) int64) ([]byte, int64, int64) { + ts := make([]int64, n) + var cur int64 + for i := range ts { + cur += stepFn(i) + ts[i] = cur + } + var wb qwpWireBuffer + var enc qwpGorillaEncoder + enc.encodeTimestamps(&wb, intsToBytes(ts), n) + // Strip the 16-byte uncompressed prefix the bit reader doesn't + // touch — the decoder's reset() takes only the bit-packed tail. + return append([]byte(nil), wb.bytes()[16:]...), ts[0], ts[1] + } + + constantData, constantTs0, constantTs1 := mk(func(int) int64 { return 1000 }) + smallData, smallTs0, smallTs1 := mk(func(i int) int64 { + // Most DoDs land in the 1- or 9-bit bucket. + return 1000 + int64((i*37)%5) - 2 + }) + wideData, wideTs0, wideTs1 := mk(func(i int) int64 { + // Forces the 32-bit bucket via large alternating jumps. + if i%2 == 0 { + return 1_000_000 + } + return 1 + }) + + cases := []struct { + name string + data []byte + ts0 int64 + ts1 int64 + }{ + {"ConstantDelta", constantData, constantTs0, constantTs1}, + {"SmallJitter", smallData, smallTs0, smallTs1}, + {"WideJitter", wideData, wideTs0, wideTs1}, + } + + for _, c := range cases { + b.Run(c.name, func(b *testing.B) { + var dec qwpGorillaDecoder + b.ResetTimer() + for i := 0; i < b.N; i++ { + dec.reset(c.ts0, c.ts1, c.data) + for j := 2; j < n; j++ { + if _, err := dec.decodeNext(); err != nil { + b.Fatalf("decodeNext[%d]: %v", j, err) + } + } + } + }) + } +} diff --git a/qwp_bind_values.go b/qwp_bind_values.go new file mode 100644 index 00000000..1c86a6a4 --- /dev/null +++ b/qwp_bind_values.go @@ -0,0 +1,635 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed 
under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "encoding/binary" + "fmt" + "math" +) + +// QwpBinds is a typed bind-parameter sink for a single QWP egress query. +// Encodes the per-bind wire layout directly into a reusable scratch +// buffer: +// +// type_code(1B) | null_flag(1B) | [bitmap(1B) if null_flag != 0] | [value bytes if !null] +// +// Non-null: type | 0x00 | value. Null: type | 0x01 | 0x01 (no value bytes). +// +// Indexes must be assigned in strictly ascending order starting at 0. +// The sink tracks the next expected index and latches an error on gaps +// or duplicates; the latched error is surfaced from Query / Exec instead +// of submitting the query. +// +// SQL parameter placeholders are 1-based ($1, $2, ...); indexes here +// are 0-based and map to $(index + 1). +// +// Not safe for concurrent use. One instance per QwpQueryClient is +// reused across calls; the client resets it before invoking the user- +// supplied bind function. +type QwpBinds struct { + buf []byte + count int + expectedNextIndex int + // err latches the first encoding failure; subsequent bind calls + // become no-ops so the caller can write a straight-line setter + // and surface the error from Query / Exec. Matches the ILP / QWP + // sender lastErr pattern. + err error +} + +// Bind header bytes (matches Java QwpBindValues wire layout). 
+const ( + qwpBindNonNullFlag byte = 0x00 + qwpBindNullFlag byte = 0x01 + qwpBindNullBitmap byte = 0x01 + // qwpGeohashMinBits matches QuestDB's ColumnType.GEOLONG_MIN_BITS + // check on the server (rejects precision 0). + qwpGeohashMinBits = 1 + // qwpGeohashMaxBits matches ColumnType.GEOLONG_MAX_BITS. + qwpGeohashMaxBits = 60 +) + +// Per-width DECIMAL scale caps. Mirrors Java QwpBindValues constants +// DECIMAL64_MAX_SCALE / DECIMAL128_MAX_SCALE / DECIMAL256_MAX_SCALE. +// The server only enforces scale <= maxDecimalScale (76) regardless of +// width; the per-width caps are a client-side preflight that surfaces +// "scale exceeds the type's representable digits" as a typed error +// before bytes leave the process. +const ( + qwpDecimal64MaxScale = 18 + qwpDecimal128MaxScale = 38 + qwpDecimal256MaxScale = 76 +) + +// Err returns the first latched bind-encoding error, or nil. Exposed for +// tests; the client checks this directly before submitting. +func (b *QwpBinds) Err() error { return b.err } + +// Count returns the number of binds encoded since the last reset. +func (b *QwpBinds) Count() int { return b.count } + +// bufferBytes returns the encoded payload. Consumed by the client to +// copy the bytes into a per-request slice before handoff to the I/O +// goroutine; not part of the public API. +func (b *QwpBinds) bufferBytes() []byte { return b.buf } + +// reset clears prior state so this instance can accumulate binds for +// a new query. Called by QwpQueryClient before every submit. +func (b *QwpBinds) reset() { + b.buf = b.buf[:0] + b.count = 0 + b.expectedNextIndex = 0 + b.err = nil +} + +// advance validates the index and bumps the counters. Returns false +// (and latches the error) on out-of-order / duplicate index or on +// exceeding the per-query bind cap. 
+func (b *QwpBinds) advance(index int) bool { + if b.err != nil { + return false + } + if index != b.expectedNextIndex { + b.err = fmt.Errorf( + "qwp bind: index out of order: expected %d, got %d", + b.expectedNextIndex, index) + return false + } + if b.count >= qwpMaxBindsPerQuery { + b.err = fmt.Errorf( + "qwp bind: too many binds: exceeds %d", qwpMaxBindsPerQuery) + return false + } + b.expectedNextIndex++ + b.count++ + return true +} + +// writeHeader appends the type code, null flag, and (when isNull) the +// null bitmap byte. +func (b *QwpBinds) writeHeader(t qwpTypeCode, isNull bool) { + b.buf = append(b.buf, byte(t)) + if isNull { + b.buf = append(b.buf, qwpBindNullFlag, qwpBindNullBitmap) + } else { + b.buf = append(b.buf, qwpBindNonNullFlag) + } +} + +func (b *QwpBinds) appendUint16LE(v uint16) { + b.buf = binary.LittleEndian.AppendUint16(b.buf, v) +} + +func (b *QwpBinds) appendUint32LE(v uint32) { + b.buf = binary.LittleEndian.AppendUint32(b.buf, v) +} + +func (b *QwpBinds) appendUint64LE(v uint64) { + b.buf = binary.LittleEndian.AppendUint64(b.buf, v) +} + +func (b *QwpBinds) appendVarint(v uint64) { + var tmp [qwpMaxVarintLen]byte + n := qwpPutVarint(tmp[:], v) + b.buf = append(b.buf, tmp[:n]...) +} + +// BooleanBind binds a BOOLEAN ($(index+1)) parameter. +func (b *QwpBinds) BooleanBind(index int, value bool) *QwpBinds { + if !b.advance(index) { + return b + } + b.writeHeader(qwpTypeBoolean, false) + if value { + b.buf = append(b.buf, 1) + } else { + b.buf = append(b.buf, 0) + } + return b +} + +// NullBooleanBind binds a NULL BOOLEAN parameter. +func (b *QwpBinds) NullBooleanBind(index int) *QwpBinds { return b.setNull(index, qwpTypeBoolean) } + +// ByteBind binds a BYTE (int8) parameter. +func (b *QwpBinds) ByteBind(index int, value int8) *QwpBinds { + if !b.advance(index) { + return b + } + b.writeHeader(qwpTypeByte, false) + b.buf = append(b.buf, byte(value)) + return b +} + +// NullByteBind binds a NULL BYTE parameter. 
+func (b *QwpBinds) NullByteBind(index int) *QwpBinds { return b.setNull(index, qwpTypeByte) } + +// ShortBind binds a SHORT (int16) parameter. +func (b *QwpBinds) ShortBind(index int, value int16) *QwpBinds { + if !b.advance(index) { + return b + } + b.writeHeader(qwpTypeShort, false) + b.appendUint16LE(uint16(value)) + return b +} + +// NullShortBind binds a NULL SHORT parameter. +func (b *QwpBinds) NullShortBind(index int) *QwpBinds { return b.setNull(index, qwpTypeShort) } + +// CharBind binds a CHAR parameter stored as a UTF-16 code unit. +// Runes outside the BMP (> U+FFFF) are rejected — QuestDB's CHAR is a +// single UTF-16 code unit, matching Java char semantics. +func (b *QwpBinds) CharBind(index int, value rune) *QwpBinds { + if b.err != nil { + return b + } + if value < 0 || value > 0xFFFF { + b.err = fmt.Errorf("qwp bind: CHAR rune %U does not fit in a UTF-16 code unit", value) + return b + } + if !b.advance(index) { + return b + } + b.writeHeader(qwpTypeChar, false) + b.appendUint16LE(uint16(value)) + return b +} + +// NullCharBind binds a NULL CHAR parameter. +func (b *QwpBinds) NullCharBind(index int) *QwpBinds { return b.setNull(index, qwpTypeChar) } + +// IntBind binds an INT (int32) parameter. +func (b *QwpBinds) IntBind(index int, value int32) *QwpBinds { + if !b.advance(index) { + return b + } + b.writeHeader(qwpTypeInt, false) + b.appendUint32LE(uint32(value)) + return b +} + +// NullIntBind binds a NULL INT parameter. +func (b *QwpBinds) NullIntBind(index int) *QwpBinds { return b.setNull(index, qwpTypeInt) } + +// LongBind binds a LONG (int64) parameter. +func (b *QwpBinds) LongBind(index int, value int64) *QwpBinds { + if !b.advance(index) { + return b + } + b.writeHeader(qwpTypeLong, false) + b.appendUint64LE(uint64(value)) + return b +} + +// NullLongBind binds a NULL LONG parameter. +func (b *QwpBinds) NullLongBind(index int) *QwpBinds { return b.setNull(index, qwpTypeLong) } + +// FloatBind binds a FLOAT (float32) parameter. 
+func (b *QwpBinds) FloatBind(index int, value float32) *QwpBinds { + if !b.advance(index) { + return b + } + b.writeHeader(qwpTypeFloat, false) + b.appendUint32LE(math.Float32bits(value)) + return b +} + +// NullFloatBind binds a NULL FLOAT parameter. +func (b *QwpBinds) NullFloatBind(index int) *QwpBinds { return b.setNull(index, qwpTypeFloat) } + +// DoubleBind binds a DOUBLE (float64) parameter. +func (b *QwpBinds) DoubleBind(index int, value float64) *QwpBinds { + if !b.advance(index) { + return b + } + b.writeHeader(qwpTypeDouble, false) + b.appendUint64LE(math.Float64bits(value)) + return b +} + +// NullDoubleBind binds a NULL DOUBLE parameter. +func (b *QwpBinds) NullDoubleBind(index int) *QwpBinds { return b.setNull(index, qwpTypeDouble) } + +// DateBind binds a DATE parameter (milliseconds since epoch). +func (b *QwpBinds) DateBind(index int, millisSinceEpoch int64) *QwpBinds { + if !b.advance(index) { + return b + } + b.writeHeader(qwpTypeDate, false) + b.appendUint64LE(uint64(millisSinceEpoch)) + return b +} + +// NullDateBind binds a NULL DATE parameter. +func (b *QwpBinds) NullDateBind(index int) *QwpBinds { return b.setNull(index, qwpTypeDate) } + +// TimestampMicrosBind binds a TIMESTAMP parameter (microseconds since epoch). +func (b *QwpBinds) TimestampMicrosBind(index int, microsSinceEpoch int64) *QwpBinds { + if !b.advance(index) { + return b + } + b.writeHeader(qwpTypeTimestamp, false) + b.appendUint64LE(uint64(microsSinceEpoch)) + return b +} + +// NullTimestampMicrosBind binds a NULL TIMESTAMP parameter. +func (b *QwpBinds) NullTimestampMicrosBind(index int) *QwpBinds { + return b.setNull(index, qwpTypeTimestamp) +} + +// TimestampNanosBind binds a TIMESTAMP_NANOS parameter (nanoseconds since epoch). 
+func (b *QwpBinds) TimestampNanosBind(index int, nanosSinceEpoch int64) *QwpBinds { + if !b.advance(index) { + return b + } + b.writeHeader(qwpTypeTimestampNano, false) + b.appendUint64LE(uint64(nanosSinceEpoch)) + return b +} + +// NullTimestampNanosBind binds a NULL TIMESTAMP_NANOS parameter. +func (b *QwpBinds) NullTimestampNanosBind(index int) *QwpBinds { + return b.setNull(index, qwpTypeTimestampNano) +} + +// UuidBind binds a UUID parameter from high and low 64-bit halves. +// Wire order matches QuestDB's UUID layout: lo first, then hi. +func (b *QwpBinds) UuidBind(index int, hi, lo uint64) *QwpBinds { + if !b.advance(index) { + return b + } + b.writeHeader(qwpTypeUuid, false) + b.appendUint64LE(lo) + b.appendUint64LE(hi) + return b +} + +// NullUuidBind binds a NULL UUID parameter. +func (b *QwpBinds) NullUuidBind(index int) *QwpBinds { return b.setNull(index, qwpTypeUuid) } + +// Long256Bind binds a LONG256 parameter from four 64-bit limbs in LE order. +func (b *QwpBinds) Long256Bind(index int, l0, l1, l2, l3 uint64) *QwpBinds { + if !b.advance(index) { + return b + } + b.writeHeader(qwpTypeLong256, false) + b.appendUint64LE(l0) + b.appendUint64LE(l1) + b.appendUint64LE(l2) + b.appendUint64LE(l3) + return b +} + +// NullLong256Bind binds a NULL LONG256 parameter. +func (b *QwpBinds) NullLong256Bind(index int) *QwpBinds { return b.setNull(index, qwpTypeLong256) } + +// GeohashBind binds a GEOHASH parameter with the given precision in +// bits (1..60) and packed value. The low ceil(precisionBits/8) bytes of +// value are written little-endian on the wire. +// +// value is masked to precisionBits before encoding, so bits above the +// declared precision cannot leak into the top wire byte (which would +// otherwise pass through when precisionBits is not a multiple of 8). 
+func (b *QwpBinds) GeohashBind(index int, value uint64, precisionBits int) *QwpBinds { + if b.err != nil { + return b + } + if precisionBits < qwpGeohashMinBits || precisionBits > qwpGeohashMaxBits { + b.err = fmt.Errorf( + "qwp bind: GEOHASH precision must be in [%d, %d], got %d", + qwpGeohashMinBits, qwpGeohashMaxBits, precisionBits) + return b + } + if !b.advance(index) { + return b + } + b.writeHeader(qwpTypeGeohash, false) + b.appendVarint(uint64(precisionBits)) + if precisionBits < 64 { + value &= (uint64(1) << precisionBits) - 1 + } + byteCount := (precisionBits + 7) >> 3 + for i := 0; i < byteCount; i++ { + b.buf = append(b.buf, byte(value>>(i*8))) + } + return b +} + +// NullGeohashBind binds a NULL GEOHASH parameter with the minimum +// precision (1 bit). The server reads the precision_bits varint +// regardless of null, so a precision must be present on the wire even +// for null. Use NullGeohashBindWithPrecision for explicit control. +func (b *QwpBinds) NullGeohashBind(index int) *QwpBinds { + return b.NullGeohashBindWithPrecision(index, qwpGeohashMinBits) +} + +// NullGeohashBindWithPrecision binds a NULL GEOHASH parameter with the +// given precision. Mirrors Java's setNullGeohash. +func (b *QwpBinds) NullGeohashBindWithPrecision(index int, precisionBits int) *QwpBinds { + if b.err != nil { + return b + } + if precisionBits < qwpGeohashMinBits || precisionBits > qwpGeohashMaxBits { + b.err = fmt.Errorf( + "qwp bind: GEOHASH precision must be in [%d, %d], got %d", + qwpGeohashMinBits, qwpGeohashMaxBits, precisionBits) + return b + } + if !b.advance(index) { + return b + } + b.writeHeader(qwpTypeGeohash, true) + b.appendVarint(uint64(precisionBits)) + return b +} + +// VarcharBind binds a VARCHAR parameter. Wire encoding is: +// offset0(u32 LE = 0) | length_bytes(u32 LE) | UTF-8 bytes. 
+func (b *QwpBinds) VarcharBind(index int, value string) *QwpBinds { + if !b.advance(index) { + return b + } + b.writeHeader(qwpTypeVarchar, false) + b.appendUint32LE(0) + b.appendUint32LE(uint32(len(value))) + b.buf = append(b.buf, value...) + return b +} + +// NullVarcharBind binds a NULL VARCHAR parameter. +func (b *QwpBinds) NullVarcharBind(index int) *QwpBinds { return b.setNull(index, qwpTypeVarchar) } + +// Decimal64Bind binds a DECIMAL64 parameter from an explicit scale and +// unscaled int64. Scale must be in [0, 18]; DECIMAL64 can only store 18 +// digits of precision, so a higher scale is mathematically invalid. +func (b *QwpBinds) Decimal64Bind(index int, scale int, unscaled int64) *QwpBinds { + if !b.checkScale64(scale) { + return b + } + if !b.advance(index) { + return b + } + b.writeHeader(qwpTypeDecimal64, false) + b.buf = append(b.buf, byte(scale)) + b.appendUint64LE(uint64(unscaled)) + return b +} + +// NullDecimal64Bind binds a NULL DECIMAL64 parameter with implicit +// scale 0. The server reads the scale byte regardless of null, so the +// scale must be present on the wire even for null. Use +// NullDecimal64BindWithScale to bind a NULL with a specific scale. +func (b *QwpBinds) NullDecimal64Bind(index int) *QwpBinds { + return b.NullDecimal64BindWithScale(index, 0) +} + +// NullDecimal64BindWithScale binds a NULL DECIMAL64 parameter with the +// given scale. The scale becomes part of the bound variable's type on +// the server, so it is required for NULL the same way as for non-null. +// Mirrors Java's setNullDecimal64. 
+func (b *QwpBinds) NullDecimal64BindWithScale(index int, scale int) *QwpBinds { + if !b.checkScale64(scale) { + return b + } + if !b.advance(index) { + return b + } + b.writeHeader(qwpTypeDecimal64, true) + b.buf = append(b.buf, byte(scale)) + return b +} + +// Decimal128Bind binds a DECIMAL128 parameter from an explicit scale and +// 128-bit unscaled value split into lo / hi 64-bit halves (wire order: +// lo then hi, little-endian). Scale must be in [0, 38]. +func (b *QwpBinds) Decimal128Bind(index int, scale int, lo, hi uint64) *QwpBinds { + if !b.checkScale128(scale) { + return b + } + if !b.advance(index) { + return b + } + b.writeHeader(qwpTypeDecimal128, false) + b.buf = append(b.buf, byte(scale)) + b.appendUint64LE(lo) + b.appendUint64LE(hi) + return b +} + +// NullDecimal128Bind binds a NULL DECIMAL128 parameter with implicit +// scale 0. See NullDecimal64Bind for the rationale. Use +// NullDecimal128BindWithScale for an explicit scale. +func (b *QwpBinds) NullDecimal128Bind(index int) *QwpBinds { + return b.NullDecimal128BindWithScale(index, 0) +} + +// NullDecimal128BindWithScale binds a NULL DECIMAL128 parameter with +// the given scale. Mirrors Java's setNullDecimal128. +func (b *QwpBinds) NullDecimal128BindWithScale(index int, scale int) *QwpBinds { + if !b.checkScale128(scale) { + return b + } + if !b.advance(index) { + return b + } + b.writeHeader(qwpTypeDecimal128, true) + b.buf = append(b.buf, byte(scale)) + return b +} + +// Decimal256Bind binds a DECIMAL256 parameter from an explicit scale and +// 256-bit unscaled value split into four 64-bit limbs (wire order: +// ll, lh, hl, hh, each little-endian). Scale must be in [0, 76]. 
+func (b *QwpBinds) Decimal256Bind(index int, scale int, ll, lh, hl, hh uint64) *QwpBinds { + if !b.checkScale256(scale) { + return b + } + if !b.advance(index) { + return b + } + b.writeHeader(qwpTypeDecimal256, false) + b.buf = append(b.buf, byte(scale)) + b.appendUint64LE(ll) + b.appendUint64LE(lh) + b.appendUint64LE(hl) + b.appendUint64LE(hh) + return b +} + +// NullDecimal256Bind binds a NULL DECIMAL256 parameter with implicit +// scale 0. See NullDecimal64Bind for the rationale. Use +// NullDecimal256BindWithScale for an explicit scale. +func (b *QwpBinds) NullDecimal256Bind(index int) *QwpBinds { + return b.NullDecimal256BindWithScale(index, 0) +} + +// NullDecimal256BindWithScale binds a NULL DECIMAL256 parameter with +// the given scale. Mirrors Java's setNullDecimal256. +func (b *QwpBinds) NullDecimal256BindWithScale(index int, scale int) *QwpBinds { + if !b.checkScale256(scale) { + return b + } + if !b.advance(index) { + return b + } + b.writeHeader(qwpTypeDecimal256, true) + b.buf = append(b.buf, byte(scale)) + return b +} + +// DecimalBind binds a parameter from a Decimal value, choosing the +// narrowest DECIMAL64 / 128 / 256 wire type that holds the unscaled +// coefficient. A NULL Decimal encodes as a typed DECIMAL256 null with +// scale 0. +func (b *QwpBinds) DecimalBind(index int, value Decimal) *QwpBinds { + if b.err != nil { + return b + } + if value.isNull() { + return b.NullDecimal256BindWithScale(index, 0) + } + if err := value.ensureValidScale(); err != nil { + b.err = fmt.Errorf("qwp bind: %w", err) + return b + } + // Pick the smallest fixed-width form the coefficient fits into. + // offset is the index of the most-significant byte in the 32-byte + // big-endian unscaled buffer; 32-offset is the number of + // significant bytes. 
+ sigBytes := 32 - int(value.offset) + var wireSize int + var typeCode qwpTypeCode + switch { + case sigBytes <= 8: + wireSize = 8 + typeCode = qwpTypeDecimal64 + case sigBytes <= 16: + wireSize = 16 + typeCode = qwpTypeDecimal128 + default: + wireSize = 32 + typeCode = qwpTypeDecimal256 + } + if !b.advance(index) { + return b + } + b.writeHeader(typeCode, false) + b.buf = append(b.buf, byte(value.scale)) + // Convert the 32-byte BE unscaled representation to a sign-extended + // LE slice of wireSize bytes. wireSize is picked above so the + // significant bytes always fit, so the inner loop only needs to + // sign-extend across positions below value.offset. Shape matches + // addDecimal's write loop in qwp_buffer.go. + var signByte byte + if value.offset < 32 && value.unscaled[value.offset]&0x80 != 0 { + signByte = 0xFF + } + for i := 0; i < wireSize; i++ { + srcIdx := 31 - i + if uint8(srcIdx) < value.offset { + b.buf = append(b.buf, signByte) + } else { + b.buf = append(b.buf, value.unscaled[srcIdx]) + } + } + return b +} + +// setNull is the shared helper for per-type NullXxxBind methods. 
+func (b *QwpBinds) setNull(index int, t qwpTypeCode) *QwpBinds { + if !b.advance(index) { + return b + } + b.writeHeader(t, true) + return b +} + +func (b *QwpBinds) checkScale64(scale int) bool { + return b.checkScaleRange(scale, qwpDecimal64MaxScale, "DECIMAL64") +} + +func (b *QwpBinds) checkScale128(scale int) bool { + return b.checkScaleRange(scale, qwpDecimal128MaxScale, "DECIMAL128") +} + +func (b *QwpBinds) checkScale256(scale int) bool { + return b.checkScaleRange(scale, qwpDecimal256MaxScale, "DECIMAL256") +} + +func (b *QwpBinds) checkScaleRange(scale, maxScale int, typeName string) bool { + if b.err != nil { + return false + } + if scale < 0 || scale > maxScale { + b.err = fmt.Errorf( + "qwp bind: %s scale must be in [0, %d], got %d", + typeName, maxScale, scale) + return false + } + return true +} diff --git a/qwp_bind_values_test.go b/qwp_bind_values_test.go new file mode 100644 index 00000000..f217b26c --- /dev/null +++ b/qwp_bind_values_test.go @@ -0,0 +1,807 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ******************************************************************************/ + +package questdb + +import ( + "bytes" + "encoding/binary" + "math" + "strings" + "testing" +) + +// Header bytes mirrored from qwp_bind_values.go to keep the test +// independent of the production constants (so flipping a byte there +// fails the tests rather than silently passing). +const ( + testBindNonNull byte = 0x00 + testBindNullFlag byte = 0x01 + testBindNullBitmap byte = 0x01 +) + +// --- Helpers ------------------------------------------------------------- + +type byteBuf struct{ b []byte } + +func (w *byteBuf) put(b ...byte) { w.b = append(w.b, b...) } +func (w *byteBuf) putU16(v uint16) { + var tmp [2]byte + binary.LittleEndian.PutUint16(tmp[:], v) + w.b = append(w.b, tmp[:]...) +} +func (w *byteBuf) putU32(v uint32) { + var tmp [4]byte + binary.LittleEndian.PutUint32(tmp[:], v) + w.b = append(w.b, tmp[:]...) +} +func (w *byteBuf) putU64(v uint64) { + var tmp [8]byte + binary.LittleEndian.PutUint64(tmp[:], v) + w.b = append(w.b, tmp[:]...) 
+} +func (w *byteBuf) putVarint(v uint64) { + for v > 0x7F { + w.b = append(w.b, byte(v&0x7F)|0x80) + v >>= 7 + } + w.b = append(w.b, byte(v)) +} + +func assertEncoded(t *testing.T, b *QwpBinds, wantCount int, want []byte) { + t.Helper() + if err := b.Err(); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if b.Count() != wantCount { + t.Fatalf("count=%d, want %d", b.Count(), wantCount) + } + if !bytes.Equal(b.bufferBytes(), want) { + t.Fatalf("encoded bytes mismatch:\n got: % x\nwant: % x", + b.bufferBytes(), want) + } +} + +// --- Per-type encoding tests -------------------------------------------- + +func TestQwpBindsBoolean(t *testing.T) { + var b QwpBinds + b.BooleanBind(0, true) + var w byteBuf + w.put(byte(qwpTypeBoolean), testBindNonNull, 1) + assertEncoded(t, &b, 1, w.b) + + b.reset() + b.BooleanBind(0, false) + var w2 byteBuf + w2.put(byte(qwpTypeBoolean), testBindNonNull, 0) + assertEncoded(t, &b, 1, w2.b) +} + +func TestQwpBindsByte(t *testing.T) { + var b QwpBinds + b.ByteBind(0, -128) + b.ByteBind(1, 0) + b.ByteBind(2, 127) + minVal := int8(-128) + var w byteBuf + w.put(byte(qwpTypeByte), testBindNonNull, byte(minVal)) + w.put(byte(qwpTypeByte), testBindNonNull, 0) + w.put(byte(qwpTypeByte), testBindNonNull, 127) + assertEncoded(t, &b, 3, w.b) +} + +func TestQwpBindsChar(t *testing.T) { + var b QwpBinds + b.CharBind(0, 'Z') + var w byteBuf + w.put(byte(qwpTypeChar), testBindNonNull) + w.putU16(uint16('Z')) + assertEncoded(t, &b, 1, w.b) +} + +func TestQwpBindsCharRejectsNonBMP(t *testing.T) { + var b QwpBinds + b.CharBind(0, 0x1F600) // 😀 + if b.Err() == nil { + t.Fatalf("expected CharBind to reject non-BMP rune") + } + if b.Count() != 0 { + t.Fatalf("expected failed bind to leave count=0, got %d", b.Count()) + } + if !strings.Contains(b.Err().Error(), "CHAR") { + t.Fatalf("error message should mention CHAR: %v", b.Err()) + } +} + +func TestQwpBindsDate(t *testing.T) { + var b QwpBinds + b.DateBind(0, 1_700_000_000_000) + var w byteBuf + 
w.put(byte(qwpTypeDate), testBindNonNull) + w.putU64(uint64(int64(1_700_000_000_000))) + assertEncoded(t, &b, 1, w.b) +} + +func TestQwpBindsDecimal64(t *testing.T) { + var b QwpBinds + b.Decimal64Bind(0, 2, 12345) + var w byteBuf + w.put(byte(qwpTypeDecimal64), testBindNonNull, 2) + w.putU64(uint64(int64(12345))) + assertEncoded(t, &b, 1, w.b) +} + +func TestQwpBindsDecimal128(t *testing.T) { + var b QwpBinds + b.Decimal128Bind(0, 6, 0x0123456789ABCDEF, 0x7766554433221100) + var w byteBuf + w.put(byte(qwpTypeDecimal128), testBindNonNull, 6) + w.putU64(0x0123456789ABCDEF) + w.putU64(0x7766554433221100) + assertEncoded(t, &b, 1, w.b) +} + +func TestQwpBindsDecimal256(t *testing.T) { + var b QwpBinds + b.Decimal256Bind(0, 10, + 0x1111111111111111, 0x2222222222222222, + 0x3333333333333333, 0x4444444444444444) + var w byteBuf + w.put(byte(qwpTypeDecimal256), testBindNonNull, 10) + w.putU64(0x1111111111111111) + w.putU64(0x2222222222222222) + w.putU64(0x3333333333333333) + w.putU64(0x4444444444444444) + assertEncoded(t, &b, 1, w.b) +} + +func TestQwpBindsDecimalRejectsBadScale(t *testing.T) { + cases := []int{-1, int(maxDecimalScale) + 1} + for _, scale := range cases { + var b QwpBinds + b.Decimal64Bind(0, scale, 1) + if b.Err() == nil { + t.Fatalf("scale=%d should have been rejected", scale) + } + if !strings.Contains(b.Err().Error(), "scale") { + t.Fatalf("expected scale-related error, got: %v", b.Err()) + } + } +} + +func TestQwpBindsDecimalPerWidthScaleCaps(t *testing.T) { + // Per-width scale caps: DECIMAL64 stores up to 18 digits, + // DECIMAL128 up to 38, DECIMAL256 up to 76. The check is a + // client-side preflight — the server enforces only the global + // cap (76), so callers who bypass per-width validation can land + // in a state where the bound parameter's coefficient overflows + // the type's representable range. Mirrors Java's + // QwpBindValuesTest scale-bound rejections. 
+ type scaleCase struct { + name string + typeName string // expected substring in error message + ok int + bad int + bind func(b *QwpBinds, scale int) *QwpBinds + } + cases := []scaleCase{ + { + name: "Decimal64", typeName: "DECIMAL64", + ok: qwpDecimal64MaxScale, + bad: qwpDecimal64MaxScale + 1, + bind: func(b *QwpBinds, scale int) *QwpBinds { + return b.Decimal64Bind(0, scale, 1) + }, + }, + { + name: "Decimal128", typeName: "DECIMAL128", + ok: qwpDecimal128MaxScale, + bad: qwpDecimal128MaxScale + 1, + bind: func(b *QwpBinds, scale int) *QwpBinds { + return b.Decimal128Bind(0, scale, 0, 0) + }, + }, + { + name: "Decimal256", typeName: "DECIMAL256", + ok: qwpDecimal256MaxScale, + bad: qwpDecimal256MaxScale + 1, + bind: func(b *QwpBinds, scale int) *QwpBinds { + return b.Decimal256Bind(0, scale, 0, 0, 0, 0) + }, + }, + { + name: "NullDecimal64WithScale", typeName: "DECIMAL64", + ok: qwpDecimal64MaxScale, + bad: qwpDecimal64MaxScale + 1, + bind: func(b *QwpBinds, scale int) *QwpBinds { + return b.NullDecimal64BindWithScale(0, scale) + }, + }, + { + name: "NullDecimal128WithScale", typeName: "DECIMAL128", + ok: qwpDecimal128MaxScale, + bad: qwpDecimal128MaxScale + 1, + bind: func(b *QwpBinds, scale int) *QwpBinds { + return b.NullDecimal128BindWithScale(0, scale) + }, + }, + { + name: "NullDecimal256WithScale", typeName: "DECIMAL256", + ok: qwpDecimal256MaxScale, + bad: qwpDecimal256MaxScale + 1, + bind: func(b *QwpBinds, scale int) *QwpBinds { + return b.NullDecimal256BindWithScale(0, scale) + }, + }, + } + for _, c := range cases { + t.Run(c.name+"AcceptsBoundary", func(t *testing.T) { + var b QwpBinds + c.bind(&b, c.ok) + if err := b.Err(); err != nil { + t.Fatalf("scale=%d should be accepted: %v", c.ok, err) + } + }) + t.Run(c.name+"RejectsOverBoundary", func(t *testing.T) { + var b QwpBinds + c.bind(&b, c.bad) + if b.Err() == nil { + t.Fatalf("scale=%d should be rejected", c.bad) + } + // Error must call out the per-width type so the user knows + // to upgrade 
rather than bisecting on scale. + if !strings.Contains(b.Err().Error(), c.typeName) { + t.Fatalf("error %q must mention %s", b.Err(), c.typeName) + } + }) + t.Run(c.name+"RejectsNegative", func(t *testing.T) { + var b QwpBinds + c.bind(&b, -1) + if b.Err() == nil { + t.Fatalf("scale=-1 should be rejected") + } + }) + } +} + +func TestQwpBindsNullDecimalWithScale(t *testing.T) { + // NullDecimalXBindWithScale must place the explicit scale byte + // after the null bitmap so the server's setDecimal path picks up + // the correct precision/type. + cases := []struct { + name string + bind func(b *QwpBinds) *QwpBinds + typ qwpTypeCode + scale byte + }{ + {"Decimal64Scale5", + func(b *QwpBinds) *QwpBinds { return b.NullDecimal64BindWithScale(0, 5) }, + qwpTypeDecimal64, 5}, + {"Decimal128Scale20", + func(b *QwpBinds) *QwpBinds { return b.NullDecimal128BindWithScale(0, 20) }, + qwpTypeDecimal128, 20}, + {"Decimal256Scale50", + func(b *QwpBinds) *QwpBinds { return b.NullDecimal256BindWithScale(0, 50) }, + qwpTypeDecimal256, 50}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + var b QwpBinds + c.bind(&b) + var w byteBuf + w.put(byte(c.typ), testBindNullFlag, testBindNullBitmap, c.scale) + assertEncoded(t, &b, 1, w.b) + }) + } +} + +func TestQwpBindsNullGeohashWithPrecision(t *testing.T) { + // NullGeohashBindWithPrecision must place the precision varint + // after the null bitmap, matching the wire layout of a non-null + // GEOHASH bind. The server reads the varint unconditionally. 
+ cases := []struct { + name string + precision int + }{ + {"Min", qwpGeohashMinBits}, + {"Mid", 30}, + {"Max", qwpGeohashMaxBits}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + var b QwpBinds + b.NullGeohashBindWithPrecision(0, c.precision) + var w byteBuf + w.put(byte(qwpTypeGeohash), testBindNullFlag, testBindNullBitmap) + w.putVarint(uint64(c.precision)) + assertEncoded(t, &b, 1, w.b) + }) + } + t.Run("DefaultUsesMinBits", func(t *testing.T) { + var b QwpBinds + b.NullGeohashBind(0) + var w byteBuf + w.put(byte(qwpTypeGeohash), testBindNullFlag, testBindNullBitmap) + w.putVarint(uint64(qwpGeohashMinBits)) + assertEncoded(t, &b, 1, w.b) + }) + t.Run("RejectsZero", func(t *testing.T) { + var b QwpBinds + b.NullGeohashBindWithPrecision(0, 0) + if b.Err() == nil { + t.Fatalf("precision=0 must be rejected") + } + }) + t.Run("RejectsTooLarge", func(t *testing.T) { + var b QwpBinds + b.NullGeohashBindWithPrecision(0, qwpGeohashMaxBits+1) + if b.Err() == nil { + t.Fatalf("precision=%d must be rejected", qwpGeohashMaxBits+1) + } + }) +} + +func TestQwpBindsDouble(t *testing.T) { + var b QwpBinds + b.DoubleBind(0, 2.718281828) + var w byteBuf + w.put(byte(qwpTypeDouble), testBindNonNull) + w.putU64(math.Float64bits(2.718281828)) + assertEncoded(t, &b, 1, w.b) + + b.reset() + b.DoubleBind(0, math.NaN()) + var w2 byteBuf + w2.put(byte(qwpTypeDouble), testBindNonNull) + w2.putU64(math.Float64bits(math.NaN())) + assertEncoded(t, &b, 1, w2.b) +} + +func TestQwpBindsFloat(t *testing.T) { + var b QwpBinds + b.FloatBind(0, 3.14) + var w byteBuf + w.put(byte(qwpTypeFloat), testBindNonNull) + w.putU32(math.Float32bits(3.14)) + assertEncoded(t, &b, 1, w.b) +} + +func TestQwpBindsGeohashMinMax(t *testing.T) { + t.Run("min", func(t *testing.T) { + var b QwpBinds + b.GeohashBind(0, 1, 1) + var w byteBuf + w.put(byte(qwpTypeGeohash), testBindNonNull) + w.putVarint(1) + w.put(0x01) + assertEncoded(t, &b, 1, w.b) + }) + t.Run("max", func(t *testing.T) { + var b 
QwpBinds + value := uint64(0x0FFF_FFFF_FFFF_FFFF) + b.GeohashBind(0, value, 60) + var w byteBuf + w.put(byte(qwpTypeGeohash), testBindNonNull) + w.putVarint(60) + for i := 0; i < 8; i++ { + w.put(byte(value >> (i * 8))) + } + assertEncoded(t, &b, 1, w.b) + }) +} + +func TestQwpBindsGeohashMasksHighBits(t *testing.T) { + // precisionBits=5 keeps only the low 5 bits; the wire byte should + // be 0x1F regardless of the high bits in value. + t.Run("subByte", func(t *testing.T) { + var b QwpBinds + b.GeohashBind(0, 0xFFFF_FFFF_FFFF_FFFF, 5) + var w byteBuf + w.put(byte(qwpTypeGeohash), testBindNonNull) + w.putVarint(5) + w.put(0x1F) + assertEncoded(t, &b, 1, w.b) + }) + // Non-byte-aligned across a byte boundary. precisionBits=12 + // emits 2 wire bytes; only the low 4 bits of the second byte + // carry payload, the upper nibble must be zero. Mirrors Java's + // boundary-bug regression: an unmasked value would leak the + // shifted-in high bit into the second wire byte's upper nibble. + t.Run("subNibbleAcrossByte_12", func(t *testing.T) { + var b QwpBinds + b.GeohashBind(0, 0xFFFF_FFFF_FFFF_FFFF, 12) + masked := uint64(0x0FFF) // low 12 bits of all-ones + var w byteBuf + w.put(byte(qwpTypeGeohash), testBindNonNull) + w.putVarint(12) + w.put(byte(masked)) + w.put(byte(masked >> 8)) + assertEncoded(t, &b, 1, w.b) + }) + // precisionBits=20 emits 3 wire bytes; only the low 4 bits of + // the third byte carry payload. + t.Run("nonByteAligned_20", func(t *testing.T) { + var b QwpBinds + b.GeohashBind(0, 0xFFFF_FFFF_FFFF_FFFF, 20) + masked := uint64(0x0F_FFFF) + var w byteBuf + w.put(byte(qwpTypeGeohash), testBindNonNull) + w.putVarint(20) + for i := 0; i < 3; i++ { + w.put(byte(masked >> (i * 8))) + } + assertEncoded(t, &b, 1, w.b) + }) + // Byte-aligned mid-range. precisionBits=24 emits exactly 3 wire + // bytes; every bit is payload. 
+ t.Run("byteAligned_24", func(t *testing.T) { + var b QwpBinds + b.GeohashBind(0, 0xFFFF_FFFF_FFFF_FFFF, 24) + var w byteBuf + w.put(byte(qwpTypeGeohash), testBindNonNull) + w.putVarint(24) + w.put(0xFF, 0xFF, 0xFF) + assertEncoded(t, &b, 1, w.b) + }) + // precisionBits=60 (max). Only the low 60 bits matter; the top + // nibble of the highest wire byte must be zero. + t.Run("maxPrecision", func(t *testing.T) { + var b QwpBinds + b.GeohashBind(0, 0xFFFF_FFFF_FFFF_FFFF, 60) + var w byteBuf + w.put(byte(qwpTypeGeohash), testBindNonNull) + w.putVarint(60) + masked := uint64(0x0FFF_FFFF_FFFF_FFFF) + for i := 0; i < 8; i++ { + w.put(byte(masked >> (i * 8))) + } + assertEncoded(t, &b, 1, w.b) + }) +} + +func TestQwpBindsGeohashRejectsOutOfRange(t *testing.T) { + cases := []int{0, 61, -1} + for _, p := range cases { + var b QwpBinds + b.GeohashBind(0, 1, p) + if b.Err() == nil { + t.Fatalf("precision=%d should have been rejected", p) + } + if !strings.Contains(b.Err().Error(), "precision") { + t.Fatalf("expected precision-related error, got: %v", b.Err()) + } + } +} + +func TestQwpBindsInt(t *testing.T) { + var b QwpBinds + minVal := int32(math.MinInt32) + maxVal := int32(math.MaxInt32) + b.IntBind(0, minVal).IntBind(1, 0).IntBind(2, maxVal) + var w byteBuf + w.put(byte(qwpTypeInt), testBindNonNull) + w.putU32(uint32(minVal)) + w.put(byte(qwpTypeInt), testBindNonNull) + w.putU32(0) + w.put(byte(qwpTypeInt), testBindNonNull) + w.putU32(uint32(maxVal)) + assertEncoded(t, &b, 3, w.b) +} + +func TestQwpBindsLong(t *testing.T) { + var b QwpBinds + b.LongBind(0, 42) + var w byteBuf + w.put(byte(qwpTypeLong), testBindNonNull) + w.putU64(42) + assertEncoded(t, &b, 1, w.b) +} + +func TestQwpBindsLong256(t *testing.T) { + var b QwpBinds + b.Long256Bind(0, 0x1111111111111111, 0x2222222222222222, + 0x3333333333333333, 0x4444444444444444) + var w byteBuf + w.put(byte(qwpTypeLong256), testBindNonNull) + w.putU64(0x1111111111111111) + w.putU64(0x2222222222222222) + 
w.putU64(0x3333333333333333) + w.putU64(0x4444444444444444) + assertEncoded(t, &b, 1, w.b) +} + +func TestQwpBindsMixedTypes(t *testing.T) { + var b QwpBinds + b.LongBind(0, 1234567890). + VarcharBind(1, "hello"). + BooleanBind(2, true). + DoubleBind(3, 1.5) + + var w byteBuf + w.put(byte(qwpTypeLong), testBindNonNull) + w.putU64(1234567890) + w.put(byte(qwpTypeVarchar), testBindNonNull) + w.putU32(0) + w.putU32(5) + w.put([]byte("hello")...) + w.put(byte(qwpTypeBoolean), testBindNonNull, 1) + w.put(byte(qwpTypeDouble), testBindNonNull) + w.putU64(math.Float64bits(1.5)) + + assertEncoded(t, &b, 4, w.b) +} + +func TestQwpBindsNullExhaustive(t *testing.T) { + var b QwpBinds + b.NullBooleanBind(0). + NullByteBind(1). + NullShortBind(2). + NullCharBind(3). + NullIntBind(4). + NullLongBind(5). + NullFloatBind(6). + NullDoubleBind(7). + NullDateBind(8). + NullTimestampMicrosBind(9). + NullTimestampNanosBind(10). + NullUuidBind(11). + NullLong256Bind(12). + NullGeohashBind(13). + NullVarcharBind(14). + NullDecimal64Bind(15). + NullDecimal128Bind(16). + NullDecimal256Bind(17) + + // Plain null types (no metadata after the bitmap byte). + plainTypes := []qwpTypeCode{ + qwpTypeBoolean, qwpTypeByte, qwpTypeShort, qwpTypeChar, + qwpTypeInt, qwpTypeLong, qwpTypeFloat, qwpTypeDouble, + qwpTypeDate, qwpTypeTimestamp, qwpTypeTimestampNano, + qwpTypeUuid, qwpTypeLong256, // 13 entries + } + + var w byteBuf + for _, tc := range plainTypes { + w.put(byte(tc), testBindNullFlag, testBindNullBitmap) + } + // GEOHASH null carries the precision varint after the bitmap; the + // server reads it unconditionally, even for null. Default precision + // is qwpGeohashMinBits=1. + w.put(byte(qwpTypeGeohash), testBindNullFlag, testBindNullBitmap) + w.putVarint(uint64(qwpGeohashMinBits)) + // VARCHAR null is plain. + w.put(byte(qwpTypeVarchar), testBindNullFlag, testBindNullBitmap) + // DECIMAL64/128/256 null carry a 1-byte scale (default 0). 
The + // server's setDecimal path reads this byte unconditionally; without + // it the next bind's type code would be misread as a scale. + w.put(byte(qwpTypeDecimal64), testBindNullFlag, testBindNullBitmap, 0x00) + w.put(byte(qwpTypeDecimal128), testBindNullFlag, testBindNullBitmap, 0x00) + w.put(byte(qwpTypeDecimal256), testBindNullFlag, testBindNullBitmap, 0x00) + + assertEncoded(t, &b, 18, w.b) +} + +func TestQwpBindsShort(t *testing.T) { + var b QwpBinds + minVal := int16(math.MinInt16) + maxVal := int16(math.MaxInt16) + b.ShortBind(0, minVal).ShortBind(1, 0).ShortBind(2, maxVal) + var w byteBuf + w.put(byte(qwpTypeShort), testBindNonNull) + w.putU16(uint16(minVal)) + w.put(byte(qwpTypeShort), testBindNonNull) + w.putU16(0) + w.put(byte(qwpTypeShort), testBindNonNull) + w.putU16(uint16(maxVal)) + assertEncoded(t, &b, 3, w.b) +} + +func TestQwpBindsTimestampMicros(t *testing.T) { + var b QwpBinds + b.TimestampMicrosBind(0, 1_700_000_000_000_000) + var w byteBuf + w.put(byte(qwpTypeTimestamp), testBindNonNull) + w.putU64(uint64(int64(1_700_000_000_000_000))) + assertEncoded(t, &b, 1, w.b) +} + +func TestQwpBindsTimestampNanos(t *testing.T) { + var b QwpBinds + b.TimestampNanosBind(0, 1_700_000_000_000_000_000) + var w byteBuf + w.put(byte(qwpTypeTimestampNano), testBindNonNull) + w.putU64(uint64(int64(1_700_000_000_000_000_000))) + assertEncoded(t, &b, 1, w.b) +} + +func TestQwpBindsUuid(t *testing.T) { + var b QwpBinds + b.UuidBind(0, 0x0BADF00DDEADBEEF, 0xFEEDFACECAFEBEEF) + var w byteBuf + // Wire order: lo first, then hi. + w.put(byte(qwpTypeUuid), testBindNonNull) + w.putU64(0xFEEDFACECAFEBEEF) + w.putU64(0x0BADF00DDEADBEEF) + assertEncoded(t, &b, 1, w.b) +} + +func TestQwpBindsVarcharAscii(t *testing.T) { + var b QwpBinds + b.VarcharBind(0, "hello") + var w byteBuf + w.put(byte(qwpTypeVarchar), testBindNonNull) + w.putU32(0) + w.putU32(5) + w.put([]byte("hello")...) 
+ assertEncoded(t, &b, 1, w.b) +} + +func TestQwpBindsVarcharEmpty(t *testing.T) { + var b QwpBinds + b.VarcharBind(0, "") + var w byteBuf + w.put(byte(qwpTypeVarchar), testBindNonNull) + w.putU32(0) + w.putU32(0) + assertEncoded(t, &b, 1, w.b) +} + +func TestQwpBindsVarcharUnicode(t *testing.T) { + const value = "café" + var b QwpBinds + b.VarcharBind(0, value) + utf8Bytes := []byte(value) + var w byteBuf + w.put(byte(qwpTypeVarchar), testBindNonNull) + w.putU32(0) + w.putU32(uint32(len(utf8Bytes))) + w.put(utf8Bytes...) + assertEncoded(t, &b, 1, w.b) +} + +// --- Decimal bind from Decimal struct ------------------------------------ + +func TestQwpBindsDecimalAutoWidthFitsInt64(t *testing.T) { + d := NewDecimalFromInt64(12345, 2) + var b QwpBinds + b.DecimalBind(0, d) + // unscaled 12345 fits in 8 bytes -> DECIMAL64. + var w byteBuf + w.put(byte(qwpTypeDecimal64), testBindNonNull, 2) + var signExtended [8]byte + binary.LittleEndian.PutUint64(signExtended[:], uint64(int64(12345))) + w.put(signExtended[:]...) + assertEncoded(t, &b, 1, w.b) +} + +func TestQwpBindsDecimalAutoWidthNegativeInt64(t *testing.T) { + d := NewDecimalFromInt64(-1, 0) + var b QwpBinds + b.DecimalBind(0, d) + var w byteBuf + w.put(byte(qwpTypeDecimal64), testBindNonNull, 0) + var signExtended [8]byte + negOne := int64(-1) + binary.LittleEndian.PutUint64(signExtended[:], uint64(negOne)) + w.put(signExtended[:]...) + assertEncoded(t, &b, 1, w.b) +} + +func TestQwpBindsDecimalAutoWidthNull(t *testing.T) { + nullDecimal, err := NewDecimalUnsafe(nil, 0) + if err != nil { + t.Fatalf("NewDecimalUnsafe: %v", err) + } + var b QwpBinds + b.DecimalBind(0, nullDecimal) + // NULL Decimal canonicalises to DECIMAL256 with scale 0; the scale + // byte must be on the wire (the server reads it unconditionally). 
+ var w byteBuf + w.put(byte(qwpTypeDecimal256), testBindNullFlag, testBindNullBitmap, 0x00) + assertEncoded(t, &b, 1, w.b) +} + +// --- Ordering and limit checks ------------------------------------------- + +func TestQwpBindsRejectsDuplicateIndex(t *testing.T) { + var b QwpBinds + b.LongBind(0, 1).LongBind(0, 2) + if b.Err() == nil { + t.Fatal("expected duplicate index to be rejected") + } + if !strings.Contains(b.Err().Error(), "out of order") { + t.Fatalf("got error: %v", b.Err()) + } +} + +func TestQwpBindsRejectsOutOfOrderIndex(t *testing.T) { + var b QwpBinds + b.LongBind(0, 1).LongBind(2, 3) + if b.Err() == nil { + t.Fatal("expected non-contiguous index to be rejected") + } +} + +func TestQwpBindsTooMany(t *testing.T) { + var b QwpBinds + for i := 0; i < qwpMaxBindsPerQuery; i++ { + b.IntBind(i, int32(i)) + } + if err := b.Err(); err != nil { + t.Fatalf("filling %d binds should succeed: %v", qwpMaxBindsPerQuery, err) + } + b.IntBind(qwpMaxBindsPerQuery, 0) + if b.Err() == nil { + t.Fatalf("exceeding %d binds should fail", qwpMaxBindsPerQuery) + } + if !strings.Contains(b.Err().Error(), "too many") { + t.Fatalf("got error: %v", b.Err()) + } +} + +// --- Reset invariants ---------------------------------------------------- + +func TestQwpBindsResetPreservesBuffer(t *testing.T) { + var b QwpBinds + b.LongBind(0, 42).IntBind(1, 7) + first := append([]byte(nil), b.bufferBytes()...) 
+ + b.reset() + if b.Count() != 0 || len(b.bufferBytes()) != 0 || b.Err() != nil { + t.Fatalf("reset did not clear state") + } + + b.LongBind(0, 42).IntBind(1, 7) + if !bytes.Equal(first, b.bufferBytes()) { + t.Fatalf("re-encoding after reset differs") + } +} + +func TestQwpBindsBufferGrowsBeyondDefault(t *testing.T) { + var b QwpBinds + big := strings.Repeat("x", 20_000) + b.VarcharBind(0, big) + if b.Err() != nil { + t.Fatalf("unexpected error: %v", b.Err()) + } + // type(1) + flag(1) + offset0(4) + len(4) + 20000 bytes = 20010 + if got, want := len(b.bufferBytes()), 1+1+4+4+20_000; got != want { + t.Fatalf("encoded length=%d, want %d", got, want) + } +} + +// --- Fluent-chain short-circuit ----------------------------------------- + +// Once an error is latched, subsequent setters must not allocate or +// mutate the buffer. Matches the ILP / QWP ingress sender pattern. +func TestQwpBindsLatchedErrorShortCircuits(t *testing.T) { + var b QwpBinds + b.LongBind(0, 1).LongBind(5, 2) // out-of-order -> latches error at index=5 + bufBefore := append([]byte(nil), b.bufferBytes()...) + b.LongBind(6, 3).IntBind(7, 4) // must be no-ops + if !bytes.Equal(bufBefore, b.bufferBytes()) { + t.Fatalf("bind calls after latched error mutated the buffer") + } +} diff --git a/qwp_buffer.go b/qwp_buffer.go index b8341280..4b51e188 100644 --- a/qwp_buffer.go +++ b/qwp_buffer.go @@ -29,6 +29,7 @@ import ( "fmt" "math" "math/bits" + "strings" ) // qwpLongNull is the uint64 bit pattern for int64 MinInt64 @@ -88,9 +89,11 @@ type qwpColumnBuffer struct { // has rowCount+1 entries with arrayOffsets[0]==0. Row i's encoded // data spans arrayData[arrayOffsets[i]:arrayOffsets[i+1]]. // Each row's encoded data contains: - // nDims (1 byte) + shape (nDims × 4 bytes LE) + flattened + // nDims (1 byte, >= 1) + shape (nDims × 4 bytes LE) + flattened // elements (product(shape) × 8 bytes LE). - // Null arrays are encoded as nDims=1, dim0=0 (5 bytes total). 
+ // The public ingest API always creates array columns with + // nullable=true, so NULL rows are tracked in the null bitmap and + // no inline data is appended for them. arrayOffsets []uint32 arrayData []byte @@ -411,72 +414,130 @@ func (c *qwpColumnBuffer) growArrayData(n int) int { return off } -// addDoubleArray appends an N-dimensional float64 array value -// (TYPE_DOUBLE_ARRAY). The encoded data is stored as: -// -// nDims (1 byte) + shape (nDims × 4 bytes LE) + flattened -// elements (product(shape) × 8 bytes LE, row-major order). -func (c *qwpColumnBuffer) addDoubleArray(nDims uint8, shape []int32, flatData []float64) { +// reserveArrayValue grows arrayData for one array value — a +// nDims+shape header (1 + nDims×4 bytes) followed by payloadBytes of +// flattened element data — writes the header, advances the +// arrayOffsets / dataSize / rowCount bookkeeping, and returns the +// payload sub-slice (len == payloadBytes) the caller fills with the +// little-endian elements. Centralising grow+header+bookkeeping lets +// every typed array writer stream its elements straight into arrayData +// with no intermediate flattened copy. 
+func (c *qwpColumnBuffer) reserveArrayValue(nDims uint8, shape []int32, payloadBytes int) []byte { metaSize := 1 + int(nDims)*4 - dataSize := len(flatData) * 8 - totalSize := metaSize + dataSize + totalSize := metaSize + payloadBytes off := c.growArrayData(totalSize) - buf := c.arrayData[off:] + buf := c.arrayData[off : off+totalSize] - // nDims buf[0] = nDims pos := 1 - - // shape: each dimension as uint32 LE for i := 0; i < int(nDims); i++ { binary.LittleEndian.PutUint32(buf[pos:], uint32(shape[i])) pos += 4 } - // flattened elements: each float64 LE - for _, v := range flatData { - binary.LittleEndian.PutUint64(buf[pos:], math.Float64bits(v)) - pos += 8 - } - c.arrayOffsets = append(c.arrayOffsets, uint32(len(c.arrayData))) c.trackDataGrowth(totalSize + 4) // array data + uint32 offset c.rowCount++ + return buf[pos:totalSize] } -// addLongArray appends an N-dimensional int64 array value -// (TYPE_LONG_ARRAY). The encoded data is stored as: +// addDoubleArray appends an N-dimensional float64 array value +// (TYPE_DOUBLE_ARRAY). The encoded data is stored as: // // nDims (1 byte) + shape (nDims × 4 bytes LE) + flattened // elements (product(shape) × 8 bytes LE, row-major order). -func (c *qwpColumnBuffer) addLongArray(nDims uint8, shape []int32, flatData []int64) { - metaSize := 1 + int(nDims)*4 - dataSize := len(flatData) * 8 - totalSize := metaSize + dataSize - - off := c.growArrayData(totalSize) - buf := c.arrayData[off:] +// +// flatData must already be row-major; the typed 2D/3D writers +// (addDoubleArray2D / addDoubleArray3D) stream their nested input in +// directly instead, avoiding an intermediate flat copy. 
+func (c *qwpColumnBuffer) addDoubleArray(nDims uint8, shape []int32, flatData []float64) { + dst := c.reserveArrayValue(nDims, shape, len(flatData)*8) + pos := 0 + for _, v := range flatData { + binary.LittleEndian.PutUint64(dst[pos:], math.Float64bits(v)) + pos += 8 + } +} - // nDims - buf[0] = nDims - pos := 1 +// addDoubleArray2D appends a regular 2D float64 array, streaming each +// element straight into arrayData. The caller has already validated +// the shape is regular (every row len == dim1) and within bounds. +func (c *qwpColumnBuffer) addDoubleArray2D(dim0, dim1 int, values [][]float64) { + dst := c.reserveArrayValue(2, []int32{int32(dim0), int32(dim1)}, dim0*dim1*8) + pos := 0 + for _, row := range values { + for _, v := range row { + binary.LittleEndian.PutUint64(dst[pos:], math.Float64bits(v)) + pos += 8 + } + } +} - // shape: each dimension as uint32 LE - for i := 0; i < int(nDims); i++ { - binary.LittleEndian.PutUint32(buf[pos:], uint32(shape[i])) - pos += 4 +// addDoubleArray3D appends a regular 3D float64 array, streaming each +// element straight into arrayData. The caller has already validated +// the shape is regular (every plane len == dim1, every row len == +// dim2) and within bounds. +func (c *qwpColumnBuffer) addDoubleArray3D(dim0, dim1, dim2 int, values [][][]float64) { + dst := c.reserveArrayValue(3, []int32{int32(dim0), int32(dim1), int32(dim2)}, dim0*dim1*dim2*8) + pos := 0 + for _, plane := range values { + for _, row := range plane { + for _, v := range row { + binary.LittleEndian.PutUint64(dst[pos:], math.Float64bits(v)) + pos += 8 + } + } } +} - // flattened elements: each int64 LE +// addLongArray appends an N-dimensional int64 array value +// (TYPE_LONG_ARRAY). The encoded data is stored as: +// +// nDims (1 byte) + shape (nDims × 4 bytes LE) + flattened +// elements (product(shape) × 8 bytes LE, row-major order). 
+// +// flatData must already be row-major; the typed 2D/3D writers +// (addLongArray2D / addLongArray3D) stream their nested input in +// directly instead, avoiding an intermediate flat copy. +func (c *qwpColumnBuffer) addLongArray(nDims uint8, shape []int32, flatData []int64) { + dst := c.reserveArrayValue(nDims, shape, len(flatData)*8) + pos := 0 for _, v := range flatData { - binary.LittleEndian.PutUint64(buf[pos:], uint64(v)) + binary.LittleEndian.PutUint64(dst[pos:], uint64(v)) pos += 8 } +} - c.arrayOffsets = append(c.arrayOffsets, uint32(len(c.arrayData))) - c.trackDataGrowth(totalSize + 4) // array data + uint32 offset - c.rowCount++ +// addLongArray2D appends a regular 2D int64 array, streaming each +// element straight into arrayData. The caller has already validated +// the shape is regular (every row len == dim1) and within bounds. +func (c *qwpColumnBuffer) addLongArray2D(dim0, dim1 int, values [][]int64) { + dst := c.reserveArrayValue(2, []int32{int32(dim0), int32(dim1)}, dim0*dim1*8) + pos := 0 + for _, row := range values { + for _, v := range row { + binary.LittleEndian.PutUint64(dst[pos:], uint64(v)) + pos += 8 + } + } +} + +// addLongArray3D appends a regular 3D int64 array, streaming each +// element straight into arrayData. The caller has already validated +// the shape is regular (every plane len == dim1, every row len == +// dim2) and within bounds. 
+func (c *qwpColumnBuffer) addLongArray3D(dim0, dim1, dim2 int, values [][][]int64) { + dst := c.reserveArrayValue(3, []int32{int32(dim0), int32(dim1), int32(dim2)}, dim0*dim1*dim2*8) + pos := 0 + for _, plane := range values { + for _, row := range plane { + for _, v := range row { + binary.LittleEndian.PutUint64(dst[pos:], uint64(v)) + pos += 8 + } + } + } } // addDecimal appends a Decimal value to a decimal column @@ -641,13 +702,14 @@ func (c *qwpColumnBuffer) addNull() { c.appendU64(qwpLongNull) case qwpTypeDoubleArray, qwpTypeLongArray: - // Null array sentinel: nDims=1, dim0=0 (5 bytes total). - off := len(c.arrayData) - c.arrayData = append(c.arrayData, 0, 0, 0, 0, 0) - c.arrayData[off] = 0x01 // nDims = 1 - // dim0 = 0 (already zero from append) - c.arrayOffsets = append(c.arrayOffsets, uint32(len(c.arrayData))) - c.trackDataGrowth(5 + 4) // 5 data + uint32 offset + // Unreachable from the public API: Float64Array* / Int64Array* + // always create array columns with nullable=true, so addNull + // takes the bitmap branch above. The wire format has no inline + // NULL sentinel for arrays — the server's ingest cursor and + // the Go egress decoder both reject nDims=0 — so there is no + // valid byte sequence to emit here. Fail loud rather than + // write a frame the peer will reject. + panic("qwp: addNull on a non-nullable array column is not supported") case qwpTypeGeohash: // -1 (all bits set) is the QuestDB geohash null sentinel. @@ -783,9 +845,35 @@ func (c *qwpColumnBuffer) truncateTo(n int) { // manages multiple qwpColumnBuffer instances and handles row commits // with automatic gap-filling for columns not set in a given row. type qwpTableBuffer struct { - tableName string - columns []*qwpColumnBuffer - columnIndex map[string]int // column name → index in columns slice + tableName string + columns []*qwpColumnBuffer + // columnIndex is keyed by the lowercase column name. 
QuestDB + // column names are case-insensitive throughout the server stack + // (TableReaderMetadata.columnNameIndexMap and + // TableUpdateDetails are LowerCase* maps), so the buffer must + // dedupe accordingly — otherwise emitting "Pressure_S" in one + // row and "pressure_s" in another within the same wire frame + // produces two distinct column definitions, and the server's + // QwpTudCache.getOrCreateTable faithfully creates two + // case-equivalent columns, corrupting the metadata file. + // Mirrors Java QwpTableBuffer.columnNameToIndex + // (LowerCaseCharSequenceIntHashMap). The column's own .name + // stays case-preserved (first-seen casing) for wire emission. + // + // It may also hold memoized casing-variant alias keys (a verbatim + // mixed-case name aliased to a column's canonical lowercase key) so + // repeat lookups of a mixed-case column skip strings.ToLower. Alias + // keys always carry an uppercase letter, so they never collide with + // the all-lowercase canonical keys. See getOrCreateColumn. + columnIndex map[string]int + + // aliasKeys records the casing-variant keys memoized into + // columnIndex. cancelRow drops them when it removes columns: an + // alias maps to a column index that the truncation (or the + // committedColumnCount==0 reset case, which removes every column) + // can leave dangling past the columns slice. They re-memoize on + // demand. + aliasKeys []string // rowCount is the number of committed (finalized) rows. rowCount int @@ -804,19 +892,20 @@ type qwpTableBuffer struct { // in the Java client. columnAccessCursor int - // schemaId is the per-connection schema identifier for this - // table's current column set. -1 means unassigned — the sender - // allocates a fresh ID from its nextSchemaId counter on the next - // flush and sends the schema in full mode. Reset to -1 whenever - // the column set changes so a new ID is allocated and the server - // re-registers the schema. 
// qwpASCIIEqualFold reports whether a and b are the same byte sequence
// once the ASCII letters 'A'–'Z' / 'a'–'z' are compared without regard
// to case. Every other byte, including all non-ASCII bytes and ASCII
// punctuation, has to match exactly.
//
// Strings of different length are never equal.
func qwpASCIIEqualFold(a, b string) bool {
	n := len(a)
	if n != len(b) {
		return false
	}
	for i := 0; i < n; i++ {
		x, y := a[i], b[i]
		switch {
		case x == y:
			// Identical bytes: nothing to fold.
		case x^y != 0x20:
			// An upper/lower ASCII pair differs in the 0x20 case bit
			// and nowhere else; any other difference is a mismatch.
			return false
		default:
			// The bytes differ only in the case bit. That alone is not
			// enough: punctuation pairs such as '@'/'`', '['/'{',
			// ']'/'}' and '^'/'~' share the property, so the folded
			// byte must also be a letter.
			if folded := x | 0x20; folded < 'a' || folded > 'z' {
				return false
			}
		}
	}
	return true
}
idx, exists := tb.columnIndex[name] + if !exists { + lower := strings.ToLower(name) + if idx, exists = tb.columnIndex[lower]; exists && lower != name { + // Memoize the verbatim casing as an alias of the canonical + // key so the next row's lookup by this casing hits the probe + // above without re-lowercasing. aliasKeys records it so + // cancelRow can drop it when it removes columns. + tb.columnIndex[name] = idx + tb.aliasKeys = append(tb.aliasKeys, name) + } + } if exists { col := tb.columns[idx] if col.typeCode != typeCode { @@ -866,6 +1008,11 @@ func (tb *qwpTableBuffer) getOrCreateColumn(name string, typeCode qwpTypeCode, n if col.rowCount > tb.rowCount { return nil, fmt.Errorf("qwp: column %q already set for current row", name) } + // Resync the sequential cursor: after a sparse skip the caller + // most likely continues in column-definition order, so predict + // idx+1 next. This restores the allocation-free fast path for the + // rest of the row instead of a map lookup per remaining column. + tb.columnAccessCursor = idx + 1 return col, nil } @@ -892,9 +1039,8 @@ func (tb *qwpTableBuffer) getOrCreateColumn(name string, typeCode qwpTypeCode, n col.addNull() } - tb.columnIndex[name] = len(tb.columns) + tb.columnIndex[strings.ToLower(name)] = len(tb.columns) tb.columns = append(tb.columns, col) - tb.schemaId = -1 return col, nil } @@ -933,7 +1079,6 @@ func (tb *qwpTableBuffer) getOrCreateDesignatedTimestamp(typeCode qwpTypeCode) ( tb.columnIndex[dtName] = len(tb.columns) tb.columns = append(tb.columns, col) - tb.schemaId = -1 return col, nil } @@ -957,10 +1102,18 @@ func (tb *qwpTableBuffer) cancelRow() { // Remove columns created during this row. 
if len(tb.columns) > tb.committedColumnCount { for i := tb.committedColumnCount; i < len(tb.columns); i++ { - delete(tb.columnIndex, tb.columns[i].name) + delete(tb.columnIndex, strings.ToLower(tb.columns[i].name)) } tb.columns = tb.columns[:tb.committedColumnCount] - tb.schemaId = -1 + // Drop memoized casing-variant aliases. Each maps to a column + // index that the truncation above can leave dangling (and when + // committedColumnCount==0 after reset, every column is removed). + // Alias keys carry an uppercase letter, so deleting them never + // touches the all-lowercase canonical keys of surviving columns. + for _, k := range tb.aliasKeys { + delete(tb.columnIndex, k) + } + tb.aliasKeys = tb.aliasKeys[:0] } // Truncate any columns that were set during this row. @@ -977,9 +1130,13 @@ func (tb *qwpTableBuffer) cancelRow() { tb.recomputeDataSize() } -// reset clears all row data and columns, retaining the table name. -// Preserves schemaId: between flushes the column set is unchanged, -// so the server's registry entry is still valid. +// reset clears all row data, retaining the table name and the +// column structure. Column-level state (offsets, dictionary deltas) +// is reset by col.reset() per column. The cursor encoder writes a +// full schema + symbol dict for every flush, so no per-table +// "what's been sent" state needs to survive across flushes — +// rowCount and the in-progress bookkeeping are all that needs +// resetting here. func (tb *qwpTableBuffer) reset() { for _, col := range tb.columns { col.reset() diff --git a/qwp_buffer_test.go b/qwp_buffer_test.go index 1a0af4dc..955bb61e 100644 --- a/qwp_buffer_test.go +++ b/qwp_buffer_test.go @@ -995,6 +995,157 @@ func TestQwpTableBufferGetOrCreateColumn(t *testing.T) { } }) + t.Run("CaseInsensitive", func(t *testing.T) { + // QuestDB column names are case-insensitive throughout the + // server stack (LowerCaseCharSequenceIntHashMap), and Java's + // QwpTableBuffer also dedupes case-insensitively. 
Multiple + // case-vary'd writes across rows within one frame must + // resolve to the same buffer column — otherwise the server + // auto-creates parallel columns whose names are equal modulo + // case, corrupting the on-disk metadata. + tb := newQwpTableBuffer("t") + + col1, err := tb.getOrCreateColumn("Pressure_S", qwpTypeDouble, false) + if err != nil { + t.Fatal(err) + } + col1.addDouble(1.5) + tb.commitRow() + + col2, err := tb.getOrCreateColumn("pressure_s", qwpTypeDouble, false) + if err != nil { + t.Fatal(err) + } + if col2 != col1 { + t.Fatal("case-vary'd name should resolve to the same column") + } + if len(tb.columns) != 1 { + t.Fatalf("columns len = %d, want 1 (no parallel case-vary'd column)", len(tb.columns)) + } + // First-seen case wins on the wire (matches Java client). + if col1.name != "Pressure_S" { + t.Fatalf("col.name = %q, want %q (first-seen case preserved)", col1.name, "Pressure_S") + } + col2.addDouble(2.5) + tb.commitRow() + + // Yet a duplicate within the SAME row — even case-vary'd — + // still trips the per-row duplicate guard. + col3, _ := tb.getOrCreateColumn("PRESSURE_S", qwpTypeDouble, false) + col3.addDouble(3.5) + _, err = tb.getOrCreateColumn("pressure_s", qwpTypeDouble, false) + if err == nil { + t.Fatal("expected per-row duplicate error for case-vary'd second write") + } + }) + + t.Run("MixedCaseCursorResync", func(t *testing.T) { + // After a sparse skip forces the sequential cursor off the + // fast path, a map hit resyncs the cursor to idx+1 so the rest + // of the row's columns resolve on the fast path again — even + // when each is written with a different ASCII casing. + tb := newQwpTableBuffer("t") + for i, n := range []string{"Aa", "Bb", "Cc", "Dd"} { + col, err := tb.getOrCreateColumn(n, qwpTypeLong, false) + if err != nil { + t.Fatal(err) + } + col.addLong(int64(i)) + } + tb.commitRow() // cursor reset to 0 + + // "aA" hits the ASCII-fold fast path (cursor 0 → 1). 
+ if _, err := tb.getOrCreateColumn("aA", qwpTypeLong, false); err != nil { + t.Fatal(err) + } + if tb.columnAccessCursor != 1 { + t.Fatalf("cursor = %d after fold-match on idx 0, want 1", tb.columnAccessCursor) + } + // Skip "Bb": "cC" misses the cursor (it points at Bb), resolves + // via the map to column 2, and resyncs the cursor to 3. + c, err := tb.getOrCreateColumn("cC", qwpTypeLong, false) + if err != nil { + t.Fatal(err) + } + if c != tb.columns[2] { + t.Fatal("cC resolved to the wrong column") + } + if tb.columnAccessCursor != 3 { + t.Fatalf("cursor = %d after map hit on idx 2, want 3 (resync)", tb.columnAccessCursor) + } + // "dD" now hits the resynced fast path (cursor 3 → 4). + d, err := tb.getOrCreateColumn("dD", qwpTypeLong, false) + if err != nil { + t.Fatal(err) + } + if d != tb.columns[3] { + t.Fatal("dD resolved to the wrong column") + } + if tb.columnAccessCursor != 4 { + t.Fatalf("cursor = %d after fold-match on idx 3, want 4", tb.columnAccessCursor) + } + if len(tb.columns) != 4 { + t.Fatalf("columns len = %d, want 4 (no parallel case-vary'd columns)", len(tb.columns)) + } + }) + + t.Run("MixedCaseAliasClearedOnCancel", func(t *testing.T) { + // A casing-variant alias memoized into columnIndex maps to a + // column index. cancelRow must drop it when it removes the + // column — otherwise a later lookup dereferences a dangling + // index. The reset() case is the sharp edge: it retains columns + // but zeroes committedColumnCount, so the next cancelRow removes + // every column. + tb := newQwpTableBuffer("t") + c0, err := tb.getOrCreateColumn("Aa", qwpTypeLong, false) + if err != nil { + t.Fatal(err) + } + c0.addLong(1) + tb.commitRow() + + // Force "aA" off the fast path so it resolves via the map and + // memoizes a casing-variant alias of the canonical key "aa". 
+ tb.columnAccessCursor = 1 + if _, err := tb.getOrCreateColumn("aA", qwpTypeLong, false); err != nil { + t.Fatal(err) + } + if _, ok := tb.columnIndex["aA"]; !ok { + t.Fatal("expected memoized alias key \"aA\"") + } + + // reset() keeps the column but zeroes committedColumnCount, so + // the partial row below plus cancelRow wipes every column. + tb.reset() + nb, err := tb.getOrCreateColumn("Bb", qwpTypeLong, false) + if err != nil { + t.Fatal(err) + } + nb.addLong(2) + tb.cancelRow() + + if len(tb.aliasKeys) != 0 { + t.Fatalf("aliasKeys = %v, want empty after wipe-cancel", tb.aliasKeys) + } + if _, ok := tb.columnIndex["aA"]; ok { + t.Fatal("stale alias \"aA\" survived cancelRow that wiped its column") + } + if len(tb.columns) != 0 { + t.Fatalf("columns len = %d, want 0 after wipe-cancel", len(tb.columns)) + } + + // Re-adding by the aliased casing must create a fresh column, + // not index past the emptied slice via the dropped alias. + re, err := tb.getOrCreateColumn("aA", qwpTypeLong, false) + if err != nil { + t.Fatal(err) + } + re.addLong(3) + if tb.columnIndex["aa"] != 0 || tb.columns[0] != re { + t.Fatal("re-added column not registered at the canonical key") + } + }) + t.Run("BackfillOnCreate", func(t *testing.T) { tb := newQwpTableBuffer("t") @@ -1206,46 +1357,6 @@ func TestQwpTableBufferReset(t *testing.T) { } } -func TestQwpTableBufferSchemaId(t *testing.T) { - t.Run("UnassignedByDefault", func(t *testing.T) { - tb := newQwpTableBuffer("t") - if tb.schemaId != -1 { - t.Fatalf("new table schemaId = %d, want -1", tb.schemaId) - } - }) - - t.Run("InvalidatedOnNewColumn", func(t *testing.T) { - tb := newQwpTableBuffer("t") - col, _ := tb.getOrCreateColumn("a", qwpTypeLong, false) - col.addLong(1) - tb.commitRow() - - // Sender would have assigned an ID at this point. 
- tb.schemaId = 7 - - if _, err := tb.getOrCreateColumn("b", qwpTypeDouble, false); err != nil { - t.Fatal(err) - } - if tb.schemaId != -1 { - t.Fatalf("schemaId = %d after column add, want -1", tb.schemaId) - } - }) - - t.Run("PreservedAcrossReset", func(t *testing.T) { - tb := newQwpTableBuffer("t") - col, _ := tb.getOrCreateColumn("a", qwpTypeLong, false) - col.addLong(1) - tb.commitRow() - - tb.schemaId = 3 - tb.reset() - - if tb.schemaId != 3 { - t.Fatalf("schemaId = %d after reset, want 3 (column set unchanged)", tb.schemaId) - } - }) -} - // --- array column buffer tests --- func TestQwpColumnBufferDoubleArray1D(t *testing.T) { @@ -1500,6 +1611,31 @@ func TestQwpColumnBufferArrayNull(t *testing.T) { } }) + t.Run("DoubleArrayNonNullablePanics", func(t *testing.T) { + // The wire format has no inline NULL sentinel for arrays, so + // addNull on a non-nullable array column has no valid + // encoding. The public API never produces this shape (array + // columns are always nullable), so this is purely a guard + // against misuse of the low-level buffer constructor. + c := newQwpColumnBuffer("col", qwpTypeDoubleArray, false) + defer func() { + if r := recover(); r == nil { + t.Fatalf("expected panic, got none") + } + }() + c.addNull() + }) + + t.Run("LongArrayNonNullablePanics", func(t *testing.T) { + c := newQwpColumnBuffer("col", qwpTypeLongArray, false) + defer func() { + if r := recover(); r == nil { + t.Fatalf("expected panic, got none") + } + }() + c.addNull() + }) + t.Run("InterleavedNullAndData", func(t *testing.T) { c := newQwpColumnBuffer("col", qwpTypeDoubleArray, true) c.addDoubleArray(1, []int32{2}, []float64{1.0, 2.0}) // row 0: 21 bytes diff --git a/qwp_constants.go b/qwp_constants.go index 4c5af170..2889a5e2 100644 --- a/qwp_constants.go +++ b/qwp_constants.go @@ -24,13 +24,16 @@ package questdb -import "time" +import ( + "fmt" + "time" +) // qwpTypeCode represents a QWP column type. type qwpTypeCode byte // QWP column type codes. 
Each type has a specific wire encoding -// defined in the QWP v1 protocol specification. +// defined in the QWP protocol specification. const ( qwpTypeBoolean qwpTypeCode = 0x01 // bit-packed, 1 bit per value qwpTypeByte qwpTypeCode = 0x02 // int8, 1 byte @@ -44,7 +47,7 @@ const ( // same wire encoding. Do not reuse this code. qwpTypeSymbol qwpTypeCode = 0x09 // variable, dictionary-encoded qwpTypeTimestamp qwpTypeCode = 0x0A // int64 microseconds, 8 bytes LE - qwpTypeDate qwpTypeCode = 0x0B // int64 milliseconds, 8 bytes LE + qwpTypeDate qwpTypeCode = 0x0B // int64 ms. Asymmetric: ingestion=plain int64; egress=timestamp-ish framing (enc byte + RAW/Gorilla, like qwpTypeTimestamp) qwpTypeUuid qwpTypeCode = 0x0C // 16 bytes (lo then hi, LE) qwpTypeLong256 qwpTypeCode = 0x0D // 32 bytes (four int64s, LE) qwpTypeGeohash qwpTypeCode = 0x0E // varint precision + packed bits @@ -56,15 +59,147 @@ const ( qwpTypeDecimal128 qwpTypeCode = 0x14 // 16 bytes, little-endian unscaled qwpTypeDecimal256 qwpTypeCode = 0x15 // 32 bytes, little-endian unscaled qwpTypeChar qwpTypeCode = 0x16 // UTF-16 code unit, 2 bytes LE + // Decoder-only types: the Go encoder never emits them, but the + // egress `RESULT_BATCH` decoder must handle columns the server + // produces from arbitrary SELECTs (pg_catalog views, IP lookups, + // binary columns, etc.). + qwpTypeBinary qwpTypeCode = 0x17 // variable, offset+data (same layout as VARCHAR) + qwpTypeIPv4 qwpTypeCode = 0x18 // 4 bytes LE, identical to INT +) + +// Exported column-type codes for QwpColumnBatch.ColumnType. Each value +// is the wire-type byte the egress decoder reports for a column; switch +// on ColumnType(col) to choose the matching typed accessor. The values +// mirror the QWP protocol type codes. Decoder-only types (Binary, IPv4) +// are included because a SELECT can surface them even though the encoder +// never emits them. 
+const ( + QwpTypeBoolean = byte(qwpTypeBoolean) + QwpTypeByte = byte(qwpTypeByte) + QwpTypeShort = byte(qwpTypeShort) + QwpTypeInt = byte(qwpTypeInt) + QwpTypeLong = byte(qwpTypeLong) + QwpTypeFloat = byte(qwpTypeFloat) + QwpTypeDouble = byte(qwpTypeDouble) + QwpTypeSymbol = byte(qwpTypeSymbol) + QwpTypeTimestamp = byte(qwpTypeTimestamp) + QwpTypeDate = byte(qwpTypeDate) + QwpTypeUuid = byte(qwpTypeUuid) + QwpTypeLong256 = byte(qwpTypeLong256) + QwpTypeGeohash = byte(qwpTypeGeohash) + QwpTypeVarchar = byte(qwpTypeVarchar) + QwpTypeTimestampNano = byte(qwpTypeTimestampNano) + QwpTypeDoubleArray = byte(qwpTypeDoubleArray) + QwpTypeLongArray = byte(qwpTypeLongArray) + QwpTypeDecimal64 = byte(qwpTypeDecimal64) + QwpTypeDecimal128 = byte(qwpTypeDecimal128) + QwpTypeDecimal256 = byte(qwpTypeDecimal256) + QwpTypeChar = byte(qwpTypeChar) + QwpTypeBinary = byte(qwpTypeBinary) + QwpTypeIPv4 = byte(qwpTypeIPv4) +) + +// qwpMsgKind is the one-byte discriminator at the start of every QWP +// egress payload (spec §5). Ingress DATA_BATCH messages use 0x00; the +// 0x10..0x17 range is reserved for egress request/response kinds. +type qwpMsgKind byte + +const ( + qwpMsgKindDataBatch qwpMsgKind = 0x00 + qwpMsgKindResponse qwpMsgKind = 0x01 + qwpMsgKindQueryRequest qwpMsgKind = 0x10 + qwpMsgKindResultBatch qwpMsgKind = 0x11 + qwpMsgKindResultEnd qwpMsgKind = 0x12 + qwpMsgKindQueryError qwpMsgKind = 0x13 + qwpMsgKindCancel qwpMsgKind = 0x14 + qwpMsgKindCredit qwpMsgKind = 0x15 + qwpMsgKindExecDone qwpMsgKind = 0x16 + // qwpMsgKindCacheReset is a server → client connection-scoped + // cache-reset notification. Body is a single reset_mask byte (see + // qwpResetMask* below) whose bits tell the client which caches to + // discard. Sent between queries when a cache reaches the server's + // configured soft cap; after applying, the next RESULT_BATCH's + // delta-dict deltaStart is expected to line up with a fresh server + // counter. Does not surface to users. 
+ qwpMsgKindCacheReset qwpMsgKind = 0x17 + // qwpMsgKindServerInfo is the unsolicited server → client frame + // the server emits as the first WebSocket frame after the upgrade, + // before any client request. Body (little-endian, after the + // 12-byte QWP header): + // role(u8) + epoch(u64) + capabilities(u32) + server_wall_ns(i64) + // + cluster_id(u16_len + utf8) + node_id(u16_len + utf8). The + // server always emits it post-upgrade; ingest senders simply do + // not read it. The byte 0x18 is also bound to qwpTypeIPv4 in the + // qwpTypeCode enum; no collision since the two are distinct types. + qwpMsgKindServerInfo qwpMsgKind = 0x18 +) + +// SERVER_INFO role byte values (spec §11.8). Mirror Java +// QwpEgressMsgKind.ROLE_*. +const ( + // qwpRoleStandalone marks a node with no replication configured. + // OSS single-node default; behaves like a primary for routing + // purposes and is accepted by target=primary. + qwpRoleStandalone byte = 0x00 + // qwpRolePrimary is the authoritative write node; reads see latest + // commits. + qwpRolePrimary byte = 0x01 + // qwpRoleReplica is read-only and may lag the primary by up to the + // replication poll interval. + qwpRoleReplica byte = 0x02 + // qwpRolePrimaryCatchup signals a promotion in flight; behaves like + // a primary but is still uploading in-flight segments. Accepted by + // target=primary. + qwpRolePrimaryCatchup byte = 0x03 +) + +// Exported SERVER_INFO role codes for QwpServerInfo.Role. Compare Role +// against these or call QwpServerInfo.RoleName for a human-readable form. +const ( + QwpRoleStandalone = qwpRoleStandalone + QwpRolePrimary = qwpRolePrimary + QwpRoleReplica = qwpRoleReplica + QwpRolePrimaryCatchup = qwpRolePrimaryCatchup +) + +// Bit flags carried in the reset_mask byte of a CACHE_RESET frame. +// Mirrors the Java QwpEgressMsgKind.RESET_MASK_* constants. +const ( + // qwpResetMaskDict clears the connection-scoped SYMBOL dict. 
After + // applying, the next RESULT_BATCH's delta section must start at + // deltaStart=0 — i.e. the server has also reset its dict to empty. + qwpResetMaskDict byte = 0x01 ) // qwpMagic is the 4-byte magic at the start of every QWP message. // Stored as a uint32 in little-endian byte order: "QWP1". const qwpMagic uint32 = 0x31505751 -// qwpVersion is the current protocol version. +// qwpVersion is the sole QWP protocol version. It is stamped into the +// 12-byte header of every frame this client encodes and advertised +// verbatim in the X-QWP-Max-Version handshake header on both the +// ingest and egress paths. The server echoes min(server_max, +// client_max) back as X-QWP-Version; decoders then enforce strict +// equality between every server frame's header version byte and the +// negotiated version (spec §3). The negotiation mechanism is retained +// so a future version bump has somewhere to grow, but today exactly +// one version exists. const qwpVersion byte = 0x01 +// qwpCapZone is the CAP_ZONE bit in SERVER_INFO.capabilities. When +// set, the server's SERVER_INFO frame carries an additional +// zone_id string after node_id; clients use it to drive the +// failover.md §2 zone-tier classification (Same / Unknown / Other). +// Absent CAP_ZONE leaves the host's zone tier at Unknown, which +// PickNext treats as a middle-priority bucket between Same and +// Other. +const qwpCapZone uint32 = 1 << 0 + +// QwpCapZone is the exported CAP_ZONE bit for QwpServerInfo.Capabilities. +// When set, the server advertised a zone_id (surfaced as +// QwpServerInfo.ZoneId). +const QwpCapZone = qwpCapZone + // QWP message header layout. const ( qwpHeaderSize = 12 @@ -77,26 +212,26 @@ const ( const ( qwpFlagGorilla byte = 0x04 // Gorilla timestamp encoding qwpFlagDeltaSymbolDict byte = 0x08 // delta symbol dictionary + qwpFlagZstd byte = 0x10 // payload after prelude is zstd-compressed (egress only) ) -// qwpSchemaMode values control how column schema is transmitted. 
-type qwpSchemaMode byte - -const ( - qwpSchemaModeFull qwpSchemaMode = 0x00 // full column definitions - qwpSchemaModeReference qwpSchemaMode = 0x01 // reference a schema already registered by ID -) - -// qwpStatusCode represents a server response status. -type qwpStatusCode byte +// QwpStatusCode represents a server response status. The byte value is +// stable on the QWP wire and is preserved on SenderError.ServerStatusByte +// for cross-language debugging; the recommended way to discriminate +// rejections is the higher-level Category enum. +type QwpStatusCode byte const ( - qwpStatusOK qwpStatusCode = 0x00 // batch accepted - qwpStatusSchemaMismatch qwpStatusCode = 0x03 // column type incompatible with existing table - qwpStatusParseError qwpStatusCode = 0x05 // malformed message - qwpStatusInternalError qwpStatusCode = 0x06 // server-side error - qwpStatusSecurityError qwpStatusCode = 0x08 // authorization failure - qwpStatusWriteError qwpStatusCode = 0x09 // write failure (e.g., table not accepting writes) + QwpStatusOK QwpStatusCode = 0x00 // batch accepted + QwpStatusDurableAck QwpStatusCode = 0x02 // per-table durable-upload ACK (replication primaries opted-in) + QwpStatusSchemaMismatch QwpStatusCode = 0x03 // column type incompatible with existing table + QwpStatusParseError QwpStatusCode = 0x05 // malformed message + QwpStatusInternalError QwpStatusCode = 0x06 // server-side error + QwpStatusSecurityError QwpStatusCode = 0x08 // authorization failure + QwpStatusWriteError QwpStatusCode = 0x09 // write failure (e.g., table not accepting writes) + // Egress-specific status codes (spec §15). + qwpStatusCancelled QwpStatusCode = 0x0A // query terminated in response to CANCEL + qwpStatusLimitExceeded QwpStatusCode = 0x0B // a protocol limit was hit ) // QWP sender defaults and limits. @@ -113,19 +248,37 @@ const ( // Java: QwpWebSocketSender.DEFAULT_AUTO_FLUSH_ROWS = 1_000. 
qwpDefaultAutoFlushRows = 1_000 - // qwpDefaultInFlightWindow is the default maximum number of batches - // that may be outstanding (unacked) in async mode. - // Java: QwpWebSocketSender.DEFAULT_IN_FLIGHT_WINDOW_SIZE = 128. - qwpDefaultInFlightWindow = 128 - - // qwpDefaultMaxSchemasPerConnection caps the schema cache per - // connection; callers may recycle the connection on overflow. - // Java: QwpWebSocketSender.DEFAULT_MAX_SCHEMAS_PER_CONNECTION = 65_535. - qwpDefaultMaxSchemasPerConnection = 65_535 + // qwpDefaultAutoFlushBytes is the byte-size trigger for auto-flush. + // connect-string.md §Auto-flushing: "Default where supported: `8m` + // (8 MiB)". Mirrors Java's DEFAULT_AUTO_FLUSH_BYTES. The effective + // threshold the sender compares pendingBytes against is clamped + // down to 90% of two limits — see qwpLineSender.applyServerBatchSizeLimit: + // - the server-advertised X-QWP-Max-Batch-Size, re-evaluated on + // every successful connect (initial bind and every reconnect); and + // - the per-segment frame cap (maxFrameBytes), fixed at construction + // from the cursor engine's segment size. Without this term the + // shipped defaults (8 MiB trigger over a 4 MiB segment) would let + // a batch grow past what a segment can hold and wedge on flush. + // The clamp only reduces: a configured value below both caps is kept + // as-is, and an explicit user opt-out (auto_flush_bytes=off / =0) is + // preserved even when a cap applies. + // + // Hard guards back the soft clamp in enqueueCursor / atWithTimestamp, + // rejecting or splitting with a typed error before an over-cap frame + // leaves the process: a per-row guard (any single row above the + // server cap) rejects at At() time, and a flush-time cap check + // (against the server cap and the per-segment frame cap) re-encodes + // the batch one table per frame, flushing every table that fits on + // its own and dropping only a table that is individually over-cap. 
+ // Both fire even when the user opted out of byte-size auto-flush. + qwpDefaultAutoFlushBytes = 8 * 1024 * 1024 - // qwpDefaultInitEncoderBufSize is the initial encoder buffer size. - // Java: QwpWebSocketSender.DEFAULT_BUFFER_SIZE = 8192. - qwpDefaultInitEncoderBufSize = 8 * 1024 // 8 KB + // qwpDefaultInFlightWindow seeds in_flight_window for Java-parity + // config compatibility. The cursor architecture ignores it — + // backpressure is governed by the engine's segment ring and append + // deadline (see WithInFlightWindow). Java: + // QwpWebSocketSender.DEFAULT_IN_FLIGHT_WINDOW_SIZE = 128. + qwpDefaultInFlightWindow = 128 // qwpDefaultMicrobatchBufSize is the per-encoder microbatch buffer // size used to coalesce rows before a WebSocket frame is sent. @@ -136,6 +289,19 @@ const ( // client does not enforce a hard cap. qwpMaxColumnsPerTable = 2048 + // qwpMaxBindsPerQuery caps bind parameters per QUERY_REQUEST. + // Spec §16. The server enforces this independently; the client-side + // preflight surfaces a typed error before bytes leave the process. + // Distinct from qwpMaxColumnsPerTable (an ingest concept) — egress + // QUERY_REQUEST and ingest DATA_BATCH have independent limits. + qwpMaxBindsPerQuery = 1024 + + // qwpMaxSqlTextBytes caps the UTF-8 byte length of the sql_bytes + // field in a QUERY_REQUEST. Spec §16 pins this at 1 MiB. The server + // enforces this independently; the client-side preflight produces a + // friendlier error and avoids serializing a doomed payload. + qwpMaxSqlTextBytes = 1 << 20 + // qwpMaxTablesPerBatch is the hard upper bound on distinct tables // in a single QWP message: the wire format encodes the table count // as uint16. @@ -145,6 +311,83 @@ const ( // receive buffer. Go-only; the Java client manages the read path // differently and has no direct counterpart. qwpDefaultInitRecvBufSize = 64 * 1024 // 64 KB + + // Hardening caps used by the egress `RESULT_BATCH` decoder. 
Match + // the Java reference decoder (QwpResultBatchDecoder.java) so hostile + // or buggy server frames that advertise out-of-range dimensions are + // rejected before any large allocation. + + // qwpMaxBatchSize is the headline protocol cap on a single + // RESULT_BATCH frame's wire size, in bytes. Spec §14 "Protocol + // Limits" pins this at 16 MiB; the Java server enforces the same + // value via QwpConstants.DEFAULT_MAX_BATCH_SIZE. Acts as a direct + // upper bound checked before per-section bounds (row count, column + // count, dict heap, zstd content size) come into play — those + // remain as defense-in-depth, but the single cap is the spec-level + // limit a conformant server stays under. + qwpMaxBatchSize = 16 * 1024 * 1024 + qwpMaxRowsPerBatch = 1_048_576 // per-batch row cap + qwpMaxTableNameLen = 127 // UTF-8 bytes + qwpMaxColumnNameLen = 127 // UTF-8 bytes + qwpMaxArrayNDims = 32 // max array dimensionality; matches Java reference + // qwpMaxArrayElements caps the element count of a single ARRAY cell + // so that element-count * 8 (element stride) plus the per-row shape + // header (up to qwpMaxArrayNDims * 4 bytes) together stay inside + // int32. The 1024-byte slack covers that shape header. + qwpMaxArrayElements = (1<<31 - 1 - 1024) / 8 + + // qwpMaxCellsPerBatch caps the declared cell count (row_count × + // column_count) of one RESULT_BATCH. The decoder materialises a + // row-indexed scratch array — rowCount entries wide — for every + // column that carries nulls (nonNullIdx) and for every SYMBOL + // (symbolRowIds) and ARRAY (arrayRowStart + arrayElems) column, so a + // single column costs 4..12 bytes of heap per row. An all-null column + // is nearly free on the wire — a rowCount/8 null bitmap that + // zstd-compresses to almost nothing — yet still forces that full + // rowCount-sized allocation: a 32–96× amplification. 
A frame packed + // with such columns up to the decompressed-frame cap would otherwise + // drive multi-GiB transient `make`s. A conformant server spends at + // least one wire byte per cell, so a legitimate batch never declares + // more cells than its maximum possible decompressed byte size. Tying + // the cap to qwpZstdMaxDecompressedSize rejects amplified frames up + // front — before the per-column loop sizes any index array — while + // clearing every batch a real server emits. + qwpMaxCellsPerBatch = qwpZstdMaxDecompressedSize + + // qwpReadLimitSlack is headroom added on top of qwpMaxBatchSize when + // arming the WebSocket read limit. coder/websocket's limitReader + // trips ErrMessageTooBig the moment its byte budget reaches zero — + // before the terminal io.EOF is delivered — so a legitimate frame of + // exactly qwpMaxBatchSize would be rejected without this margin (the + // library applies the same +1 trick to its own default limit). The + // band between qwpMaxBatchSize and the limit is never a valid frame: + // the egress decoder rejects RESULT_BATCH payloads > qwpMaxBatchSize, + // and every ACK / SERVER_INFO frame is far smaller. + qwpReadLimitSlack = 4096 + + // qwpMaxFrameReadLimit is the hard ceiling on a single inbound + // WebSocket message. Egress RESULT_BATCH / SERVER_INFO and ingest + // ACK frames share one connection, so this single cap covers both. + // Armed via Conn.SetReadLimit so a hostile or buggy server cannot + // OOM the host with a multi-GB frame: the limit is enforced *during* + // the streamed read, before the whole message is resident, rather + // than only after — qwpMaxBatchSize alone is checked post-assembly + // by the decoder and not at all on the readAck path. It also caps + // qwpReadFrameInto's buffer doubling as defense-in-depth. + qwpMaxFrameReadLimit = qwpMaxBatchSize + qwpReadLimitSlack + + // qwpMaxConnDictHeapBytes caps the connection-scoped SYMBOL dict + // UTF-8 heap at 256 MiB. 
Servers that approach this cap are + // expected to emit CACHE_RESET; crossing it without a reset is a + // misbehaving (or hostile) server. Below uint32 max so the + // uint32 offsets stored on each entry cannot wrap. Mirrors Java + // QwpResultBatchDecoder.MAX_CONN_DICT_HEAP_BYTES. + qwpMaxConnDictHeapBytes = 256 * 1024 * 1024 + + // qwpMaxConnDictSize caps the connection-scoped SYMBOL dict entry + // count. Mirrors Java QwpResultBatchDecoder.MAX_CONN_DICT_SIZE + // (2^23) — same defensive intent as the heap cap. + qwpMaxConnDictSize = 8_388_608 ) // qwpFixedTypeSize returns the per-value size in bytes for fixed-width @@ -158,7 +401,7 @@ func qwpFixedTypeSize(tc qwpTypeCode) int { return 1 case qwpTypeShort, qwpTypeChar: return 2 - case qwpTypeInt, qwpTypeFloat: + case qwpTypeInt, qwpTypeFloat, qwpTypeIPv4: return 4 case qwpTypeLong, qwpTypeDouble, qwpTypeTimestamp, qwpTypeDate, qwpTypeTimestampNano, qwpTypeDecimal64: @@ -167,10 +410,76 @@ func qwpFixedTypeSize(tc qwpTypeCode) int { return 16 case qwpTypeLong256, qwpTypeDecimal256: return 32 - case qwpTypeSymbol, qwpTypeVarchar, + case qwpTypeSymbol, qwpTypeVarchar, qwpTypeBinary, qwpTypeGeohash, qwpTypeDoubleArray, qwpTypeLongArray: return -1 // variable-width default: return -1 } } + +// qwpIsArrayType reports whether tc is one of the N-dimensional array +// types (DOUBLE_ARRAY / LONG_ARRAY). The array accessors index the +// decoder's per-array arrayRowStart / arrayElems side tables, which the +// decoder populates only for these two types; the bulk array accessors +// guard on this so a mis-typed call panics with a clear message rather +// than an opaque slice-bounds error. +func qwpIsArrayType(tc qwpTypeCode) bool { + return tc == qwpTypeDoubleArray || tc == qwpTypeLongArray +} + +// qwpTypeName returns the protocol name of a wire type for diagnostics +// and panic messages. Unknown codes — including 0x08, the removed +// TYPE_STRING — render as their hex byte. 
+func qwpTypeName(tc qwpTypeCode) string { + switch tc { + case qwpTypeBoolean: + return "BOOLEAN" + case qwpTypeByte: + return "BYTE" + case qwpTypeShort: + return "SHORT" + case qwpTypeInt: + return "INT" + case qwpTypeLong: + return "LONG" + case qwpTypeFloat: + return "FLOAT" + case qwpTypeDouble: + return "DOUBLE" + case qwpTypeSymbol: + return "SYMBOL" + case qwpTypeTimestamp: + return "TIMESTAMP" + case qwpTypeDate: + return "DATE" + case qwpTypeUuid: + return "UUID" + case qwpTypeLong256: + return "LONG256" + case qwpTypeGeohash: + return "GEOHASH" + case qwpTypeVarchar: + return "VARCHAR" + case qwpTypeTimestampNano: + return "TIMESTAMP_NANOS" + case qwpTypeDoubleArray: + return "DOUBLE_ARRAY" + case qwpTypeLongArray: + return "LONG_ARRAY" + case qwpTypeDecimal64: + return "DECIMAL64" + case qwpTypeDecimal128: + return "DECIMAL128" + case qwpTypeDecimal256: + return "DECIMAL256" + case qwpTypeChar: + return "CHAR" + case qwpTypeBinary: + return "BINARY" + case qwpTypeIPv4: + return "IPv4" + default: + return fmt.Sprintf("0x%02x", byte(tc)) + } +} diff --git a/qwp_constants_test.go b/qwp_constants_test.go index 9b07309c..4429d65a 100644 --- a/qwp_constants_test.go +++ b/qwp_constants_test.go @@ -33,12 +33,6 @@ import ( // implementations stay in lockstep on wire-protocol constants. func TestQwpMagicBytesValue(t *testing.T) { - // "QWP1" in ASCII: Q=0x51, W=0x57, P=0x50, 1=0x31 - // Stored as uint32 in little-endian: 0x31505751 - if qwpMagic != 0x31505751 { - t.Fatalf("qwpMagic = 0x%08X, want 0x31505751", qwpMagic) - } - var buf [4]byte binary.LittleEndian.PutUint32(buf[:], qwpMagic) if buf != [4]byte{'Q', 'W', 'P', '1'} { @@ -46,68 +40,21 @@ func TestQwpMagicBytesValue(t *testing.T) { } } -func TestQwpHeaderSize(t *testing.T) { - if qwpHeaderSize != 12 { - t.Fatalf("qwpHeaderSize = %d, want 12", qwpHeaderSize) - } -} - -func TestQwpHeaderFieldOffsets(t *testing.T) { - // Magic occupies offsets [0..4), version at 4. 
Then flags, table - // count, payload length at documented offsets. - if qwpHeaderOffsetFlags != 5 { - t.Fatalf("qwpHeaderOffsetFlags = %d, want 5", qwpHeaderOffsetFlags) - } - if qwpHeaderOffsetTableCount != 6 { - t.Fatalf("qwpHeaderOffsetTableCount = %d, want 6", qwpHeaderOffsetTableCount) - } - if qwpHeaderOffsetPayloadLen != 8 { - t.Fatalf("qwpHeaderOffsetPayloadLen = %d, want 8", qwpHeaderOffsetPayloadLen) - } -} - -func TestQwpVersion(t *testing.T) { - if qwpVersion != 0x01 { - t.Fatalf("qwpVersion = 0x%02X, want 0x01", qwpVersion) - } -} - -func TestQwpFlagBitPositions(t *testing.T) { - if qwpFlagGorilla != 0x04 { - t.Fatalf("qwpFlagGorilla = 0x%02X, want 0x04", qwpFlagGorilla) - } - if qwpFlagDeltaSymbolDict != 0x08 { - t.Fatalf("qwpFlagDeltaSymbolDict = 0x%02X, want 0x08", qwpFlagDeltaSymbolDict) - } - // Flags are independent bits, so OR'ing them yields both set. - if qwpFlagGorilla&qwpFlagDeltaSymbolDict != 0 { - t.Fatalf("flag bits overlap: gorilla=0x%02X, deltaDict=0x%02X", - qwpFlagGorilla, qwpFlagDeltaSymbolDict) - } -} - -func TestQwpSchemaModes(t *testing.T) { - if qwpSchemaModeFull != 0x00 { - t.Fatalf("qwpSchemaModeFull = 0x%02X, want 0x00", qwpSchemaModeFull) - } - if qwpSchemaModeReference != 0x01 { - t.Fatalf("qwpSchemaModeReference = 0x%02X, want 0x01", qwpSchemaModeReference) - } -} - func TestQwpStatusCodes(t *testing.T) { // ACK status codes the server emits. These must match the Java - // reference so QwpError classification stays correct. + // reference so SenderError classification stays correct. 
cases := []struct { - code qwpStatusCode + code QwpStatusCode want byte }{ - {qwpStatusOK, 0x00}, - {qwpStatusSchemaMismatch, 0x03}, - {qwpStatusParseError, 0x05}, - {qwpStatusInternalError, 0x06}, - {qwpStatusSecurityError, 0x08}, - {qwpStatusWriteError, 0x09}, + {QwpStatusOK, 0x00}, + {QwpStatusSchemaMismatch, 0x03}, + {QwpStatusParseError, 0x05}, + {QwpStatusInternalError, 0x06}, + {QwpStatusSecurityError, 0x08}, + {QwpStatusWriteError, 0x09}, + {qwpStatusCancelled, 0x0A}, + {qwpStatusLimitExceeded, 0x0B}, } for _, c := range cases { if byte(c.code) != c.want { @@ -145,6 +92,8 @@ func TestQwpTypeCodes(t *testing.T) { {qwpTypeDecimal128, 0x14}, {qwpTypeDecimal256, 0x15}, {qwpTypeChar, 0x16}, + {qwpTypeBinary, 0x17}, + {qwpTypeIPv4, 0x18}, } for _, c := range cases { if byte(c.tc) != c.want { @@ -153,6 +102,58 @@ func TestQwpTypeCodes(t *testing.T) { } } +func TestQwpOpTypeCodes(t *testing.T) { + // Exported ExecResult.OpType codes, pinned to the server's + // CompiledQuery.TYPE_* discriminators. These are a cross-protocol + // wire contract (PG wire + QWP EXEC_DONE), so a drift here is a + // client/server mismatch, not a cosmetic change. + cases := []struct { + op byte + want byte + }{ + {QwpOpTypeInsert, 2}, + {QwpOpTypeTruncate, 3}, + {QwpOpTypeAlter, 4}, + {QwpOpTypeDrop, 7}, + {QwpOpTypeCreateTable, 9}, + {QwpOpTypeInsertAsSelect, 10}, + {QwpOpTypeRenameTable, 12}, + {QwpOpTypeUpdate, 14}, + {QwpOpTypeCreateTableAsSelect, 21}, + } + for _, c := range cases { + if c.op != c.want { + t.Errorf("op type %d, want %d", c.op, c.want) + } + } +} + +func TestQwpMsgKinds(t *testing.T) { + // Egress message-kind discriminators (spec §5). Values here are + // the wire bytes the egress server sends and the Go client must + // dispatch on; they must match the Java QwpEgressMsgKind constants. 
+ cases := []struct { + kind qwpMsgKind + want byte + }{ + {qwpMsgKindDataBatch, 0x00}, + {qwpMsgKindResponse, 0x01}, + {qwpMsgKindQueryRequest, 0x10}, + {qwpMsgKindResultBatch, 0x11}, + {qwpMsgKindResultEnd, 0x12}, + {qwpMsgKindQueryError, 0x13}, + {qwpMsgKindCancel, 0x14}, + {qwpMsgKindCredit, 0x15}, + {qwpMsgKindExecDone, 0x16}, + {qwpMsgKindCacheReset, 0x17}, + } + for _, c := range cases { + if byte(c.kind) != c.want { + t.Errorf("msg kind 0x%02X, want 0x%02X", byte(c.kind), c.want) + } + } +} + func TestQwpFixedTypeSize(t *testing.T) { cases := []struct { tc qwpTypeCode @@ -164,6 +165,7 @@ func TestQwpFixedTypeSize(t *testing.T) { {qwpTypeChar, 2}, {qwpTypeInt, 4}, {qwpTypeFloat, 4}, + {qwpTypeIPv4, 4}, {qwpTypeLong, 8}, {qwpTypeDouble, 8}, {qwpTypeTimestamp, 8}, @@ -177,6 +179,7 @@ func TestQwpFixedTypeSize(t *testing.T) { // Variable-width types report -1. {qwpTypeSymbol, -1}, {qwpTypeVarchar, -1}, + {qwpTypeBinary, -1}, {qwpTypeGeohash, -1}, {qwpTypeDoubleArray, -1}, {qwpTypeLongArray, -1}, @@ -188,35 +191,108 @@ func TestQwpFixedTypeSize(t *testing.T) { } } -func TestQwpMaxTablesPerBatch(t *testing.T) { - // The table count field in the header is a uint16, so the max - // addressable tables per batch is 0xFFFF. - if qwpMaxTablesPerBatch != 0xFFFF { - t.Fatalf("qwpMaxTablesPerBatch = %d, want 65535", qwpMaxTablesPerBatch) +func TestQwpLongNullSentinel(t *testing.T) { + // Int64 MinInt64 as uint64 — used as the null sentinel for + // non-nullable LONG/TIMESTAMP/DATE/UUID/LONG256 columns. + if qwpLongNull != 0x8000000000000000 { + t.Fatalf("qwpLongNull = 0x%016X, want 0x8000000000000000", qwpLongNull) + } +} + +func TestQwpFlagBitPositions(t *testing.T) { + // Header flag bits. Drift here is a wire-format break — the + // server uses these exact bits to signal Gorilla / delta-dict / + // zstd payload encoding. Mirrors Java's QwpConstantsTest + // testFlagBitPositions. 
+ if qwpFlagGorilla != 0x04 { + t.Errorf("qwpFlagGorilla = 0x%02X, want 0x04", qwpFlagGorilla) + } + if qwpFlagDeltaSymbolDict != 0x08 { + t.Errorf("qwpFlagDeltaSymbolDict = 0x%02X, want 0x08", qwpFlagDeltaSymbolDict) + } + // qwpFlagZstd is Go-side specific (the egress server uses it for + // RESULT_BATCH compression). Pinned to catch silent drift. + if qwpFlagZstd != 0x10 { + t.Errorf("qwpFlagZstd = 0x%02X, want 0x10", qwpFlagZstd) + } +} + +func TestQwpHeaderSize(t *testing.T) { + // 12-byte header: 4 magic + 1 version + 1 flags + 2 table-count + // + 4 payload-length. Drift here means the encoder and the + // decoder won't agree on where the payload starts. Mirrors + // Java's QwpConstantsTest testHeaderSize. + if qwpHeaderSize != 12 { + t.Errorf("qwpHeaderSize = %d, want 12", qwpHeaderSize) + } + // Pin the offsets the decoder actually reaches into too — a + // reorganised header that kept the size but moved the flags or + // payload-length fields would slip past the size check above. + if qwpHeaderOffsetFlags != 5 { + t.Errorf("qwpHeaderOffsetFlags = %d, want 5", qwpHeaderOffsetFlags) + } + if qwpHeaderOffsetPayloadLen != 8 { + t.Errorf("qwpHeaderOffsetPayloadLen = %d, want 8", qwpHeaderOffsetPayloadLen) + } } func TestQwpMaxColumnsPerTable(t *testing.T) { - // Matches QwpConstants.MAX_COLUMNS_PER_TABLE in the server. + // Mirrors Java's QwpConstantsTest testMaxColumnsPerTable. if qwpMaxColumnsPerTable != 2048 { - t.Fatalf("qwpMaxColumnsPerTable = %d, want 2048", qwpMaxColumnsPerTable) + t.Errorf("qwpMaxColumnsPerTable = %d, want 2048", qwpMaxColumnsPerTable) } } -func TestQwpTimestampEncodingFlags(t *testing.T) { - // Per-column timestamp encoding flag byte values (QWP spec §12). - if qwpTsEncodingUncompressed != 0x00 { - t.Fatalf("qwpTsEncodingUncompressed = 0x%02X, want 0x00", qwpTsEncodingUncompressed) +func TestQwpMaxBindsPerQuery(t *testing.T) { + // Pinned by spec §16 (max bind parameters per QUERY_REQUEST).
+ if qwpMaxBindsPerQuery != 1024 { + t.Errorf("qwpMaxBindsPerQuery = %d, want 1024", qwpMaxBindsPerQuery) } - if qwpTsEncodingGorilla != 0x01 { - t.Fatalf("qwpTsEncodingGorilla = 0x%02X, want 0x01", qwpTsEncodingGorilla) +} + +func TestQwpMaxSqlTextBytes(t *testing.T) { + // Pinned by spec §16 (max SQL text length: 1 MiB UTF-8 bytes). + if qwpMaxSqlTextBytes != 1024*1024 { + t.Errorf("qwpMaxSqlTextBytes = %d, want %d", qwpMaxSqlTextBytes, 1024*1024) } } -func TestQwpLongNullSentinel(t *testing.T) { - // Int64 MinInt64 as uint64 — used as the null sentinel for - // non-nullable LONG/TIMESTAMP/DATE/UUID/LONG256 columns. - if qwpLongNull != 0x8000000000000000 { - t.Fatalf("qwpLongNull = 0x%016X, want 0x8000000000000000", qwpLongNull) +func TestQwpMaxBatchSize(t *testing.T) { + // Pinned by spec §14 "Protocol Limits": Max batch size 16 MB. + // Mirrors Java QwpConstants.DEFAULT_MAX_BATCH_SIZE. + if qwpMaxBatchSize != 16*1024*1024 { + t.Errorf("qwpMaxBatchSize = %d, want %d", qwpMaxBatchSize, 16*1024*1024) + } +} + +func TestQwpIsFixedWidthType(t *testing.T) { + // Go has no isFixedWidth() boolean — the same information is + // encoded in qwpFixedTypeSize (>= 0 for fixed, -1 for variable). + // Mirrors Java's QwpConstantsTest testIsFixedWidthType: the + // classification is a wire-format invariant (fixed-width types + // pack into the data section without offsets, variable-width + // types carry a (nonNullCount+1)*4 offset table or a custom + // per-cell layout). 
+ fixed := []qwpTypeCode{ + qwpTypeBoolean, qwpTypeByte, qwpTypeShort, qwpTypeChar, + qwpTypeInt, qwpTypeLong, qwpTypeFloat, qwpTypeDouble, + qwpTypeTimestamp, qwpTypeTimestampNano, qwpTypeDate, + qwpTypeUuid, qwpTypeLong256, + qwpTypeDecimal64, qwpTypeDecimal128, qwpTypeDecimal256, + qwpTypeIPv4, + } + for _, tc := range fixed { + if qwpFixedTypeSize(tc) < 0 { + t.Errorf("qwpFixedTypeSize(0x%02X) = -1; expected fixed-width type", byte(tc)) + } + } + variable := []qwpTypeCode{ + qwpTypeSymbol, qwpTypeGeohash, qwpTypeVarchar, qwpTypeBinary, + qwpTypeDoubleArray, qwpTypeLongArray, + } + for _, tc := range variable { + if qwpFixedTypeSize(tc) != -1 { + t.Errorf("qwpFixedTypeSize(0x%02X) = %d; expected -1 (variable-width)", byte(tc), qwpFixedTypeSize(tc)) + } } } diff --git a/qwp_cursor_bounds_check_fuzz_test.go b/qwp_cursor_bounds_check_fuzz_test.go new file mode 100644 index 00000000..c9fc4d64 --- /dev/null +++ b/qwp_cursor_bounds_check_fuzz_test.go @@ -0,0 +1,319 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ******************************************************************************/ + +package questdb + +// Go port of QuestDB's QwpCursorBoundsCheckFuzzTest. Generates valid QWP +// egress (RESULT_BATCH) messages with random schemas/rows/types, then: +// +// - truncates them at every byte position, and +// - corrupts random bytes, +// +// asserting the decoder rejects bad input with an error and NEVER panics +// (index-out-of-range / nil-deref are the Go analogue of the Java test's +// "unexpected NPE/AIOOBE = missing bounds check"). The decoder is +// error-only by design, so a panic anywhere is a real validation gap. +// +// This test is pure and server-free (it drives (*qwpQueryDecoder).decode +// directly), so it carries no //go:build tag and runs under the normal +// `go test ./...` in build.yml as well as the qwp-fuzz workflow. +// +// Faithful divergences from the Java source: +// - The Java test builds 1-3 *tables* (ingress-style cursor); the Go +// egress decoder is single-table per RESULT_BATCH, so we generate one +// table with 1-6 columns. +// - SYMBOL is excluded: the Go egress decoder requires a +// connection-scoped delta symbol dictionary (FLAG_DELTA_SYMBOL_DICT), +// which is out of scope for a single stateless decode call and is +// already covered by the decoder hardening tests. +// - DATE is excluded: it is asymmetric on the wire (ingestion = plain +// int64; egress = timestamp-ish framing), so the ingestion encoder +// used here cannot synthesise a valid egress DATE column. Egress +// DATE decode is covered by TestQwpDecoderEgressDate. +// - We build the valid seed message with the real encoder rather than a +// hand-rolled byte writer, so "valid" is guaranteed valid for the Go +// decoder; rows are 1-19 (the 0-row/0-col degenerate frame is pinned +// separately by TestQwpDecoderHardening). 
+ +import ( + "math" + "math/rand" + "runtime/debug" + "strconv" + "testing" +) + +const ( + boundsFuzzIterations = 50 + boundsCorruptionsPerMsg = 30 +) + +// boundsCandidateTypes is the Java FUZZABLE_TYPES set minus SYMBOL and +// DATE (see file header for why). +var boundsCandidateTypes = []qwpTypeCode{ + qwpTypeBoolean, qwpTypeByte, qwpTypeShort, qwpTypeInt, qwpTypeLong, + qwpTypeFloat, qwpTypeDouble, qwpTypeTimestamp, qwpTypeTimestampNano, + qwpTypeUuid, qwpTypeLong256, qwpTypeChar, qwpTypeVarchar, qwpTypeGeohash, + qwpTypeDecimal64, qwpTypeDecimal128, qwpTypeDecimal256, qwpTypeDoubleArray, +} + +// boundsAdderFor returns a per-column value generator for code. DECIMAL +// scale and GEOHASH precision are chosen ONCE here and captured, because +// the wire format pins them per column — the encoder rejects a row whose +// scale/precision differs from the column's established value. Everything +// else may vary freely per row. +func boundsAdderFor(t *testing.T, code qwpTypeCode, r *rand.Rand) func(*qwpColumnBuffer, *rand.Rand) { + switch code { + case qwpTypeBoolean: + return func(c *qwpColumnBuffer, r *rand.Rand) { c.addBool(r.Intn(2) == 0) } + case qwpTypeByte: + return func(c *qwpColumnBuffer, r *rand.Rand) { c.addByte(int8(r.Uint32())) } + case qwpTypeShort: + return func(c *qwpColumnBuffer, r *rand.Rand) { c.addShort(int16(r.Uint32())) } + case qwpTypeInt: + return func(c *qwpColumnBuffer, r *rand.Rand) { c.addInt32(int32(r.Uint32())) } + case qwpTypeLong: + return func(c *qwpColumnBuffer, r *rand.Rand) { c.addLong(int64(r.Uint64())) } + case qwpTypeFloat: + return func(c *qwpColumnBuffer, r *rand.Rand) { c.addFloat32(math.Float32frombits(r.Uint32())) } + case qwpTypeDouble: + return func(c *qwpColumnBuffer, r *rand.Rand) { c.addDouble(math.Float64frombits(r.Uint64())) } + case qwpTypeTimestamp, qwpTypeDate, qwpTypeTimestampNano: + return func(c *qwpColumnBuffer, r *rand.Rand) { c.addTimestamp(int64(r.Uint64())) } + case qwpTypeUuid: + return func(c 
*qwpColumnBuffer, r *rand.Rand) { c.addUuid(r.Uint64(), r.Uint64()) } + case qwpTypeLong256: + return func(c *qwpColumnBuffer, r *rand.Rand) { + c.addLong256(r.Uint64(), r.Uint64(), r.Uint64(), r.Uint64()) + } + case qwpTypeChar: + return func(c *qwpColumnBuffer, r *rand.Rand) { c.addChar(rune(0x20 + r.Intn(95))) } + case qwpTypeVarchar: + return func(c *qwpColumnBuffer, r *rand.Rand) { c.addString(boundsRandASCII(r)) } + case qwpTypeGeohash: + prec := int8(1 + r.Intn(60)) // fixed per column, 1-60 bits + return func(c *qwpColumnBuffer, r *rand.Rand) { + v := r.Uint64() & ((uint64(1) << uint(prec)) - 1) + if err := c.addGeohash(v, prec); err != nil { + t.Fatalf("addGeohash(prec=%d): %v", prec, err) + } + } + case qwpTypeDecimal64, qwpTypeDecimal128, qwpTypeDecimal256: + scale := uint32(r.Intn(11)) // fixed per column + return func(c *qwpColumnBuffer, r *rand.Rand) { + // <= 16 digits keeps the unscaled value inside DECIMAL64's + // 18-digit precision (and trivially 128/256); the value is + // irrelevant to a bounds fuzz, only frame validity. + u := r.Int63n(1_000_000_000_000_000) + if r.Intn(2) == 0 { + u = -u + } + if err := c.addDecimal(NewDecimalFromInt64(u, scale)); err != nil { + t.Fatalf("addDecimal(scale=%d): %v", scale, err) + } + } + case qwpTypeDoubleArray: + return func(c *qwpColumnBuffer, r *rand.Rand) { + // 1-3 elements: the Go decoder rejects a 0-length dim + // ("ARRAY dim 0 must be >= 1"), so a valid seed needs >= 1; + // the corruption/truncation passes still explore dim 0. + n := 1 + r.Intn(3) + flat := make([]float64, n) + for i := range flat { + flat[i] = math.Float64frombits(r.Uint64()) + } + c.addDoubleArray(1, []int32{int32(n)}, flat) + } + default: + t.Fatalf("boundsAdderFor: unhandled type %#x", code) + return nil + } +} + +// boundsRandASCII returns 0-19 printable-ASCII bytes (Java's +// writeStringColumnData uses 0x20 + rnd(95)). 
+func boundsRandASCII(r *rand.Rand) string { + n := r.Intn(20) + b := make([]byte, n) + for i := range b { + b[i] = byte(0x20 + r.Intn(95)) + } + return string(b) +} + +// genValidBoundsMessage builds a valid single-table RESULT_BATCH with +// 1-6 columns of random fuzzable types and 1-19 rows, using the real +// encoder so the frame is guaranteed valid for the Go decoder. +func genValidBoundsMessage(t *testing.T, r *rand.Rand) []byte { + t.Helper() + colCount := 1 + r.Intn(6) + rowCount := 1 + r.Intn(19) + + codes := make([]qwpTypeCode, colCount) + nullable := make([]bool, colCount) + adders := make([]func(*qwpColumnBuffer, *rand.Rand), colCount) + for i := 0; i < colCount; i++ { + codes[i] = boundsCandidateTypes[r.Intn(len(boundsCandidateTypes))] + nullable[i] = r.Intn(2) == 0 + adders[i] = boundsAdderFor(t, codes[i], r) + } + + tb := newQwpTableBuffer("t" + strconv.Itoa(r.Intn(100))) + for row := 0; row < rowCount; row++ { + for ci := 0; ci < colCount; ci++ { + col, err := tb.getOrCreateColumn("c"+strconv.Itoa(ci), codes[ci], nullable[ci]) + if err != nil { + t.Fatalf("getOrCreateColumn(c%d, type=%#x): %v", ci, codes[ci], err) + } + if nullable[ci] && r.Intn(5) == 0 { + col.addNull() + } else { + adders[ci](col, r) + } + } + tb.commitRow() + } + var enc qwpEncoder + ingress := enc.encodeTable(tb) + return wrapAsResultBatch(ingress, 1, 0) +} + +// walkBoundsBatch exercises the parsed batch the way the Java test's +// parseAndIterate walks every row/column. It only uses each column's +// correct accessor (never a mismatched one — that would be accessor +// misuse, not a decoder bug). Variable-length accessors (Str/array) are +// the interesting ones: their offset/dim bounds logic is what a +// corrupted-but-still-decodable frame would trip. 
+func walkBoundsBatch(b *QwpColumnBatch) { + for col := 0; col < b.ColumnCount(); col++ { + _ = b.ColumnName(col) + _ = b.DecimalScale(col) + _ = b.GeohashPrecisionBits(col) + _ = b.NonNullCount(col) + ct := qwpTypeCode(b.ColumnType(col)) + rows := b.RowCount() + for row := 0; row < rows; row++ { + if b.IsNull(col, row) { + continue + } + switch ct { + case qwpTypeBoolean: + _ = b.Bool(col, row) + case qwpTypeByte: + _ = b.Int8(col, row) + case qwpTypeShort: + _ = b.Int16(col, row) + case qwpTypeInt, qwpTypeIPv4: + _ = b.Int32(col, row) + case qwpTypeLong, qwpTypeTimestamp, qwpTypeDate, qwpTypeTimestampNano, qwpTypeDecimal64: + _ = b.Int64(col, row) + case qwpTypeFloat: + _ = b.Float32(col, row) + case qwpTypeDouble: + _ = b.Float64(col, row) + case qwpTypeChar: + _ = b.Char(col, row) + case qwpTypeUuid: + _ = b.UuidLo(col, row) + _ = b.UuidHi(col, row) + case qwpTypeLong256: + for w := 0; w < 4; w++ { + _ = b.Long256Word(col, row, w) + } + case qwpTypeDecimal128: + _ = b.Decimal128Lo(col, row) + _ = b.Decimal128Hi(col, row) + case qwpTypeVarchar, qwpTypeBinary: + _ = b.Str(col, row) + case qwpTypeDoubleArray: + _ = b.Float64Array(col, row) + case qwpTypeLongArray: + _ = b.Int64Array(col, row) + default: + // DECIMAL256 (no scalar accessor), GEOHASH (precision + // read above), SYMBOL, etc. — IsNull above already + // touched the parsed layout. + } + } + } +} + +// decodeBoundsNoPanic runs one decode (+ full walk on success) under a +// panic guard. A returned error is fine — that is the parser correctly +// rejecting truncated/corrupt input. A panic is a missing bounds check +// and fails the test (the Go analogue of the Java test's +// `catch (Throwable t) Assert.fail`). 
+func decodeBoundsNoPanic(t *testing.T, payload []byte, ctx string) { + t.Helper() + defer func() { + if rec := recover(); rec != nil { + t.Fatalf("%s: decoder panicked (missing bounds check): %v\n%s", + ctx, rec, debug.Stack()) + } + }() + d := newTestQueryDecoder() + var b QwpColumnBatch + if err := d.decode(payload, &b); err == nil { + walkBoundsBatch(&b) + } +} + +func TestQwpFuzzCursorBoundsTruncation(t *testing.T) { + r := newFuzzRand(t) + for iter := 0; iter < boundsFuzzIterations; iter++ { + msg := genValidBoundsMessage(t, r) + // Sanity: the generated message must parse cleanly in full. + d := newTestQueryDecoder() + var b QwpColumnBatch + if err := d.decode(msg, &b); err != nil { + t.Fatalf("iter %d: generated message failed full parse: %v", iter, err) + } + for truncLen := 0; truncLen < len(msg); truncLen++ { + decodeBoundsNoPanic(t, msg[:truncLen], + "iter "+strconv.Itoa(iter)+" truncLen="+strconv.Itoa(truncLen)+"/"+strconv.Itoa(len(msg))) + } + } +} + +func TestQwpFuzzCursorBoundsCorruption(t *testing.T) { + r := newFuzzRand(t) + for iter := 0; iter < boundsFuzzIterations; iter++ { + msg := genValidBoundsMessage(t, r) + d := newTestQueryDecoder() + var b QwpColumnBatch + if err := d.decode(msg, &b); err != nil { + t.Fatalf("iter %d: generated message failed full parse: %v", iter, err) + } + for c := 0; c < boundsCorruptionsPerMsg; c++ { + corrupted := make([]byte, len(msg)) + copy(corrupted, msg) + nCorrupt := 1 + r.Intn(3) + for i := 0; i < nCorrupt; i++ { + corrupted[r.Intn(len(corrupted))] = byte(r.Intn(256)) + } + decodeBoundsNoPanic(t, corrupted, + "iter "+strconv.Itoa(iter)+" corruption="+strconv.Itoa(c)) + } + } +} diff --git a/qwp_egress_bench_test.go b/qwp_egress_bench_test.go new file mode 100644 index 00000000..32baf3b0 --- /dev/null +++ b/qwp_egress_bench_test.go @@ -0,0 +1,489 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ 
__| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +// End-to-end QWP egress (query) latency benchmarks. These are the Go +// counterparts of the Java client's two JMH latency benchmarks in the +// QuestDB OSS repo (benchmarks/src/main/java/org/questdb): +// QwpEgressLatencyBenchmark and QwpEgressBindLatencyBenchmark. +// +// The third Java egress benchmark -- the application-style, cross-protocol +// QwpEgressReadBenchmark (QWP vs PG-wire vs HTTP) -- is ported separately as +// the standalone program in bench/qwp-egress-read, not as a `go test` +// benchmark. There is deliberately no BenchmarkQwpEgressRead here: a `go +// test` benchmark would only re-measure the QWP read path the standalone +// program already covers. +// +// Unlike the rest of qwp_bench_test.go (pure encode/decode microbenchmarks +// that never touch a socket) these run against a *live* QuestDB listening on +// localhost:9000 (HTTP/WS) -- the same live-server policy as the +// TestQwpIntegration* suite. They self-skip when no server is reachable, so +// `go test -bench .` stays green on a machine without QuestDB. 
+// +// Go has no JMH, so the JMH SampleTime + AverageTime split maps onto: +// - ns/op -> the arithmetic mean (testing.B's native number) +// - p50/p90/p99/p999 -> custom metrics reported via b.ReportMetric, using +// the same percentile harness as the Java client's +// CursorEngineAppendLatencyBenchmark. +// +// Tunables are environment variables (the Go analog of Java's -Dkey=value), +// all read through benchEnv* helpers below: +// +// QDB_BENCH_ADDR host:port of the server (default localhost:9000) +// QDB_BENCH_SKIP_POPULATE reuse the existing table (default false) +// QDB_BENCH_SQL override the latency-bench SQL (default "SELECT 1") +// +// Examples: +// +// go test -run '^$' -bench BenchmarkQwpEgressLatency -benchtime 3000x . +// QDB_BENCH_SQL='SELECT id FROM latency_bench' \ +// go test -run '^$' -bench BenchmarkQwpEgressLatency -benchtime 2000x . +// go test -run '^$' -bench BenchmarkQwpEgressBindLatency -benchtime 3000x . + +import ( + "context" + "encoding/json" + "fmt" + "io" + "math/rand" + "net/http" + "net/url" + "os" + "sort" + "testing" + "time" +) + +// --------------------------------------------------------------------------- +// Environment knobs +// --------------------------------------------------------------------------- + +func benchEnvStr(key, def string) string { + if v := os.Getenv(key); v != "" { + return v + } + return def +} + +func benchEnvBool(key string) bool { + v := os.Getenv(key) + return v == "1" || v == "true" || v == "TRUE" || v == "yes" +} + +// benchEgressAddr is the server the benchmarks talk to. Defaults to +// localhost:9000 (the conventional dev-machine QuestDB), independent +// of the fixture-driven qwpTestAddr. Override with QDB_BENCH_ADDR. +// Bench code is not in the CI gate; this default keeps the existing +// `go test -bench` developer workflow unchanged after the integration +// tests' qwpTestAddr was switched from a const to a var. 
+func benchEgressAddr() string { return benchEnvStr("QDB_BENCH_ADDR", "localhost:9000") } + +// --------------------------------------------------------------------------- +// Live-server helpers (testing.B-typed; mirror the *testing.T helpers in +// qwp_integration_test.go without refactoring the shared ones). +// --------------------------------------------------------------------------- + +// benchSkipIfNoServer skips the benchmark when no QuestDB egress endpoint is +// reachable. Same intent as qwpEnsureServer, but it dials the actual egress +// path (the read socket) so a server with only ingest wired up still skips +// cleanly rather than failing deep in @Setup-equivalent code. +func benchSkipIfNoServer(b *testing.B) { + b.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + c, err := NewQwpQueryClient(ctx, WithQwpQueryAddress(benchEgressAddr())) + if err != nil { + b.Skipf("QuestDB egress not available at %s: %v", benchEgressAddr(), err) + } + _ = c.Close(ctx) +} + +// benchHTTPExec runs a statement through the server's HTTP /exec endpoint and +// returns the parsed result. Used for table setup/teardown and the WAL-apply +// poll -- deliberately off the QWP wire so it never perturbs the path under +// measurement (the same separation the Java benches get from using PG-wire). 
+func benchHTTPExec(b *testing.B, statement string) qwpTableResult { + b.Helper() + u, _ := url.Parse("http://" + benchEgressAddr()) + u.Path = "/exec" + params := url.Values{} + params.Add("query", statement) + u.RawQuery = params.Encode() + + req, err := http.NewRequestWithContext(context.Background(), http.MethodGet, u.String(), nil) + if err != nil { + b.Fatalf("build /exec request: %v", err) + } + resp, err := qwpTestHTTPClient.Do(req) + if err != nil { + b.Fatalf("/exec %q failed: %v", statement, err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + b.Fatalf("/exec %q: HTTP %d", statement, resp.StatusCode) + } + body, err := io.ReadAll(resp.Body) + if err != nil { + b.Fatalf("/exec %q: read body: %v", statement, err) + } + var result qwpTableResult + if err := json.Unmarshal(body, &result); err != nil { + b.Fatalf("/exec %q: decode: %v (body: %s)", statement, err, string(body)) + } + return result +} + +// jsonNumToInt64 extracts an integer from a generic-decoded JSON cell. The +// /exec endpoint emits numbers, which encoding/json unmarshals into float64 +// when the target is interface{}. +func jsonNumToInt64(v interface{}) (int64, bool) { + switch n := v.(type) { + case float64: + return int64(n), true + case json.Number: + i, err := n.Int64() + return i, err == nil + default: + return 0, false + } +} + +// benchWaitTimeout is how long benchWaitForRows waits for asynchronous WAL +// apply to catch up after the seed Flush returns. QDB_BENCH_WAIT (a Go +// duration, e.g. "30m") overrides it; the default scales with row count +// because server-side apply is the slow part for large seeds. The timeout is +// only the give-up point -- the poll returns the instant the count matches -- +// so a generous ceiling costs nothing on a healthy server. 
+func benchWaitTimeout(b *testing.B, rows int) time.Duration { + if v := os.Getenv("QDB_BENCH_WAIT"); v != "" { + d, err := time.ParseDuration(v) + if err != nil { + b.Fatalf("QDB_BENCH_WAIT=%q: %v", v, err) + } + return d + } + // 5m floor + ~1s per 100k rows (assumes >=100k rows/s end-to-end apply, + // comfortably conservative). 100M rows -> ~22m ceiling. + return 5*time.Minute + time.Duration(rows/100_000)*time.Second +} + +// benchWaitForRows polls until table holds exactly want rows (WAL apply is +// asynchronous; ingest Flush returning does not mean the rows are queryable). +// Logs progress periodically so a multi-minute large-seed apply is observable +// under `go test -v -bench`. +func benchWaitForRows(b *testing.B, table string, want int) { + b.Helper() + timeout := benchWaitTimeout(b, want) + deadline := time.Now().Add(timeout) + lastLog := time.Now() + for time.Now().Before(deadline) { + res := benchHTTPExec(b, fmt.Sprintf("SELECT count() FROM '%s'", table)) + if len(res.Dataset) == 1 && len(res.Dataset[0]) == 1 { + got, ok := jsonNumToInt64(res.Dataset[0][0]) + if ok && got == int64(want) { + return + } + if ok && time.Since(lastLog) >= 15*time.Second { + b.Logf("WAL apply: %d / %d rows", got, want) + lastLog = time.Now() + } + } + time.Sleep(500 * time.Millisecond) + } + b.Fatalf("timed out after %s waiting for %d rows in %q (override with QDB_BENCH_WAIT)", + timeout, want, table) +} + +// benchTableCount returns table's row count, or -1 if the table is absent or +// the count can't be read (so callers treat "unknown" as "needs populating"). +// Unlike benchHTTPExec it never fails the benchmark -- a missing table is the +// expected pre-seed state, and /exec answers a missing table with HTTP 400. 
+func benchTableCount(table string) int64 { + u, _ := url.Parse("http://" + benchEgressAddr()) + u.Path = "/exec" + params := url.Values{} + params.Add("query", fmt.Sprintf("SELECT count() FROM '%s'", table)) + u.RawQuery = params.Encode() + req, err := http.NewRequestWithContext(context.Background(), http.MethodGet, u.String(), nil) + if err != nil { + return -1 + } + resp, err := qwpTestHTTPClient.Do(req) + if err != nil { + return -1 + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return -1 + } + body, err := io.ReadAll(resp.Body) + if err != nil { + return -1 + } + var res qwpTableResult + if err := json.Unmarshal(body, &res); err != nil { + return -1 + } + if len(res.Dataset) != 1 || len(res.Dataset[0]) != 1 { + return -1 + } + if n, ok := jsonNumToInt64(res.Dataset[0][0]); ok { + return n + } + return -1 +} + +// benchEnsurePopulated runs populate() to (re)create and seed `table`, then +// waits for WAL apply -- unless the work can be safely skipped, in which case +// it returns fast. It is skipped when QDB_BENCH_SKIP_POPULATE is set, or when +// `table` already holds exactly wantRows rows. +// +// The row-count short-circuit is load-bearing, not just an optimization: +// `go test` invokes a benchmark body once at b.N=1 (the launch/estimate pass) +// and again at the real -benchtime N. Setup that lives in the body would +// otherwise run on every invocation -- which at QDB_BENCH_ROWS=100000000 means +// seeding 100M rows twice. The first pass seeds; the second sees the matching +// count and skips. It also makes re-runs against an existing table instant. 
+func benchEnsurePopulated(b *testing.B, table string, wantRows int, populate func()) { + b.Helper() + if benchEnvBool("QDB_BENCH_SKIP_POPULATE") { + b.Logf("QDB_BENCH_SKIP_POPULATE set, reusing existing %s", table) + return + } + if n := benchTableCount(table); n == int64(wantRows) { + b.Logf("%s already holds %d rows, skipping populate "+ + "(DROP it or change QDB_BENCH_ROWS to force a reseed)", table, wantRows) + return + } + populate() + benchWaitForRows(b, table, wantRows) +} + +// --------------------------------------------------------------------------- +// Latency percentile harness (shared by the two latency benchmarks) +// --------------------------------------------------------------------------- + +// runQueryLatency drives `b.N` single-query round-trips through queryOnce, +// recording per-call wall time, and reports p50/p90/p99/p99.9 alongside the +// native ns/op mean. queryOnce must submit one query, drain it fully, and +// return any error -- exactly the work whose latency we attribute. +// +// This is the symmetric counterpart of the ingress side's per-row +// .At()+Flush() loop, and mirrors QwpEgressLatencyBenchmark: the client is +// opened once by the caller and reused across every measured invocation; +// table/connection setup is outside the timed region. 
+func runQueryLatency(b *testing.B, queryOnce func() error) { + samples := make([]time.Duration, b.N) + b.ResetTimer() + for i := 0; i < b.N; i++ { + t0 := time.Now() + if err := queryOnce(); err != nil { + b.Fatalf("query %d: %v", i, err) + } + samples[i] = time.Since(t0) + } + b.StopTimer() + reportLatencyPercentiles(b, samples) +} + +func reportLatencyPercentiles(b *testing.B, samples []time.Duration) { + if len(samples) == 0 { + return + } + sort.Slice(samples, func(i, j int) bool { return samples[i] < samples[j] }) + n := len(samples) + pick := func(p float64) float64 { + idx := int(float64(n-1) * p) + if idx > n-1 { + idx = n - 1 + } + return float64(samples[idx].Nanoseconds()) / 1e3 // -> microseconds + } + // Distinct unit strings so benchstat treats each as its own metric. + // The ".0"/".9" suffixes keep them lexicographically ordered in + // `go test` output (p50.0 < p90.0 < p99.0 < p99.9). + b.ReportMetric(pick(0.50), "p50.0us/op") + b.ReportMetric(pick(0.90), "p90.0us/op") + b.ReportMetric(pick(0.99), "p99.0us/op") + b.ReportMetric(pick(0.999), "p99.9us/op") +} + +// --------------------------------------------------------------------------- +// BenchmarkQwpEgressLatency -- Go counterpart of QwpEgressLatencyBenchmark +// --------------------------------------------------------------------------- + +// BenchmarkQwpEgressLatency measures the end-to-end wall time of a single +// query round-trip over QWP/WebSocket against a live local QuestDB, with the +// QwpQueryClient opened once and reused (connection setup excluded). +// +// Default SQL is "SELECT 1" -- no storage/cursor cost, so the number is the +// parse + protocol round-trip floor. Set QDB_BENCH_SQL to anything else (e.g. +// "SELECT id FROM latency_bench") to fold in storage and cursor cost; the +// latency_bench table is created and seeded with one row in setup so the +// default override works out of the box. 
QDB_BENCH_SKIP_POPULATE=1 reuses the +// existing table instead of dropping/recreating it. +func BenchmarkQwpEgressLatency(b *testing.B) { + benchSkipIfNoServer(b) + + const table = "latency_bench" + benchEnsurePopulated(b, table, 1, func() { + benchHTTPExec(b, "DROP TABLE IF EXISTS '"+table+"'") + benchHTTPExec(b, "CREATE TABLE '"+table+"' (id LONG, ts TIMESTAMP) "+ + "TIMESTAMP(ts) PARTITION BY DAY WAL") + seedRows(b, table, 1, func(s LineSender, i int) error { + return s.Table(table).Int64Column("id", 1). + At(context.Background(), time.Unix(0, 0).UTC()) + }) + }) + + sql := benchEnvStr("QDB_BENCH_SQL", "SELECT 1") + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + client, err := NewQwpQueryClient(ctx, + WithQwpQueryAddress(benchEgressAddr()), + WithQwpQueryClientID("qwp-egress-bench-go/1.0"), + ) + if err != nil { + b.Fatalf("NewQwpQueryClient: %v", err) + } + defer client.Close(ctx) + + queryOnce := func() error { + q := client.Query(ctx, sql) + _, _, err := drainQuery(q) + q.Close() + return err + } + + // Prime: first query allocates the client's codec scratch and registers + // the result schema. Keeps that one-time cost out of the window, exactly + // like the Java benchmark's throwaway @Setup query. + if err := queryOnce(); err != nil { + b.Fatalf("prime query: %v", err) + } + + runQueryLatency(b, queryOnce) +} + +// --------------------------------------------------------------------------- +// BenchmarkQwpEgressBindLatency -- Go counterpart of +// QwpEgressBindLatencyBenchmark +// --------------------------------------------------------------------------- + +// BenchmarkQwpEgressBindLatency measures the same single-query round-trip but +// with a bind-variable query: SELECT x FROM long_sequence(10) WHERE x = $1, +// where $1 is a random LONG in [1,10] per call. The value randomizes but the +// bind TYPE does not, so the server's select cache should hit every call +// after the first. 
Comparing this against BenchmarkQwpEgressLatency running +// the literal "SELECT 1" isolates bind encode/decode + cache-lookup overhead. +// +// long_sequence(10) is the row source, so this benchmark needs no table and +// no WAL-apply wait. +func BenchmarkQwpEgressBindLatency(b *testing.B) { + benchSkipIfNoServer(b) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + client, err := NewQwpQueryClient(ctx, + WithQwpQueryAddress(benchEgressAddr()), + WithQwpQueryClientID("qwp-egress-bind-bench-go/1.0"), + ) + if err != nil { + b.Fatalf("NewQwpQueryClient: %v", err) + } + defer client.Close(ctx) + + const sql = "SELECT x FROM long_sequence(10) WHERE x = $1" + rng := rand.New(rand.NewSource(1)) // deterministic value stream + + queryOnce := func() error { + v := int64(rng.Intn(10) + 1) + q := client.Query(ctx, sql, WithQwpQueryBinds(func(bv *QwpBinds) { + bv.LongBind(0, v) + })) + _, _, err := drainQuery(q) + q.Close() + return err + } + + if err := queryOnce(); err != nil { + b.Fatalf("prime query: %v", err) + } + runQueryLatency(b, queryOnce) +} + +// --------------------------------------------------------------------------- +// Shared low-level helpers +// --------------------------------------------------------------------------- + +// drainQuery consumes every batch of q, doing no per-row work -- the egress +// equivalent of QwpEgressLatencyBenchmark's deliberately empty batch handler. +// Returns rows seen and total batch-payload bytes. +func drainQuery(q *QwpQuery) (rows int, bytes int64, err error) { + for batch, e := range q.Batches() { + if e != nil { + return rows, bytes, e + } + rows += batch.RowCount() + bytes += int64(len(batch.Payload())) + } + return rows, bytes, nil +} + +// seedRows ingests `n` rows into `table` over a fresh public QWP LineSender +// (ws://, auto-flush every 50k rows -- same shape as the Java read bench's +// Sender.fromConfig). 
rowFn fills one row; it must call At/AtNow itself so +// the caller controls the designated timestamp. +func seedRows(b *testing.B, table string, n int, rowFn func(s LineSender, i int) error) { + b.Helper() + ctx := context.Background() + conf := fmt.Sprintf("ws::addr=%s;auto_flush_rows=50000;", benchEgressAddr()) + s, err := LineSenderFromConf(ctx, conf) + if err != nil { + b.Fatalf("LineSenderFromConf(%q): %v", conf, err) + } + defer s.Close(ctx) + start := time.Now() + lastLog := start + for i := 0; i < n; i++ { + if err := rowFn(s, i); err != nil { + b.Fatalf("seed row %d: %v", i, err) + } + // Progress for large seeds: a 100M-row ingest is several minutes + // of otherwise-silent work. Matches the Java benches' per-1M log. + if n >= 1_000_000 && (i+1)%1_000_000 == 0 && time.Since(lastLog) >= 10*time.Second { + elapsed := time.Since(start).Seconds() + b.Logf("seeded %d / %d rows (%.0f rows/s)", i+1, n, float64(i+1)/elapsed) + lastLog = time.Now() + } + } + if err := s.Flush(ctx); err != nil { + b.Fatalf("seed flush: %v", err) + } + if n >= 1_000_000 { + b.Logf("seeded %d rows in %s, waiting for WAL apply...", n, time.Since(start).Round(time.Second)) + } +} diff --git a/qwp_egress_bind_fuzz_test.go b/qwp_egress_bind_fuzz_test.go new file mode 100644 index 00000000..b703d6e1 --- /dev/null +++ b/qwp_egress_bind_fuzz_test.go @@ -0,0 +1,306 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
// pickNonNullLong draws 64 random bits from r, reinterprets them as an
// int64, and redraws for as long as the result is math.MinInt64 (the value
// QuestDB uses as its LONG null sentinel).
func pickNonNullLong(r *rand.Rand) int64 {
	v := int64(r.Uint64())
	for v == math.MinInt64 {
		v = int64(r.Uint64())
	}
	return v
}

// pickNonNullInt draws 32 random bits from r, reinterprets them as an
// int32, and redraws for as long as the result is math.MinInt32 (the value
// QuestDB uses as its INT null sentinel).
func pickNonNullInt(r *rand.Rand) int32 {
	v := int32(r.Uint32())
	for v == math.MinInt32 {
		v = int32(r.Uint32())
	}
	return v
}
// pickSpecialOrRandomDouble rolls a four-sided die on r: one face yields
// NaN, one face yields +0.0, and the remaining two yield a float64 built
// from 64 random bits. Only ±Inf is rejected (and redrawn) on the random
// path, so that path can itself produce NaN bit patterns or -0.0; callers
// have to tolerate both.
func pickSpecialOrRandomDouble(r *rand.Rand) float64 {
	roll := r.Intn(4)
	if roll == 0 {
		return math.NaN()
	}
	if roll == 1 {
		return 0.0
	}
	d := math.Float64frombits(r.Uint64())
	for math.IsInf(d, 0) {
		d = math.Float64frombits(r.Uint64())
	}
	return d
}
QuestDB's ::DOUBLE cast maps the NaN bit pattern to its + // DOUBLE NULL sentinel. Java surfaces that as NaN; the Go + // batch API deliberately returns 0 for NULL rows (see the + // Float64/Float32 doc comments) and exposes the null via + // IsNull. Either signal is a correct round-trip of a bound + // NaN — the bind encoder did its job. + if !gotNull && !math.IsNaN(got) { + t.Fatalf("iter %d: bound NaN, expected NULL/NaN, got %v (null=%v)", i, got, gotNull) + } + continue + } + if gotNull { + t.Fatalf("iter %d: d=%v came back NULL", i, d) + } + // Go == treats -0.0 == 0.0 as equal, matching QuestDB's cast + // normalisation; Inf was excluded by the generator. + if got != d { + t.Fatalf("iter %d: d=%v (bits=%#x) got=%v (bits=%#x)", + i, d, math.Float64bits(d), got, math.Float64bits(got)) + } + } +} + +func TestQwpFuzzIntegralBindsProjection(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + c := newBindFuzzClient(t, srv) + + for i := 0; i < bindFuzzIterations; i++ { + longVal := pickNonNullLong(r) + intVal := pickNonNullInt(r) + shortVal := int16(r.Uint32()) + byteVal := int8(r.Uint32()) + boolVal := r.Intn(2) == 0 + + var gotLong int64 + var gotInt int32 + var gotShort int16 + var gotByte int8 + var gotBool bool + queryOneRow(t, c, + "SELECT $1::LONG AS l, $2::INT AS i, $3::SHORT AS s, $4::BYTE AS b, $5::BOOLEAN AS x FROM long_sequence(1)", + "iter "+strconv.Itoa(i), + func(b *QwpBinds) { + b.LongBind(0, longVal). + IntBind(1, intVal). + ShortBind(2, shortVal). + ByteBind(3, byteVal). 
+ BooleanBind(4, boolVal) + }, + func(b *QwpColumnBatch) { + gotLong = b.Int64(0, 0) + gotInt = b.Int32(1, 0) + gotShort = b.Int16(2, 0) + gotByte = b.Int8(3, 0) + gotBool = b.Bool(4, 0) + }, + ) + if gotLong != longVal { + t.Fatalf("iter %d long: want %d got %d", i, longVal, gotLong) + } + if gotInt != intVal { + t.Fatalf("iter %d int: want %d got %d", i, intVal, gotInt) + } + if gotShort != shortVal { + t.Fatalf("iter %d short: want %d got %d", i, shortVal, gotShort) + } + if gotByte != byteVal { + t.Fatalf("iter %d byte: want %d got %d", i, byteVal, gotByte) + } + if gotBool != boolVal { + t.Fatalf("iter %d bool: want %v got %v", i, boolVal, gotBool) + } + } +} + +func TestQwpFuzzUuidBinds(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + c := newBindFuzzClient(t, srv) + + for i := 0; i < bindFuzzIterations; i++ { + lo := pickNonNullLong(r) + hi := pickNonNullLong(r) + var gotLo, gotHi int64 + queryOneRow(t, c, + "SELECT $1::UUID AS u FROM long_sequence(1)", + "iter "+strconv.Itoa(i), + // Go's UuidBind takes (hi, lo); the Java test's + // setUuid(0, lo, hi) is the same logical UUID. + func(b *QwpBinds) { b.UuidBind(0, uint64(hi), uint64(lo)) }, + func(b *QwpColumnBatch) { + gotLo = b.UuidLo(0, 0) + gotHi = b.UuidHi(0, 0) + }, + ) + if gotLo != lo { + t.Fatalf("iter %d uuid lo: want %d got %d", i, lo, gotLo) + } + if gotHi != hi { + t.Fatalf("iter %d uuid hi: want %d got %d", i, hi, gotHi) + } + } +} + +// TestQwpFuzzSameSqlDifferentBindsCacheReuse stresses the +// same-SQL-different-binds path that the server's factory cache is meant +// to accelerate. Random integer lookups, 50 iterations. 
+func TestQwpFuzzSameSqlDifferentBindsCacheReuse(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + + const table = "qwp_fuzz_bind_cache" + srv.mustExec(t, "DROP TABLE IF EXISTS '"+table+"'") + defer srv.mustExec(t, "DROP TABLE IF EXISTS '"+table+"'") + srv.mustExec(t, "CREATE TABLE "+table+"(id LONG, v LONG, part_ts TIMESTAMP) TIMESTAMP(part_ts) PARTITION BY DAY WAL") + + const rows = 100 + var insert strings.Builder + insert.WriteString("INSERT INTO " + table + " VALUES ") + for rr := 0; rr < rows; rr++ { + if rr > 0 { + insert.WriteString(", ") + } + insert.WriteString("(") + insert.WriteString(strconv.Itoa(rr)) + insert.WriteString(", ") + insert.WriteString(strconv.FormatInt(int64(rr)*7, 10)) + insert.WriteString(", ") + insert.WriteString(strconv.Itoa(rr + 1)) + insert.WriteString("::TIMESTAMP)") + } + srv.mustExec(t, insert.String()) + srv.awaitRows(t, table, rows, 30*time.Second) + + c := newBindFuzzClient(t, srv) + const sql = "SELECT v FROM " + table + " WHERE id = $1" + for i := 0; i < 50; i++ { + target := r.Intn(rows) + var observed int64 = -1 + var rowCount int + queryOneRow(t, c, sql, "iter "+strconv.Itoa(i)+" target="+strconv.Itoa(target), + func(b *QwpBinds) { b.IntBind(0, int32(target)) }, + func(b *QwpColumnBatch) { + rowCount = b.RowCount() + observed = b.Int64(0, 0) + }, + ) + if rowCount != 1 { + t.Fatalf("iter %d target=%d: want 1 row, got %d", i, target, rowCount) + } + if want := int64(target) * 7; observed != want { + t.Fatalf("iter %d target=%d: want v=%d got %d", i, target, want, observed) + } + } +} diff --git a/qwp_egress_fragmentation_fuzz_test.go b/qwp_egress_fragmentation_fuzz_test.go new file mode 100644 index 00000000..f143c5b2 --- /dev/null +++ b/qwp_egress_fragmentation_fuzz_test.go @@ -0,0 +1,236 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | 
|_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//go:build !windows + +package questdb + +// Go port of QuestDB's QwpEgressFragmentationFuzzTest. Stress the QWP +// egress state machines under artificial network fragmentation: a +// sidecar server is booted with BOTH the recv- and send-side debug +// chunk-size knobs forced to a tiny value, so every wire frame spans +// many partial socket reads/writes and the server's frame parser, +// HTTP response sink, and egress streamResults loop must survive being +// preempted / resumed at arbitrary byte boundaries. +// +// The Java property keys map to env vars via the QDB_* prefix: +// debug.http.force.recv.fragmentation.chunk.size -> +// QDB_DEBUG_HTTP_FORCE_RECV_FRAGMENTATION_CHUNK_SIZE +// debug.http.force.send.fragmentation.chunk.size -> +// QDB_DEBUG_HTTP_FORCE_SEND_FRAGMENTATION_CHUNK_SIZE +// +// The client side is plain QwpQueryClient — the property under test +// is server-side handling of micro-chunked wire bytes. The client +// only needs longer-than-default deadlines because tiny chunks slow +// the handshake / drain dramatically. +// +// All tests require fixture-launched mode (sidecar JVM with env +// overrides); skip in QDB_FUZZ_ADDR mode. 
// fragFuzzPickChunk returns a chunk size in the inclusive range 1..500,
// taken from a single r.Intn(500) draw shifted up by one so that zero is
// never returned. r only needs an Intn method.
func fragFuzzPickChunk(r interface {
	Intn(int) int
}) int {
	const maxChunk = 500
	offset := r.Intn(maxChunk)
	return offset + 1
}
+func fragFuzzRunAndVerify(t *testing.T, c *QwpQueryClient, table string, expectedRows int) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 300*time.Second) + defer cancel() + q := c.Query(ctx, "SELECT * FROM '"+table+"'") + defer q.Close() + var idSum int64 + rows := 0 + idCol := -1 + for batch, err := range q.Batches() { + if err != nil { + t.Fatalf("table %q query error: %v", table, err) + } + if idCol < 0 { + for i := 0; i < batch.ColumnCount(); i++ { + if batch.ColumnName(i) == "id" { + idCol = i + break + } + } + if idCol < 0 { + t.Fatalf("table %q: no 'id' column in result (cols: %d)", table, batch.ColumnCount()) + } + } + for r := 0; r < batch.RowCount(); r++ { + if batch.IsNull(idCol, r) { + t.Fatalf("table %q row %d: id is NULL — wire fragmentation lost a value", table, rows+r) + } + idSum += batch.Int64(idCol, r) + } + rows += batch.RowCount() + } + if rows != expectedRows { + t.Fatalf("table %q: got %d rows, expected %d", table, rows, expectedRows) + } + wantSum := int64(expectedRows) * int64(expectedRows+1) / 2 + if idSum != wantSum { + t.Fatalf("table %q: id sum %d != expected %d (rowCount matches but values drifted)", + table, idSum, wantSum) + } +} + +// fragFuzzNewClient opens a QwpQueryClient against the sidecar with +// optional extra connect-string options (e.g. initial_credit). The +// sidecar's connConf gives the bare address; the caller appends. +func fragFuzzNewClient(t *testing.T, srv *qwpFuzzServer, extra string) *QwpQueryClient { + t.Helper() + conf := srv.connConf() + extra + // Generous connect timeout — at chunk=1 the WS handshake alone takes + // hundreds of socket events to complete. 
+ ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second) + defer cancel() + c, err := QwpQueryClientFromConf(ctx, conf) + if err != nil { + t.Fatalf("QwpQueryClientFromConf(%q): %v", conf, err) + } + t.Cleanup(func() { + cctx, ccancel := context.WithTimeout(context.Background(), 60*time.Second) + defer ccancel() + _ = c.Close(cctx) + }) + return c +} + +// --- entry points ------------------------------------------------- + +// TestQwpFuzzEgressFragmentedBackToBackQueries — port of Java +// testFragmentedBackToBackQueries. Five sequential queries on the +// same connection — shakes out cross-query state that might have +// picked up residue from a fragmented prior query. +func TestQwpFuzzEgressFragmentedBackToBackQueries(t *testing.T) { + r := newFuzzRand(t) + chunk := fragFuzzPickChunk(r) + t.Logf("chunk=%d", chunk) + + srv := bootEgressFragmentedServer(t, chunk) + srv.mustExec(t, "CREATE TABLE btb(id LONG, v DOUBLE, ts TIMESTAMP) "+ + "TIMESTAMP(ts) PARTITION BY DAY WAL") + srv.mustExec(t, "INSERT INTO btb SELECT x, CAST(x * 2.5 AS DOUBLE), x::TIMESTAMP "+ + "FROM long_sequence(8000)") + srv.awaitRows(t, "btb", 8000, 60*time.Second) + + c := fragFuzzNewClient(t, srv, "") + for q := 0; q < 5; q++ { + fragFuzzRunAndVerify(t, c, "btb", 8000) + } +} + +// TestQwpFuzzEgressFragmentedCreditFlow — port of Java +// testFragmentedCreditFlow. Small initial credit (2 KiB) forces the +// server to interleave RESULT_BATCH bytes with CREDIT frames from the +// client; both directions are chunked so the server's recv-side +// parser must stitch CREDIT bodies split across multiple partial +// reads. 
+func TestQwpFuzzEgressFragmentedCreditFlow(t *testing.T) { + r := newFuzzRand(t) + chunk := fragFuzzPickChunk(r) + t.Logf("chunk=%d", chunk) + + srv := bootEgressFragmentedServer(t, chunk) + srv.mustExec(t, + "CREATE TABLE cf AS (SELECT x AS id, x::TIMESTAMP AS ts FROM long_sequence(20000)) "+ + "TIMESTAMP(ts) PARTITION BY DAY WAL") + srv.awaitRows(t, "cf", 20_000, 60*time.Second) + + c := fragFuzzNewClient(t, srv, "initial_credit=2048;") + fragFuzzRunAndVerify(t, c, "cf", 20_000) +} + +// TestQwpFuzzEgressFragmentedStreamingBigResult — port of Java +// testFragmentedStreamingBigResult. 50K rows over a chunked wire — +// stresses the egress streamResults loop's long-running drain path. +func TestQwpFuzzEgressFragmentedStreamingBigResult(t *testing.T) { + r := newFuzzRand(t) + chunk := fragFuzzPickChunk(r) + t.Logf("chunk=%d", chunk) + + srv := bootEgressFragmentedServer(t, chunk) + srv.mustExec(t, + "CREATE TABLE bigt AS ("+ + "SELECT x AS id, CAST(x * 1.5 AS DOUBLE) AS v, "+ + "CAST('s_' || (x % 100) AS SYMBOL) AS s, "+ + "x::TIMESTAMP AS ts "+ + "FROM long_sequence(50000)) TIMESTAMP(ts) PARTITION BY DAY WAL") + srv.awaitRows(t, "bigt", 50_000, 90*time.Second) + + c := fragFuzzNewClient(t, srv, "") + fragFuzzRunAndVerify(t, c, "bigt", 50_000) +} + +// TestQwpFuzzEgressHandshakeSurvivesMicroChunk — port of Java +// testHandshakeSurvivesMicroChunk. Pin chunk to 5 bytes: the ~220 B +// WebSocket 101 handshake response fragments across ~44 socket writes, +// forcing rawSocket.send() to park repeatedly. Regression for the +// "Egress 101 handshake blocked" bug that surfaced when any chunk was +// smaller than the handshake response. 
+func TestQwpFuzzEgressHandshakeSurvivesMicroChunk(t *testing.T) { + const chunk = 5 + srv := bootEgressFragmentedServer(t, chunk) + srv.mustExec(t, "CREATE TABLE tiny(id LONG, ts TIMESTAMP) "+ + "TIMESTAMP(ts) PARTITION BY DAY WAL") + srv.mustExec(t, "INSERT INTO tiny SELECT x, x::TIMESTAMP FROM long_sequence(3)") + srv.awaitRows(t, "tiny", 3, 60*time.Second) + + c := fragFuzzNewClient(t, srv, "") + fragFuzzRunAndVerify(t, c, "tiny", 3) +} diff --git a/qwp_egress_fuzz_test.go b/qwp_egress_fuzz_test.go new file mode 100644 index 00000000..0c1b235b --- /dev/null +++ b/qwp_egress_fuzz_test.go @@ -0,0 +1,1296 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//go:build !windows + +package questdb + +// Go port of QuestDB's QwpEgressFuzzTest. 
Property-based fuzz coverage +// for QWP egress: each case builds a random schema (1-16 columns drawn +// from a catalogue covering every QWP wire type the server ships), +// rolls per-cell random values in Go so the expected (row, col) value +// is known before the query runs, inserts them as literal rows, picks a +// random query shape (full scan / projection reorder / id-range filter / +// reverse-order limit), streams the result over QWP, and asserts per +// row, per cell that the observed value matches the stored expectation. +// Row-by-row verification catches bugs a per-column sum hides: row +// reordering within a batch, cross-batch boundary misalignment, +// null-bitmap bit swaps, partial varint reads. +// +// Faithful-port divergences from the Java source (cf. the bind / bounds +// ports' headers): +// +// - No network fragmentation. Java rotates a server debug env var +// (DEBUG_HTTP_FORCE_{RECV,SEND}_FRAGMENTATION_CHUNK_SIZE) per @Test +// via startFragmented(chunk). The Go fixture is one shared, +// long-lived server (sync.Once); per-test server env can't vary. +// Server-side fragmentation is exhaustively covered by the server +// repo's QwpEgressFragmentationFuzzTest and is a transport concern +// orthogonal to the Go decoder's per-cell correctness, which is +// what this port validates. All cases run against the shared +// unfragmented server. +// - No compression rotation. The Go QWP connection string exposes no +// compression key (conf_parse.go has none), so Java's +// pickCompression() fragment has nothing to port. +// - Chunked INSERT. Java emits one giant INSERT ... VALUES; the Go +// fixture's /exec is a GET, so the rows are split into length- +// budgeted sub-INSERTs. Identical data, transport detail only. +// - GEOHASH is existence-only. 
The Go batch surface has no geohash +// scalar accessor (only GeohashPrecisionBits); the existence +// guarantee is "the frame decoded and the null bitmap is correct" +// (null cells assert IsNull, non-null assert !IsNull) — the same +// intent as Java's discard-the-value getGeohashValue() call. +// BINARY / DECIMAL128 / DECIMAL256 / DOUBLE[] keep Java's existing +// existence-only treatment (encoding not re-implemented client +// side); DECIMAL64 is bit-verified via its scaled int64. +// - Reproducibility via QWP_FUZZ_SEED (shared newFuzzRand); the Go +// RNG sequence need not match Java's Rnd bit-for-bit, only be +// Go-internally reproducible. + +import ( + "context" + "fmt" + "math" + "math/rand" + "strconv" + "strings" + "testing" + "time" +) + +const egressFuzzMaxRowsPerCase = 500 + +// egInsertChunkBudget caps the character length of a single generated +// INSERT statement so the fixture's GET /exec request line stays well +// under any server header-buffer default. A worst-case row (16 wide +// columns, ~70-char LONG256 literals) is ~1.2 KB, comfortably below +// this, so no single row ever overflows a chunk. +const egInsertChunkBudget = 6000 + +// --- value generators ------------------------------------------------- +// +// Mirrors the Java ColumnGenerator catalogue 1:1. randomValue fills a +// SQL literal safely usable inside VALUES(...) plus a deterministic +// int64 hash that must equal observedHash after a faithful QWP round +// trip. supportsNull is false for types QuestDB coerces NULL into a +// zero value (BOOLEAN / BYTE / SHORT / CHAR) and for BINARY (sourced +// via rnd_bin, no NULL literal path). + +type egRandomValue struct { + hash int64 + literal string +} + +type egColumnGenerator interface { + observedHash(b *QwpColumnBatch, col, row int) int64 + randomValue(r *rand.Rand, out *egRandomValue) + sqlType() string + supportsNull() bool +} + +// egGenerators is the catalogue, in the exact order of the Java +// GENERATORS array. 
// egHashAscii folds b into an int64 with a multiply-by-31 polynomial hash
// started from a fixed seed, then XORs in len(b) so inputs that differ only
// in length hash differently. Arithmetic is plain int64 and wraps on
// overflow.
func egHashAscii(b []byte) int64 {
	const seed int64 = 1125899906842597
	acc := seed
	for i := range b {
		acc = acc*31 + int64(b[i])
	}
	return acc ^ int64(len(b))
}
// egRandomASCII returns n random bytes as a string, each drawn uniformly
// from 0x20..0x7D with the single quote (0x27) rejected and redrawn so the
// result can be dropped into a quoted SQL literal. n <= 0 yields "".
func egRandomASCII(r *rand.Rand, n int) string {
	if n <= 0 {
		return ""
	}
	out := make([]byte, 0, n)
	for len(out) < n {
		c := byte(0x20 + r.Intn(0x5E))
		if c == 0x27 {
			continue // rejected draw; try again for the same position
		}
		out = append(out, c)
	}
	return string(out)
}
"BYTE" } +func (egByteGen) supportsNull() bool { return false } + +type egCharGen struct{} + +func (egCharGen) observedHash(b *QwpColumnBatch, col, row int) int64 { + return int64(b.Char(col, row)) +} +func (egCharGen) randomValue(r *rand.Rand, out *egRandomValue) { + c := rune('A' + r.Intn(26)) + out.hash = int64(c) + out.literal = "'" + string(c) + "'" +} +func (egCharGen) sqlType() string { return "CHAR" } +func (egCharGen) supportsNull() bool { return false } + +type egDoubleGen struct{} + +func (egDoubleGen) observedHash(b *QwpColumnBatch, col, row int) int64 { + return int64(math.Float64bits(b.Float64(col, row))) +} +func (egDoubleGen) randomValue(r *rand.Rand, out *egRandomValue) { + var v float64 + for { + v = (r.Float64() - 0.5) * 1e9 + if !math.IsNaN(v) && !math.IsInf(v, 0) { + break + } + } + out.hash = int64(math.Float64bits(v)) + // 17 significant digits round-trips a float64 bit-for-bit. + out.literal = "CAST(" + strconv.FormatFloat(v, 'e', 17, 64) + " AS DOUBLE)" +} +func (egDoubleGen) sqlType() string { return "DOUBLE" } +func (egDoubleGen) supportsNull() bool { return true } + +type egFloatGen struct{} + +func (egFloatGen) observedHash(b *QwpColumnBatch, col, row int) int64 { + return int64(int32(math.Float32bits(b.Float32(col, row)))) +} +func (egFloatGen) randomValue(r *rand.Rand, out *egRandomValue) { + var v float32 + for { + v = (r.Float32() - 0.5) * 1e5 + if !math.IsNaN(float64(v)) && !math.IsInf(float64(v), 0) { + break + } + } + out.hash = int64(int32(math.Float32bits(v))) + // 9 significant digits round-trips a float32 bit-for-bit. 
+ out.literal = "CAST(" + strconv.FormatFloat(float64(v), 'e', 8, 32) + " AS FLOAT)" +} +func (egFloatGen) sqlType() string { return "FLOAT" } +func (egFloatGen) supportsNull() bool { return true } + +type egBooleanGen struct{} + +func (egBooleanGen) observedHash(b *QwpColumnBatch, col, row int) int64 { + if b.Bool(col, row) { + return 1 + } + return 0 +} +func (egBooleanGen) randomValue(r *rand.Rand, out *egRandomValue) { + v := r.Intn(2) == 0 + if v { + out.hash = 1 + } else { + out.hash = 0 + } + out.literal = strconv.FormatBool(v) +} +func (egBooleanGen) sqlType() string { return "BOOLEAN" } +func (egBooleanGen) supportsNull() bool { return false } + +type egSymbolGen struct { + pool []string +} + +func newEgSymbolGen(tag string, n int) egSymbolGen { + p := make([]string, n) + for i := 0; i < n; i++ { + p[i] = "s_" + tag + "_" + strconv.Itoa(i) + } + return egSymbolGen{pool: p} +} +func (g egSymbolGen) observedHash(b *QwpColumnBatch, col, row int) int64 { + v := b.Str(col, row) + if v == nil { + return 0 + } + return egHashAscii(v) +} +func (g egSymbolGen) randomValue(r *rand.Rand, out *egRandomValue) { + s := g.pool[r.Intn(len(g.pool))] + out.hash = egHashAscii([]byte(s)) + out.literal = "CAST('" + s + "' AS SYMBOL)" +} +func (g egSymbolGen) sqlType() string { return "SYMBOL" } +func (g egSymbolGen) supportsNull() bool { return true } + +type egVarcharGen struct{} + +func (egVarcharGen) observedHash(b *QwpColumnBatch, col, row int) int64 { + v := b.Str(col, row) + if v == nil { + return 0 + } + return egHashAscii(v) +} +func (egVarcharGen) randomValue(r *rand.Rand, out *egRandomValue) { + // Mix short inlinable (<=9 bytes) with longer heap-backed varchar. 
+ s := egRandomASCII(r, r.Intn(30)) + out.hash = egHashAscii([]byte(s)) + out.literal = "CAST('" + egQuote(s) + "' AS VARCHAR)" +} +func (egVarcharGen) sqlType() string { return "VARCHAR" } +func (egVarcharGen) supportsNull() bool { return true } + +type egStringGen struct{} + +func (egStringGen) observedHash(b *QwpColumnBatch, col, row int) int64 { + v := b.Str(col, row) + if v == nil { + return 0 + } + return egHashAscii(v) +} +func (egStringGen) randomValue(r *rand.Rand, out *egRandomValue) { + s := egRandomASCII(r, r.Intn(16)) + out.hash = egHashAscii([]byte(s)) + out.literal = "'" + egQuote(s) + "'" +} +func (egStringGen) sqlType() string { return "STRING" } +func (egStringGen) supportsNull() bool { return true } + +type egTimestampGen struct{} + +func (egTimestampGen) observedHash(b *QwpColumnBatch, col, row int) int64 { + return b.Int64(col, row) +} +func (egTimestampGen) randomValue(r *rand.Rand, out *egRandomValue) { + us := int64(r.Uint64()) & 0x0FFFFFFFFFFFFFFF // positive, representable + out.hash = us + out.literal = "CAST(" + strconv.FormatInt(us, 10) + " AS TIMESTAMP)" +} +func (egTimestampGen) sqlType() string { return "TIMESTAMP" } +func (egTimestampGen) supportsNull() bool { return true } + +type egTimestampNanosGen struct{} + +func (egTimestampNanosGen) observedHash(b *QwpColumnBatch, col, row int) int64 { + return b.Int64(col, row) +} +func (egTimestampNanosGen) randomValue(r *rand.Rand, out *egRandomValue) { + ns := int64(r.Uint64()) & 0x0FFFFFFFFFFFFFFF + out.hash = ns + out.literal = "CAST(" + strconv.FormatInt(ns, 10) + " AS TIMESTAMP_NS)" +} +func (egTimestampNanosGen) sqlType() string { return "TIMESTAMP_NS" } +func (egTimestampNanosGen) supportsNull() bool { return true } + +type egDateGen struct{} + +func (egDateGen) observedHash(b *QwpColumnBatch, col, row int) int64 { + return b.Int64(col, row) +} +func (egDateGen) randomValue(r *rand.Rand, out *egRandomValue) { + ms := int64(r.Uint64()) & 0x0000FFFFFFFFFFFF // fits comfortably as a 
Date + out.hash = ms + out.literal = "CAST(" + strconv.FormatInt(ms, 10) + " AS DATE)" +} +func (egDateGen) sqlType() string { return "DATE" } +func (egDateGen) supportsNull() bool { return true } + +type egIpv4Gen struct{} + +func (egIpv4Gen) observedHash(b *QwpColumnBatch, col, row int) int64 { + return int64(uint32(b.Int32(col, row))) +} +func (egIpv4Gen) randomValue(r *rand.Rand, out *egRandomValue) { + a := 1 + r.Intn(254) + b := r.Intn(256) + c := r.Intn(256) + d := 1 + r.Intn(254) // last octet non-zero to avoid the NULL match + out.hash = (int64(a) << 24) | (int64(b) << 16) | (int64(c) << 8) | int64(d) + out.literal = fmt.Sprintf("CAST('%d.%d.%d.%d' AS IPv4)", a, b, c, d) +} +func (egIpv4Gen) sqlType() string { return "IPv4" } +func (egIpv4Gen) supportsNull() bool { return true } + +type egUuidGen struct{} + +func (egUuidGen) observedHash(b *QwpColumnBatch, col, row int) int64 { + return b.UuidHi(col, row) ^ b.UuidLo(col, row) +} +func (egUuidGen) randomValue(r *rand.Rand, out *egRandomValue) { + hi := int64(r.Uint64()) + lo := int64(r.Uint64()) + // Avoid the QuestDB UUID NULL sentinel (both halves Long.MIN_VALUE). + if hi == math.MinInt64 && lo == math.MinInt64 { + lo = 0 + } + out.hash = hi ^ lo + out.literal = "CAST('" + egUUIDCanonical(hi, lo) + "' AS UUID)" +} +func (egUuidGen) sqlType() string { return "UUID" } +func (egUuidGen) supportsNull() bool { return true } + +// egUUIDCanonical replicates java.util.UUID.toString for a (mostSig, +// leastSig) pair so the SQL CAST yields exactly the intended 128 bits. 
+func egUUIDCanonical(hi, lo int64) string { + h := uint64(hi) + l := uint64(lo) + return fmt.Sprintf("%08x-%04x-%04x-%04x-%012x", + h>>32, (h>>16)&0xffff, h&0xffff, l>>48, l&0xffffffffffff) +} + +type egLong256Gen struct{} + +func (egLong256Gen) observedHash(b *QwpColumnBatch, col, row int) int64 { + return b.Long256Word(col, row, 0) ^ b.Long256Word(col, row, 1) ^ + b.Long256Word(col, row, 2) ^ b.Long256Word(col, row, 3) +} +func (egLong256Gen) randomValue(r *rand.Rand, out *egRandomValue) { + var w [4]int64 + for i := 0; i < 4; i++ { + w[i] = int64(r.Uint64()) + } + var sb strings.Builder + sb.WriteString("CAST('0x") + // Big-endian hex: w[3] high bytes ... w[0] low bytes. + for i := 3; i >= 0; i-- { + sb.WriteString(fmt.Sprintf("%016x", uint64(w[i]))) + } + sb.WriteString("' AS LONG256)") + out.hash = w[0] ^ w[1] ^ w[2] ^ w[3] + out.literal = sb.String() +} +func (egLong256Gen) sqlType() string { return "LONG256" } +func (egLong256Gen) supportsNull() bool { return true } + +type egBinaryGen struct{} + +const egBinaryFixedLen = 12 + +func (egBinaryGen) observedHash(b *QwpColumnBatch, col, row int) int64 { + v := b.Binary(col, row) + if v == nil { + return 0 + } + return int64(len(v)) +} +func (egBinaryGen) randomValue(r *rand.Rand, out *egRandomValue) { + out.hash = egBinaryFixedLen + // rnd_bin produces random bytes at INSERT time -- value isn't known + // client-side, only its fixed length is. + out.literal = fmt.Sprintf("rnd_bin(%d, %d, 0)", egBinaryFixedLen, egBinaryFixedLen) +} +func (egBinaryGen) sqlType() string { return "BINARY" } +func (egBinaryGen) supportsNull() bool { return false } + +type egGeoHashGen struct { + precisionBits int + literal string +} + +func (g egGeoHashGen) observedHash(b *QwpColumnBatch, col, row int) int64 { + // No geohash scalar accessor on the Go batch surface. 
The frame + // having decoded (this code runs only on non-null cells, and the + // null-bitmap is asserted separately) is the existence guarantee, + // matching Java's discard-the-value getGeohashValue() call. + return 1 +} +func (g egGeoHashGen) randomValue(r *rand.Rand, out *egRandomValue) { + out.hash = 1 + out.literal = g.literal +} +func (g egGeoHashGen) sqlType() string { return fmt.Sprintf("GEOHASH(%db)", g.precisionBits) } +func (g egGeoHashGen) supportsNull() bool { return true } + +// egDecimal64Gen: value*10^scale stored as a long, so the on-wire bits +// are known and CAN be bit-verified. Scale is captured at construction. +type egDecimal64Gen struct { + precision int + scale int + divisor int64 +} + +func newEgDecimal64Gen(precision, scale int) egDecimal64Gen { + if scale < 0 || scale > 18 || scale > precision { + panic(fmt.Sprintf("bad DECIMAL64 (p=%d, s=%d)", precision, scale)) + } + d := int64(1) + for i := 0; i < scale; i++ { + d *= 10 + } + return egDecimal64Gen{precision: precision, scale: scale, divisor: d} +} +func (g egDecimal64Gen) observedHash(b *QwpColumnBatch, col, row int) int64 { + return b.Int64(col, row) +} +func (g egDecimal64Gen) randomValue(r *rand.Rand, out *egRandomValue) { + // Scaled long: the on-wire bits. 6-digit magnitude keeps literal + // construction cheap; the bit-level assertion is magnitude-agnostic. 
+ scaled := int64(r.Intn(1_000_000)) - 500_000 + out.hash = scaled + out.literal = g.toDecimalLiteral(scaled) +} +func (g egDecimal64Gen) sqlType() string { + return fmt.Sprintf("DECIMAL(%d,%d)", g.precision, g.scale) +} +func (g egDecimal64Gen) supportsNull() bool { return true } +func (g egDecimal64Gen) toDecimalLiteral(scaled int64) string { + if g.scale == 0 { + return strconv.FormatInt(scaled, 10) + "m" + } + negative := scaled < 0 + abs := scaled + if negative { + abs = -abs + } + whole := abs / g.divisor + frac := abs % g.divisor + var sb strings.Builder + if negative { + sb.WriteByte('-') + } + sb.WriteString(strconv.FormatInt(whole, 10)) + sb.WriteByte('.') + fs := strconv.FormatInt(frac, 10) + for i := 0; i < g.scale-len(fs); i++ { + sb.WriteByte('0') + } + sb.WriteString(fs) + sb.WriteByte('m') + return sb.String() +} + +type egDecimal128Gen struct{} + +var egDecimal128Literals = []string{ + "1.000001m", "2.500500m", "1234567.123456m", "-999999.999999m", +} + +func (egDecimal128Gen) observedHash(b *QwpColumnBatch, col, row int) int64 { + b.Decimal128Lo(col, row) + b.Decimal128Hi(col, row) + return 1 +} +func (egDecimal128Gen) randomValue(r *rand.Rand, out *egRandomValue) { + out.hash = 1 + out.literal = egDecimal128Literals[r.Intn(len(egDecimal128Literals))] +} +func (egDecimal128Gen) sqlType() string { return "DECIMAL(38,6)" } +func (egDecimal128Gen) supportsNull() bool { return true } + +type egDecimal256Gen struct{} + +var egDecimal256Literals = []string{ + "1.0000000001m", "100.1234567890m", "-1.5m", "99999999.0000000001m", +} + +func (egDecimal256Gen) observedHash(b *QwpColumnBatch, col, row int) int64 { + for w := 0; w < 4; w++ { + b.Long256Word(col, row, w) + } + return 1 +} +func (egDecimal256Gen) randomValue(r *rand.Rand, out *egRandomValue) { + out.hash = 1 + out.literal = egDecimal256Literals[r.Intn(len(egDecimal256Literals))] +} +func (egDecimal256Gen) sqlType() string { return "DECIMAL(76,10)" } +func (egDecimal256Gen) supportsNull() bool { 
return true } + +type egDoubleArrayGen struct{} + +func (egDoubleArrayGen) observedHash(b *QwpColumnBatch, col, row int) int64 { + arr := b.Float64Array(col, row) + if arr == nil { + return 0 + } + return int64(len(arr)) +} +func (egDoubleArrayGen) randomValue(r *rand.Rand, out *egRandomValue) { + n := 1 + r.Intn(4) + var sb strings.Builder + sb.WriteString("ARRAY[") + for i := 0; i < n; i++ { + if i > 0 { + sb.WriteString(", ") + } + d := (r.Float64() - 0.5) * 100 + sb.WriteString("CAST(") + sb.WriteString(strconv.FormatFloat(d, 'e', 17, 64)) + sb.WriteString(" AS DOUBLE)") + } + sb.WriteByte(']') + out.hash = int64(n) + out.literal = sb.String() +} +func (egDoubleArrayGen) sqlType() string { return "DOUBLE[]" } +func (egDoubleArrayGen) supportsNull() bool { return true } + +// --- query planning --------------------------------------------------- + +// egQueryPlan describes one random query: SQL text, resultCol->origCol +// map, the inclusive 1-based row-id range that should appear, and +// whether rows come back descending. +type egQueryPlan struct { + sql string + colMap []int + firstRow int + lastRow int + descending bool +} + +func egIdentity(n int) []int { + a := make([]int, n) + for i := range a { + a[i] = i + } + return a +} + +func egAllDataCols(colCount int) string { + var sb strings.Builder + for i := 0; i < colCount; i++ { + if i > 0 { + sb.WriteString(", ") + } + sb.WriteByte('c') + sb.WriteString(strconv.Itoa(i)) + } + return sb.String() +} + +func egPickRowCount(r *rand.Rand) int { + // Skewed distribution hitting small, mid, and batch-boundary sizes. + choices := []int{1, 2, 7, 64, 257, egressFuzzMaxRowsPerCase - 1, egressFuzzMaxRowsPerCase} + return choices[r.Intn(len(choices))] +} + +// egJavaStringHashCode reproduces java.lang.String.hashCode so the +// per-table shape rotation matches the Java test's caseSalt semantics. 
+func egJavaStringHashCode(s string) int32 { + var h int32 + for i := 0; i < len(s); i++ { + h = 31*h + int32(s[i]) + } + return h +} + +func egFloorMod(x, m int) int { return ((x % m) + m) % m } + +func egPlanQuery(r *rand.Rand, table string, colCount, rowCount, caseIdx int) egQueryPlan { + // 4 shapes rotate deterministically so every shape is exercised + // across iterations regardless of seed. + shape := egFloorMod(caseIdx, 4) + if rowCount < 4 { + shape = 0 // small cases: just scan everything + } + + switch shape { + case 1: // projection subset in scrambled order + pickCount := 1 + r.Intn(colCount) + used := make([]bool, colCount) + m := make([]int, pickCount) + for i := 0; i < pickCount; i++ { + var pick int + for { + pick = r.Intn(colCount) + if !used[pick] { + break + } + } + used[pick] = true + m[i] = pick + } + var sql strings.Builder + sql.WriteString("SELECT ") + for i := 0; i < pickCount; i++ { + if i > 0 { + sql.WriteString(", ") + } + sql.WriteByte('c') + sql.WriteString(strconv.Itoa(m[i])) + } + sql.WriteString(" FROM ") + sql.WriteString(table) + sql.WriteString(" ORDER BY id") + return egQueryPlan{sql.String(), m, 1, rowCount, false} + case 2: // id-range filter -- null-bitmap handling across dropped rows + lo := 1 + r.Intn(rowCount) + hi := lo + r.Intn(max(1, rowCount-lo+1)) + sql := "SELECT " + egAllDataCols(colCount) + " FROM " + table + + " WHERE id >= " + strconv.Itoa(lo) + " AND id <= " + strconv.Itoa(hi) + + " ORDER BY id" + return egQueryPlan{sql, egIdentity(colCount), lo, hi, false} + case 3: // reverse + LIMIT -- last K rows, descending + k := 1 + r.Intn(rowCount) + sql := "SELECT " + egAllDataCols(colCount) + " FROM " + table + + " ORDER BY id DESC LIMIT " + strconv.Itoa(k) + return egQueryPlan{sql, egIdentity(colCount), rowCount - k + 1, rowCount, true} + default: + sql := "SELECT " + egAllDataCols(colCount) + " FROM " + table + " ORDER BY id" + return egQueryPlan{sql, egIdentity(colCount), 1, rowCount, false} + } +} + +// --- 
per-cell verification -------------------------------------------- + +type egAssertionState struct { + plan egQueryPlan + cols []egColumnGenerator + expected [][]int64 + expectedNull [][]bool + observed int +} + +func (s *egAssertionState) observe(t *testing.T, b *QwpColumnBatch) { + t.Helper() + n := b.RowCount() + resultColCount := len(s.plan.colMap) + for rr := 0; rr < n; rr++ { + var logicalRow int + if s.plan.descending { + logicalRow = s.plan.lastRow - s.observed + } else { + logicalRow = s.plan.firstRow + s.observed + } + rowIdx := logicalRow - 1 + for rc := 0; rc < resultColCount; rc++ { + origCol := s.plan.colMap[rc] + ctx := fmt.Sprintf("row=%d resultCol=%d origCol=%d type=%s sql=%s", + logicalRow, rc, origCol, s.cols[origCol].sqlType(), s.plan.sql) + if s.expectedNull[rowIdx][origCol] { + if !b.IsNull(rc, rr) { + t.Fatalf("expected NULL: %s", ctx) + } + } else { + if b.IsNull(rc, rr) { + t.Fatalf("expected non-NULL: %s", ctx) + } + got := s.cols[origCol].observedHash(b, rc, rr) + if want := s.expected[rowIdx][origCol]; got != want { + t.Fatalf("value mismatch: %s want=%d got=%d", ctx, want, got) + } + } + } + s.observed++ + } +} + +func (s *egAssertionState) end(t *testing.T, totalRows int64) { + t.Helper() + expectedRows := s.plan.lastRow - s.plan.firstRow + 1 + if totalRows != int64(expectedRows) { + t.Fatalf("row count (TotalRows) for %s: want %d got %d", + s.plan.sql, expectedRows, totalRows) + } + if s.observed != expectedRows { + t.Fatalf("row count (observed) for %s: want %d got %d", + s.plan.sql, expectedRows, s.observed) + } +} + +// --- one fuzz case ---------------------------------------------------- + +func newEgressClient(t *testing.T, srv *qwpFuzzServer) *QwpQueryClient { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + c, err := QwpQueryClientFromConf(ctx, srv.connConf()) + if err != nil { + t.Fatalf("QwpQueryClientFromConf(%q): %v", srv.connConf(), err) + } + return c +} + +func 
closeEgressClient(c *QwpQueryClient) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _ = c.Close(ctx) +} + +// egInsertRows builds and runs the multi-row INSERT, split into +// length-budgeted sub-statements. id = r+1; ts = r*1000us (1ms/row) +// keeps the whole run inside one partition for any practical row count. +func egInsertRows(t *testing.T, srv *qwpFuzzServer, table string, + colCount int, literals [][]string, rowCount int) { + t.Helper() + prefix := "INSERT INTO " + table + " VALUES " + var sb strings.Builder + rowsInChunk := 0 + flush := func() { + if rowsInChunk == 0 { + return + } + srv.mustExec(t, sb.String()) + sb.Reset() + rowsInChunk = 0 + } + for rIdx := 0; rIdx < rowCount; rIdx++ { + var row strings.Builder + row.WriteByte('(') + row.WriteString(strconv.Itoa(rIdx + 1)) + row.WriteString(", CAST(") + row.WriteString(strconv.FormatInt(int64(rIdx)*1000, 10)) + row.WriteString(" AS TIMESTAMP)") + for c := 0; c < colCount; c++ { + row.WriteString(", ") + row.WriteString(literals[rIdx][c]) + } + row.WriteByte(')') + + if rowsInChunk > 0 && sb.Len()+2+row.Len() > egInsertChunkBudget { + flush() + } + if rowsInChunk == 0 { + sb.WriteString(prefix) + } else { + sb.WriteString(", ") + } + sb.WriteString(row.String()) + rowsInChunk++ + } + flush() +} + +func egRunOneCase(t *testing.T, srv *qwpFuzzServer, c *QwpQueryClient, + table string, colCount int, r *rand.Rand) { + t.Helper() + + cols := make([]egColumnGenerator, colCount) + nullable := make([]bool, colCount) + for i := 0; i < colCount; i++ { + cols[i] = egGenerators[r.Intn(len(egGenerators))] + nullable[i] = cols[i].supportsNull() && r.Intn(2) == 0 + } + rowCount := egPickRowCount(r) + + // id anchors ORDER BY; ts is the designated timestamp so the table + // runs as WAL (matches production; DROP goes through WAL apply). 
+ var ddl strings.Builder + ddl.WriteString("CREATE TABLE ") + ddl.WriteString(table) + ddl.WriteString(" (id LONG, ts TIMESTAMP") + for i := 0; i < colCount; i++ { + ddl.WriteString(", c") + ddl.WriteString(strconv.Itoa(i)) + ddl.WriteByte(' ') + ddl.WriteString(cols[i].sqlType()) + } + ddl.WriteString(") TIMESTAMP(ts) PARTITION BY DAY WAL") + srv.mustExec(t, ddl.String()) + defer srv.mustExec(t, "DROP TABLE IF EXISTS '"+table+"'") + + // Roll values in Go; remember expected hash + null-ness per cell. + expected := make([][]int64, rowCount) + expectedNull := make([][]bool, rowCount) + literals := make([][]string, rowCount) + var buf egRandomValue + for rr := 0; rr < rowCount; rr++ { + expected[rr] = make([]int64, colCount) + expectedNull[rr] = make([]bool, colCount) + literals[rr] = make([]string, colCount) + for cc := 0; cc < colCount; cc++ { + isNull := nullable[cc] && r.Intn(5) == 0 + if isNull { + expectedNull[rr][cc] = true + literals[rr][cc] = "CAST(NULL AS " + cols[cc].sqlType() + ")" + } else { + cols[cc].randomValue(r, &buf) + expected[rr][cc] = buf.hash + literals[rr][cc] = buf.literal + } + } + } + + egInsertRows(t, srv, table, colCount, literals, rowCount) + // WAL tables commit asynchronously; wait for the apply job before + // the SELECT or we'd race the stream against an empty table view. 
+ srv.awaitRows(t, table, rowCount, 60*time.Second) + + caseSalt := int(egJavaStringHashCode(table)) + plan := egPlanQuery(r, table, colCount, rowCount, caseSalt) + + state := &egAssertionState{ + plan: plan, cols: cols, expected: expected, expectedNull: expectedNull, + } + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + q := c.Query(ctx, plan.sql) + defer q.Close() + for batch, err := range q.Batches() { + if err != nil { + t.Fatalf("egress error [%s]: %v", table, err) + } + state.observe(t, batch) + } + state.end(t, q.TotalRows()) +} + +// --- @Test entry points ----------------------------------------------- + +// TestQwpFuzzEgressRandomSchemaRoundtrip is the main sweep: a fresh +// connection per case so state pollution can't mask a bug. +func TestQwpFuzzEgressRandomSchemaRoundtrip(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + for i := 0; i < 15; i++ { + func() { + c := newEgressClient(t, srv) + defer closeEgressClient(c) + egRunOneCase(t, srv, c, fmt.Sprintf("egfz_iter_%d", i), 1+r.Intn(6), r) + }() + } +} + +// TestQwpFuzzEgressBackToBackSameConnection exercises per-connection +// state that survives across queries: the conn symbol dict, schema +// registry, and Gorilla decoder state. +func TestQwpFuzzEgressBackToBackSameConnection(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + c := newEgressClient(t, srv) + defer closeEgressClient(c) + for q := 0; q < 12; q++ { + egRunOneCase(t, srv, c, fmt.Sprintf("egfz_back_%d", q), 1+r.Intn(4), r) + } +} + +// TestQwpFuzzEgressWideTables stresses the batch buffer's per-column +// state arrays and the schema block encoder with 10-16 columns. 
+func TestQwpFuzzEgressWideTables(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + c := newEgressClient(t, srv) + defer closeEgressClient(c) + egRunOneCase(t, srv, c, "egfz_wide", 10+r.Intn(7), r) +} + +// --- select / alter sequence fuzz ------------------------------------- + +func egCatCount(totalRows, kMod int) int64 { + if kMod == 0 { + return int64(totalRows / 4) + } + return int64((totalRows + 4 - kMod) / 4) +} + +func egCatFor(id int64) byte { return "abcd"[id%4] } + +func egExpectedV(id int64) float64 { return float64(id) * 1.5 } + +func egExpectedTs(id, spacingMicros int64) int64 { return (id - 1) * spacingMicros } + +// egAssertRows drives client.Query(sql) and dispatches every batch to +// verifier, which returns the running total of rows checked. After the +// stream ends both the server-reported total and the observed total +// must equal expected. Per-cell assertions live in the verifier. +func egAssertRows(t *testing.T, c *QwpQueryClient, sql string, expected int64, + verify func(b *QwpColumnBatch, startRow int64) int64) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + q := c.Query(ctx, sql) + defer q.Close() + var seen int64 + for batch, err := range q.Batches() { + if err != nil { + t.Fatalf("query failed [%s]: %v", sql, err) + } + seen = verify(batch, seen) + } + if got := q.TotalRows(); got != expected { + t.Fatalf("row count (TotalRows) [%s]: want %d got %d", sql, expected, got) + } + if seen != expected { + t.Fatalf("row count (observed) [%s]: want %d got %d", sql, expected, seen) + } +} + +func egVerifyBaseColumn(t *testing.T, b *QwpColumnBatch, col, batchRow int, + name string, id, spacingMicros int64, tag string) { + t.Helper() + switch name { + case "id": + if got := b.Int64(col, batchRow); got != id { + t.Fatalf("%s id @ id=%d: want %d got %d", tag, id, id, got) + } + case "v": + if got := b.Float64(col, batchRow); got != egExpectedV(id) { + t.Fatalf("%s v @ id=%d: 
want %v got %v", tag, id, egExpectedV(id), got) + } + case "cat": + seq := b.Str(col, batchRow) + if seq == nil { + t.Fatalf("%s cat must not be NULL @ id=%d", tag, id) + } + if len(seq) != 1 { + t.Fatalf("%s cat byte length @ id=%d: want 1 got %d", tag, id, len(seq)) + } + if seq[0] != egCatFor(id) { + t.Fatalf("%s cat char @ id=%d: want %q got %q", + tag, id, egCatFor(id), seq[0]) + } + case "ts": + if got := b.Int64(col, batchRow); got != egExpectedTs(id, spacingMicros) { + t.Fatalf("%s ts @ id=%d: want %d got %d", + tag, id, egExpectedTs(id, spacingMicros), got) + } + default: + t.Fatalf("%s unknown base column: %s", tag, name) + } +} + +// egAwaitColumnCount polls table_columns until the column count matches +// want. This is the network-client analog of the Java test's +// server.awaitTable() after a structural ALTER: it blocks until the WAL +// apply job has materialised the ADD/DROP COLUMN so a subsequent SELECT +// *'s column set is stable. +func egAwaitColumnCount(t *testing.T, srv *qwpFuzzServer, table string, + want int, timeout time.Duration) { + t.Helper() + deadline := time.Now().Add(timeout) + q := fmt.Sprintf("SELECT count() FROM table_columns('%s')", table) + for { + res, err := srv.execSQL(q) + if err == nil && len(res.Dataset) == 1 && len(res.Dataset[0]) == 1 { + if n, ok := toInt64(res.Dataset[0][0]); ok && n == int64(want) { + return + } + } + if time.Now().After(deadline) { + t.Fatalf("timeout: table %q did not reach %d columns within %s", + table, want, timeout) + } + time.Sleep(100 * time.Millisecond) + } +} + +// egRunSelectShape runs one of six SELECT shapes against the stable +// fz_seq table and asserts BOTH the row count AND per-cell correctness. +// Shapes span both server cursor paths: PageFrameCursor (plain / +// predicate / interval / projection / star) and RecordCursor (GROUP BY). 
+func egRunSelectShape(t *testing.T, srv *qwpFuzzServer, c *QwpQueryClient, + r *rand.Rand, shape, totalRows int, spacingMicros int64, liveAdded []string) { + t.Helper() + switch shape { + case 0: // plain full scan, ts-ordered -> id-ordered; globalRow N -> id N+1 + egAssertRows(t, c, "SELECT id FROM fz_seq", int64(totalRows), + func(b *QwpColumnBatch, startRow int64) int64 { + n := b.RowCount() + for rr := 0; rr < n; rr++ { + expectedId := startRow + int64(rr) + 1 + if got := b.Int64(0, rr); got != expectedId { + t.Fatalf("shape 0 id @ row %d: want %d got %d", + startRow+int64(rr), expectedId, got) + } + } + return startRow + int64(n) + }) + case 1: // id-range predicate, random threshold + threshold := 1 + r.Intn(max(1, totalRows-1)) + expected := int64(totalRows - threshold) + egAssertRows(t, c, + fmt.Sprintf("SELECT id, v FROM fz_seq WHERE id > %d", threshold), + expected, func(b *QwpColumnBatch, startRow int64) int64 { + n := b.RowCount() + for rr := 0; rr < n; rr++ { + expectedId := int64(threshold) + startRow + int64(rr) + 1 + if got := b.Int64(0, rr); got != expectedId { + t.Fatalf("shape 1 id @ row %d: want %d got %d", + startRow+int64(rr), expectedId, got) + } + if got := b.Float64(1, rr); got != egExpectedV(expectedId) { + t.Fatalf("shape 1 v @ row %d: want %v got %v", + startRow+int64(rr), egExpectedV(expectedId), got) + } + } + return startRow + int64(n) + }) + case 2: // GROUP BY -- RecordCursor path; cat cycles 4 symbols -> 4 rows + counts := make(map[byte]int64) + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + q := c.Query(ctx, "SELECT cat, COUNT(*) c FROM fz_seq") + for batch, err := range q.Batches() { + if err != nil { + q.Close() + t.Fatalf("shape 2 query failed: %v", err) + } + for rr := 0; rr < batch.RowCount(); rr++ { + seq := batch.Str(0, rr) + if seq == nil { + q.Close() + t.Fatalf("shape 2 cat must not be NULL") + } + if len(seq) != 1 { + q.Close() + t.Fatalf("shape 2 cat byte length: want 1 got 
%d", len(seq)) + } + counts[seq[0]] = batch.Int64(1, rr) + } + } + q.Close() + if len(counts) != 4 { + t.Fatalf("shape 2 distinct cat count: want 4 got %d", len(counts)) + } + for k, kMod := range map[byte]int{'a': 0, 'b': 1, 'c': 2, 'd': 3} { + if got, want := counts[k], egCatCount(totalRows, kMod); got != want { + t.Fatalf("shape 2 count(%q): want %d got %d", k, want, got) + } + } + case 3: // interval on designated ts -- PageFrameCursor + partition skip + loRow := 1 + r.Intn(max(1, totalRows-2)) + span := 1 + r.Intn(max(1, totalRows-loRow)) + hiRow := loRow + span + tsLo := int64(loRow-1) * spacingMicros + tsHi := int64(hiRow-1) * spacingMicros + egAssertRows(t, c, + fmt.Sprintf("SELECT id FROM fz_seq WHERE ts >= CAST(%d AS TIMESTAMP) "+ + "AND ts < CAST(%d AS TIMESTAMP)", tsLo, tsHi), + int64(span), func(b *QwpColumnBatch, startRow int64) int64 { + n := b.RowCount() + for rr := 0; rr < n; rr++ { + expectedId := int64(loRow) + startRow + int64(rr) + if got := b.Int64(0, rr); got != expectedId { + t.Fatalf("shape 3 id @ row %d: want %d got %d", + startRow+int64(rr), expectedId, got) + } + } + return startRow + int64(n) + }) + case 4: // random projection of the stable base columns + base := []string{"id", "v", "cat", "ts"} + pickCount := 1 + r.Intn(len(base)) + shuffled := append([]string(nil), base...) 
+ for i := len(shuffled) - 1; i > 0; i-- { + j := r.Intn(i + 1) + shuffled[i], shuffled[j] = shuffled[j], shuffled[i] + } + projection := shuffled[:pickCount] + sql := "SELECT " + strings.Join(projection, ", ") + " FROM fz_seq ORDER BY id" + egAssertRows(t, c, sql, int64(totalRows), + func(b *QwpColumnBatch, startRow int64) int64 { + if b.ColumnCount() != len(projection) { + t.Fatalf("shape 4 column count: want %d got %d", + len(projection), b.ColumnCount()) + } + n := b.RowCount() + for rr := 0; rr < n; rr++ { + id := startRow + int64(rr) + 1 + for cc := 0; cc < len(projection); cc++ { + egVerifyBaseColumn(t, b, cc, rr, projection[cc], + id, spacingMicros, "shape 4") + } + } + return startRow + int64(n) + }) + case 5: // SELECT * -- column set follows ADD / DROP automatically + expectedExtras := len(liveAdded) + egAssertRows(t, c, "SELECT * FROM fz_seq", int64(totalRows), + func(b *QwpColumnBatch, startRow int64) int64 { + if b.ColumnCount() != 4+expectedExtras { + t.Fatalf("shape 5 column count: want %d got %d", + 4+expectedExtras, b.ColumnCount()) + } + n := b.RowCount() + for rr := 0; rr < n; rr++ { + id := startRow + int64(rr) + 1 + egVerifyBaseColumn(t, b, 0, rr, "id", id, spacingMicros, "shape 5") + egVerifyBaseColumn(t, b, 1, rr, "v", id, spacingMicros, "shape 5") + egVerifyBaseColumn(t, b, 2, rr, "cat", id, spacingMicros, "shape 5") + egVerifyBaseColumn(t, b, 3, rr, "ts", id, spacingMicros, "shape 5") + for cc := 4; cc < 4+expectedExtras; cc++ { + if !b.IsNull(cc, rr) { + t.Fatalf("shape 5 extra col %d @ row %d must be NULL", + cc, startRow+int64(rr)) + } + } + } + return startRow + int64(n) + }) + default: + t.Fatalf("unknown shape: %d", shape) + } +} + +// TestQwpFuzzEgressSelectAlterSequence fuzzes sequences of SELECT / +// ALTER TABLE ADD|DROP COLUMN against one stable table, mixing six +// SELECT shapes in random order with occasional schema evolutions. 
Each +// ALTER stamps a new tableId and invalidates the server's compile +// cache, so the next SELECT with the same SQL text must detect the +// stale factory and recompile. Added columns are left NULL. +func TestQwpFuzzEgressSelectAlterSequence(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + + rowCount := 50 + r.Intn(951) + // Spacing options (microseconds) stress different partition + // densities for the designated-ts interval predicate. + spacingChoices := []int64{ + 300_000_000, 864_000_000, 3_600_000_000, 21_600_000_000, + } + spacingMicros := spacingChoices[r.Intn(len(spacingChoices))] + opCount := 15 + r.Intn(26) + structuralProbPermil := 150 + r.Intn(251) + maxLiveAddedColumns := 2 + r.Intn(5) + t.Logf("select/alter sequence fuzz: rowCount=%d spacingMicros=%d opCount=%d "+ + "structuralProbPermil=%d maxLiveAddedColumns=%d", + rowCount, spacingMicros, opCount, structuralProbPermil, maxLiveAddedColumns) + + srv.mustExec(t, "DROP TABLE IF EXISTS 'fz_seq'") + defer srv.mustExec(t, "DROP TABLE IF EXISTS 'fz_seq'") + srv.mustExec(t, "CREATE TABLE fz_seq(id LONG, v DOUBLE, cat SYMBOL, ts TIMESTAMP) "+ + "TIMESTAMP(ts) PARTITION BY DAY WAL") + srv.mustExec(t, fmt.Sprintf("INSERT INTO fz_seq SELECT x, x * 1.5, "+ + "CASE WHEN x %% 4 = 0 THEN 'a' WHEN x %% 4 = 1 THEN 'b' "+ + "WHEN x %% 4 = 2 THEN 'c' ELSE 'd' END, "+ + "CAST((x - 1) * %d AS TIMESTAMP) FROM long_sequence(%d)", + spacingMicros, rowCount)) + srv.awaitRows(t, "fz_seq", rowCount, 90*time.Second) + + liveAdded := make([]string, 0, maxLiveAddedColumns) + nextColumnId := 0 + + c := newEgressClient(t, srv) + defer closeEgressClient(c) + + // Seed the cache with a SELECT we'll rerun, so the first structural + // op actually invalidates something. 
+ egRunSelectShape(t, srv, c, r, 0, rowCount, spacingMicros, liveAdded) + + for op := 0; op < opCount; op++ { + structural := r.Intn(1000) < structuralProbPermil + if structural { + canAdd := len(liveAdded) < maxLiveAddedColumns + canDrop := len(liveAdded) > 0 + doAdd := (canAdd && !canDrop) || (canAdd && r.Intn(10) < 6) + if doAdd { + newCol := fmt.Sprintf("extra_%d", nextColumnId) + nextColumnId++ + srv.mustExec(t, "ALTER TABLE fz_seq ADD COLUMN "+newCol+" VARCHAR") + liveAdded = append(liveAdded, newCol) + egAwaitColumnCount(t, srv, "fz_seq", 4+len(liveAdded), 60*time.Second) + t.Logf("[op=%d] ALTER ADD %s", op, newCol) + } else if canDrop { + idx := r.Intn(len(liveAdded)) + victim := liveAdded[idx] + liveAdded = append(liveAdded[:idx], liveAdded[idx+1:]...) + srv.mustExec(t, "ALTER TABLE fz_seq DROP COLUMN "+victim) + egAwaitColumnCount(t, srv, "fz_seq", 4+len(liveAdded), 60*time.Second) + t.Logf("[op=%d] ALTER DROP %s", op, victim) + } else { + egRunSelectShape(t, srv, c, r, r.Intn(6), rowCount, spacingMicros, liveAdded) + } + } else { + egRunSelectShape(t, srv, c, r, r.Intn(6), rowCount, spacingMicros, liveAdded) + } + } +} diff --git a/qwp_encoder.go b/qwp_encoder.go index 4fb0ff25..aa103e6d 100644 --- a/qwp_encoder.go +++ b/qwp_encoder.go @@ -26,14 +26,14 @@ package questdb import "fmt" -// qwpEncoder encodes qwpTableBuffer data into QWP v1 binary messages. +// qwpEncoder encodes qwpTableBuffer data into QWP binary messages. // It owns a reusable qwpWireBuffer to minimize allocations across // successive encode calls. // // Usage: // // var enc qwpEncoder -// msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) +// msg := enc.encodeTable(tb) // // msg is valid until the next encode call. type qwpEncoder struct { wb qwpWireBuffer @@ -52,13 +52,12 @@ type qwpEncoder struct { // slice references the encoder's internal buffer and is valid until // the next encode call. 
// -// schemaId is the connection-scoped schema identifier the server -// uses to register (full mode) or look up (reference mode) this -// table's column set. +// The production cursor sender never invokes this method — it goes +// through encodeMultiTableWithDeltaDict. encodeTable is retained as a +// single-table convenience for tests that build wire-format fixtures +// for the egress decoder. // -// Used for tests and single-table convenience; the production sender -// batches multiple tables through encodeMultiTableWithDeltaDict. Both -// paths set FLAG_DELTA_SYMBOL_DICT (the only symbol-encoding mode +// It sets FLAG_DELTA_SYMBOL_DICT (the only symbol-encoding mode // WebSocket clients emit) and FLAG_GORILLA (timestamp columns are // always preceded by a 1-byte encoding flag; see QWP spec §12). // @@ -66,8 +65,8 @@ type qwpEncoder struct { // // Header (12 bytes, flags=0x0C) → empty DeltaDict → // TableBlock → patched PayloadLength. -func (e *qwpEncoder) encodeTable(tb *qwpTableBuffer, schemaMode qwpSchemaMode, schemaId int) []byte { - return e.encodeTableWithDeltaDict(tb, nil, -1, -1, schemaMode, schemaId) +func (e *qwpEncoder) encodeTable(tb *qwpTableBuffer) []byte { + return e.encodeTableWithDeltaDict(tb, nil, -1, -1) } // encodeTableWithDeltaDict encodes a single table buffer with a @@ -88,38 +87,33 @@ func (e *qwpEncoder) encodeTableWithDeltaDict( globalDict []string, maxSentId int, batchMaxId int, - schemaMode qwpSchemaMode, - schemaId int, ) []byte { e.wb.reset() e.writeHeader(e.headerFlags(), 1) e.writeDeltaDict(globalDict, maxSentId, batchMaxId) - e.writeTableBlock(tb, schemaMode, schemaId) + e.writeTableBlock(tb) e.patchPayloadLength() return e.wb.bytes() } -// qwpTableEncodeInfo carries per-table encoding parameters for -// multi-table message encoding. 
-type qwpTableEncodeInfo struct { - tb *qwpTableBuffer - schemaMode qwpSchemaMode - schemaId int -} - // encodeMultiTableWithDeltaDict encodes multiple table buffers into // a single QWP message with a shared delta symbol dictionary. The // header's tableCount field is set to len(tables), allowing the // server to process all tables from one WebSocket frame. This // reduces round-trips compared to one message per table. // +// Every table block carries its inline column definitions — +// cursor-architecture self-sufficient frames repeat the full schema +// on every frame so reconnect / replay stays safe against a freshly +// connected server. +// // The message layout is: // // Header (12 bytes, tableCount=N) → DeltaDict → // TableBlock₁ → TableBlock₂ → ... → TableBlockₙ → // patched PayloadLength. func (e *qwpEncoder) encodeMultiTableWithDeltaDict( - tables []qwpTableEncodeInfo, + tables []*qwpTableBuffer, globalDict []string, maxSentId int, batchMaxId int, @@ -137,7 +131,7 @@ func (e *qwpEncoder) encodeMultiTableWithDeltaDict( e.writeHeader(e.headerFlags(), uint16(len(tables))) e.writeDeltaDict(globalDict, maxSentId, batchMaxId) for i := range tables { - e.writeTableBlock(tables[i].tb, tables[i].schemaMode, tables[i].schemaId) + e.writeTableBlock(tables[i]) } e.patchPayloadLength() return e.wb.bytes() @@ -198,24 +192,19 @@ func (e *qwpEncoder) writeDeltaDict(globalDict []string, maxSentId, batchMaxId i // --- table block --- -// writeTableBlock writes a single table block: table name, row/col -// counts, schema, and column data. +// writeTableBlock writes a single table block: table name, row and +// column counts, the inline column schema, and the column data. // -// Per QWP spec §9, the schema section starts with a mode byte -// (0x00 = full, 0x01 = reference) followed by a varint schema_id -// in both modes. In full mode the column definitions follow; in -// reference mode the server looks up the schema by ID in its -// per-connection registry. 
-func (e *qwpEncoder) writeTableBlock(tb *qwpTableBuffer, schemaMode qwpSchemaMode, schemaId int) { +// Per the QWP ingress wire format the table block is table_name, +// row_count, col_count, inline columns (a name + type-code pair per +// column), then the per-column data. The schema is always inline — +// the wire carries no schema mode byte and no schema id. +func (e *qwpEncoder) writeTableBlock(tb *qwpTableBuffer) { e.wb.putString(tb.tableName) e.wb.putVarint(uint64(tb.rowCount)) e.wb.putVarint(uint64(len(tb.columns))) - e.wb.putByte(byte(schemaMode)) - e.wb.putVarint(uint64(schemaId)) - if schemaMode == qwpSchemaModeFull { - e.encodeSchemaFull(tb) - } + e.encodeSchemaFull(tb) for _, col := range tb.columns { e.encodeColumnData(col) @@ -355,6 +344,11 @@ func (e *qwpEncoder) encodeArrayColumn(col *qwpColumnBuffer) { // encodeTimestampColumn writes a timestamp column's payload. The wire // shape depends on whether FLAG_GORILLA is set at the message level: // +// Note: DATE is NOT routed here. Ingestion frames DATE as a plain +// int64 (matching the Java QwpColumnWriter); only server *egress* +// frames DATE timestamp-ish. The asymmetry is by protocol design — +// see the DATE case in qwp_query_decoder.go's parseColumn. +// // - FLAG_GORILLA on (default): a 1-byte encoding flag (0x01 = Gorilla, // 0x00 = uncompressed) followed by the payload. Gorilla is used when // the column has more than two non-null values and every DoD fits in @@ -385,9 +379,13 @@ func (e *qwpEncoder) encodeGeohashColumn(col *qwpColumnBuffer) { precision := col.geohashPrecision if precision <= 0 { // No precision established (column has only nulls). - // Write precision 0, no per-row data needed beyond - // the null bitmap (already written). - e.wb.putVarint(0) + // The server validates precision against [1, 60] + // (QwpGeoHashColumnCursor.of) even for all-null + // columns and rejects the whole message otherwise, so + // emit the minimum valid precision. 
valueCount() is 0 + // here, so no per-row data follows. Mirrors the Java + // client's QwpColumnWriter.writeGeoHashColumn clamp. + e.wb.putVarint(1) return } diff --git a/qwp_encoder_test.go b/qwp_encoder_test.go index c3bed1a5..40fc1025 100644 --- a/qwp_encoder_test.go +++ b/qwp_encoder_test.go @@ -50,7 +50,7 @@ func TestQwpEncoderFixedWidthGoldenBytes(t *testing.T) { tb.commitRow() var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb) // Build expected bytes manually. var expected []byte @@ -80,10 +80,6 @@ func TestQwpEncoderFixedWidthGoldenBytes(t *testing.T) { expected = append(expected, 0x02) // ColCount = 2: varint(2) expected = append(expected, 0x02) - // SchemaMode = FULL (0x00) - expected = append(expected, 0x00) - // SchemaId = 0 (varint) - expected = append(expected, 0x00) // Column "a": name varint(1) + 'a', type LONG (0x05) expected = append(expected, 0x01, 0x61, 0x05) // Column "b": name varint(1) + 'b', type DOUBLE (0x07) @@ -120,7 +116,7 @@ func TestQwpEncoderHeader(t *testing.T) { tb.commitRow() var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb) // Verify header fields. if len(msg) < qwpHeaderSize { @@ -157,47 +153,6 @@ func TestQwpEncoderHeader(t *testing.T) { } } -func TestQwpEncoderSchemaReference(t *testing.T) { - tb := newQwpTableBuffer("t") - col, _ := tb.getOrCreateColumn("a", qwpTypeLong, false) - col.addLong(10) - tb.commitRow() - - const schemaId = 7 - - var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeReference, schemaId) - - // Parse past header (12) + empty delta dict (2) + table name "t" (2) + rowCount (1) + colCount (1). - off := 12 + 2 + 2 + 1 + 1 - - // Schema mode should be 0x01 (reference). - if msg[off] != byte(qwpSchemaModeReference) { - t.Fatalf("schemaMode = 0x%02X, want 0x%02X", msg[off], qwpSchemaModeReference) - } - off++ - - // Schema id: varint (single byte for small IDs). 
- gotId, n, err := qwpReadVarint(msg[off:]) - if err != nil { - t.Fatalf("failed to parse schemaId varint: %v", err) - } - if int(gotId) != schemaId { - t.Fatalf("schemaId = %d, want %d", gotId, schemaId) - } - off += n - - // Column data: null bitmap flag (0x00) + 1 × int64 LE = 10. - if msg[off] != 0x00 { - t.Fatalf("null bitmap flag = 0x%02X, want 0x00", msg[off]) - } - off++ - gotVal := int64(binary.LittleEndian.Uint64(msg[off : off+8])) - if gotVal != 10 { - t.Fatalf("column value = %d, want 10", gotVal) - } -} - func TestQwpEncoderAllFixedTypes(t *testing.T) { tb := newQwpTableBuffer("types") @@ -240,7 +195,7 @@ func TestQwpEncoderAllFixedTypes(t *testing.T) { tb.commitRow() var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb) // Verify basic structure: message should be valid and non-empty. if len(msg) < qwpHeaderSize { @@ -281,14 +236,6 @@ func TestQwpEncoderAllFixedTypes(t *testing.T) { t.Fatalf("colCount = %d, want 12", colCount) } - // Schema mode - if msg[off] != 0x00 { - t.Fatalf("schemaMode = 0x%02X, want 0x00", msg[off]) - } - off++ - // Schema id varint (0 = 1 byte). - off++ - // Skip schema definitions (12 columns). for i := 0; i < 12; i++ { nLen, n, _ := qwpReadVarint(msg[off:]) @@ -429,7 +376,7 @@ func TestQwpEncoderNullableColumn(t *testing.T) { tb.commitRow() var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb) // Parse to column data. 
off := qwpHeaderSize + 2 // +2 for empty delta symbol dictionary @@ -438,10 +385,6 @@ func TestQwpEncoderNullableColumn(t *testing.T) { off += 2 // rowCount=3, colCount=1 off += 1 + 1 - // schemaMode=FULL - off++ - // schemaId varint (0 = 1 byte) - off++ // Column "v": varint(1) + 'v' + typeCode (LONG = 0x05, no nullable flag) off += 2 if msg[off] != 0x05 { @@ -501,26 +444,24 @@ func TestQwpEncoderMultipleColumns(t *testing.T) { tb.commitRow() var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb) // Expected payload: // empty delta symbol dict (deltaStart=0, deltaCount=0): 2 bytes // tableName "multi": 1 + 5 = 6 bytes // rowCount=2: 1 byte // colCount=3: 1 byte - // schemaMode: 1 byte - // schemaId varint(0): 1 byte // 3 columns × (varint(1) + name(1) + type(1)) = 9 bytes // 3 columns × (1 flag byte + 2 rows × 4 bytes) = 3 × 9 = 27 bytes - // Total payload = 2 + 6 + 1 + 1 + 1 + 1 + 9 + 27 = 48 - // Total message = 12 + 48 = 60 + // Total payload = 2 + 6 + 1 + 1 + 9 + 27 = 46 + // Total message = 12 + 46 = 58 payloadLen := binary.LittleEndian.Uint32(msg[8:12]) - if payloadLen != 48 { - t.Fatalf("payloadLength = %d, want 48", payloadLen) + if payloadLen != 46 { + t.Fatalf("payloadLength = %d, want 46", payloadLen) } - if len(msg) != 60 { - t.Fatalf("message length = %d, want 60", len(msg)) + if len(msg) != 58 { + t.Fatalf("message length = %d, want 58", len(msg)) } } @@ -542,7 +483,7 @@ func TestQwpEncoderEmptyTable(t *testing.T) { tb2.reset() var enc qwpEncoder - msg := enc.encodeTable(tb2, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb2) // Parse basic header. 
if len(msg) < qwpHeaderSize { @@ -579,7 +520,7 @@ func TestQwpEncoderReuse(t *testing.T) { col.addLong(1) tb1.commitRow() - msg1 := enc.encodeTable(tb1, qwpSchemaModeFull, 0) + msg1 := enc.encodeTable(tb1) msg1Copy := make([]byte, len(msg1)) copy(msg1Copy, msg1) @@ -588,7 +529,7 @@ func TestQwpEncoderReuse(t *testing.T) { col.addDouble(2.0) tb2.commitRow() - msg2 := enc.encodeTable(tb2, qwpSchemaModeFull, 0) + msg2 := enc.encodeTable(tb2) // msg1's backing buffer may have been reused, but msg1Copy is safe. // Verify msg2 encodes table "t2". @@ -621,7 +562,7 @@ func TestQwpEncoderDecimalSchema(t *testing.T) { tb.commitRow() var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb) // Parse to schema. off := qwpHeaderSize + 2 // +2 for empty delta symbol dictionary @@ -629,10 +570,6 @@ func TestQwpEncoderDecimalSchema(t *testing.T) { off += 2 // rowCount=1, colCount=1 off += 1 + 1 - // schemaMode=FULL - off++ - // schemaId varint (0 = 1 byte) - off++ // Column "d": name varint(1) + 'd' = 2 bytes off += 2 @@ -677,16 +614,14 @@ func TestQwpEncoderBoolGoldenBytes(t *testing.T) { tb.commitRow() var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb) // Skip to column data. off := qwpHeaderSize + 2 // +2 for empty delta symbol dictionary - off += 2 // table name "t" - off += 1 // rowCount=3 - off += 1 // colCount=1 - off += 1 // schemaMode=FULL - off += 1 // schemaId varint (0 = 1 byte) - off += 1 + 4 + 1 // col "flag": varint(4) + "flag" + type + off += 2 // table name "t" + off += 1 // rowCount=3 + off += 1 // colCount=1 + off += 1 + 4 + 1 // col "flag": varint(4) + "flag" + type // Null bitmap flag (0x00) then bool data: 3 bits packed. off++ // null bitmap flag @@ -709,16 +644,14 @@ func TestQwpEncoderBoolNullableGoldenBytes(t *testing.T) { tb.commitRow() var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb) // Skip to column data. 
off := qwpHeaderSize + 2 // +2 for empty delta symbol dictionary - off += 2 // table name "t" - off += 1 // rowCount=3 - off += 1 // colCount=1 - off += 1 // schemaMode=FULL - off += 1 // schemaId varint (0 = 1 byte) - off += 1 + 4 + 1 // col "flag": varint(4) + "flag" + typeCode (BOOLEAN = 0x01) + off += 2 // table name "t" + off += 1 // rowCount=3 + off += 1 // colCount=1 + off += 1 + 4 + 1 // col "flag": varint(4) + "flag" + typeCode (BOOLEAN = 0x01) // Null bitmap flag: 0x01 (has nulls) if msg[off] != 0x01 { @@ -749,16 +682,14 @@ func TestQwpEncoderStringGoldenBytes(t *testing.T) { tb.commitRow() var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb) // Skip to column data. off := qwpHeaderSize + 2 // +2 for empty delta symbol dictionary - off += 2 // table name "t" - off += 1 // rowCount=2 - off += 1 // colCount=1 - off += 1 // schemaMode=FULL - off += 1 // schemaId varint (0 = 1 byte) - off += 1 + 1 + 1 // col "s": varint(1) + "s" + type + off += 2 // table name "t" + off += 1 // rowCount=2 + off += 1 // colCount=1 + off += 1 + 1 + 1 // col "s": varint(1) + "s" + type // Null bitmap flag (0x00) off++ @@ -793,16 +724,14 @@ func TestQwpEncoderSymbolGoldenBytes(t *testing.T) { tb.commitRow() var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb) // Skip to column data. 
off := qwpHeaderSize + 2 // +2 for empty delta symbol dictionary - off += 2 // table name "t" - off += 1 // rowCount=3 - off += 1 // colCount=1 - off += 1 // schemaMode=FULL - off += 1 // schemaId varint (0 = 1 byte) - off += 1 + 3 + 1 // col "sym": varint(3) + "sym" + type + off += 2 // table name "t" + off += 1 // rowCount=3 + off += 1 // colCount=1 + off += 1 + 3 + 1 // col "sym": varint(3) + "sym" + type // Null bitmap flag (0x00) off++ @@ -833,16 +762,14 @@ func TestQwpEncoderArrayGoldenBytes(t *testing.T) { tb.commitRow() var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb) // Skip to column data. off := qwpHeaderSize + 2 // +2 for empty delta symbol dictionary - off += 2 // table name "t" - off += 1 // rowCount=1 - off += 1 // colCount=1 - off += 1 // schemaMode=FULL - off += 1 // schemaId varint (0 = 1 byte) - off += 1 + 3 + 1 // col "arr": varint(3) + "arr" + type + off += 2 // table name "t" + off += 1 // rowCount=1 + off += 1 // colCount=1 + off += 1 + 3 + 1 // col "arr": varint(3) + "arr" + type // Null bitmap flag (0x00) off++ @@ -882,16 +809,14 @@ func TestQwpEncoderVarcharGoldenBytes(t *testing.T) { tb.commitRow() var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb) // Skip to column data. 
off := qwpHeaderSize + 2 // +2 for empty delta symbol dictionary - off += 2 // table name "t" - off += 1 // rowCount=1 - off += 1 // colCount=1 - off += 1 // schemaMode=FULL - off += 1 // schemaId varint (0 = 1 byte) - off += 1 + 1 + 1 // col "v": varint(1) + "v" + type (0x0F) + off += 2 // table name "t" + off += 1 // rowCount=1 + off += 1 // colCount=1 + off += 1 + 1 + 1 // col "v": varint(1) + "v" + type (0x0F) // Null bitmap flag (0x00) off++ @@ -930,7 +855,7 @@ func TestQwpEncoderDeltaDictGoldenBytes(t *testing.T) { tb.commitRow() var enc qwpEncoder - msg := enc.encodeTableWithDeltaDict(tb, globalDict, 0, 2, qwpSchemaModeFull, 0) + msg := enc.encodeTableWithDeltaDict(tb, globalDict, 0, 2) // Verify header. magic := binary.LittleEndian.Uint32(msg[0:4]) @@ -1014,14 +939,6 @@ func TestQwpEncoderDeltaDictGoldenBytes(t *testing.T) { t.Fatalf("colCount = %d, want 1", colCount) } - // schemaMode = FULL - if msg[off] != 0x00 { - t.Fatalf("schemaMode = 0x%02X, want 0x00", msg[off]) - } - off++ - // schemaId varint (0 = 1 byte) - off++ - // Column "sym": name + type (SYMBOL = 0x09) symNameLen, n, _ := qwpReadVarint(msg[off:]) off += n @@ -1065,7 +982,7 @@ func TestQwpEncoderDeltaDictEmptyDelta(t *testing.T) { tb.commitRow() var enc qwpEncoder - msg := enc.encodeTableWithDeltaDict(tb, globalDict, 2, 2, qwpSchemaModeFull, 0) + msg := enc.encodeTableWithDeltaDict(tb, globalDict, 2, 2) // Flags: FLAG_DELTA_SYMBOL_DICT | FLAG_GORILLA. wantFlags := qwpFlagDeltaSymbolDict | qwpFlagGorilla @@ -1108,7 +1025,7 @@ func TestQwpEncoderDeltaDictAllNew(t *testing.T) { tb.commitRow() var enc qwpEncoder - msg := enc.encodeTableWithDeltaDict(tb, globalDict, -1, 1, qwpSchemaModeFull, 0) + msg := enc.encodeTableWithDeltaDict(tb, globalDict, -1, 1) off := qwpHeaderSize @@ -1144,45 +1061,6 @@ func TestQwpEncoderDeltaDictAllNew(t *testing.T) { } } -func TestQwpEncoderDeltaDictWithSchemaRef(t *testing.T) { - // Delta dict + schema reference mode. 
- globalDict := []string{"A"} - - tb := newQwpTableBuffer("t") - col, _ := tb.getOrCreateColumn("s", qwpTypeSymbol, false) - col.addSymbolID(0) - tb.commitRow() - - const schemaId = 11 - - var enc qwpEncoder - msg := enc.encodeTableWithDeltaDict(tb, globalDict, -1, 0, qwpSchemaModeReference, schemaId) - - off := qwpHeaderSize - - // Skip delta dict: deltaStart=0, deltaCount=1, "A" - off += 1 + 1 + 1 + 1 // varint(0) + varint(1) + varint(1) + 'A' - - // Skip table name "t" - off += 1 + 1 - // rowCount=1, colCount=1 - off += 1 + 1 - // schemaMode = REFERENCE (0x01) - if msg[off] != 0x01 { - t.Fatalf("schemaMode = 0x%02X, want 0x01", msg[off]) - } - off++ - - // Schema id: varint. - gotId, _, err := qwpReadVarint(msg[off:]) - if err != nil { - t.Fatalf("parse schemaId: %v", err) - } - if int(gotId) != schemaId { - t.Fatalf("schemaId = %d, want %d", gotId, schemaId) - } -} - // --- Geohash encoder tests --- func TestQwpEncoderGeohashGoldenBytes(t *testing.T) { @@ -1206,16 +1084,14 @@ func TestQwpEncoderGeohashGoldenBytes(t *testing.T) { tb.commitRow() var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb) // Skip to column data. off := qwpHeaderSize + 2 // +2 for empty delta symbol dictionary - off += 2 // table name "t" - off += 1 // rowCount=3 - off += 1 // colCount=1 - off += 1 // schemaMode=FULL - off += 1 // schemaId varint (0 = 1 byte) - off += 1 + 3 + 1 // col "geo": varint(3) + "geo" + type (0x0E) + off += 2 // table name "t" + off += 1 // rowCount=3 + off += 1 // colCount=1 + off += 1 + 3 + 1 // col "geo": varint(3) + "geo" + type (0x0E) // Null bitmap flag (0x00) off++ @@ -1271,16 +1147,14 @@ func TestQwpEncoderGeohashNullable(t *testing.T) { tb.commitRow() var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb) // Skip to column data. 
off := qwpHeaderSize + 2 // +2 for empty delta symbol dictionary - off += 2 // table name "t" - off += 1 // rowCount=3 - off += 1 // colCount=1 - off += 1 // schemaMode=FULL - off += 1 // schemaId varint (0 = 1 byte) - off += 1 + 3 + 1 // col "geo": varint(3) + "geo" + type (0x0E, no nullable flag) + off += 2 // table name "t" + off += 1 // rowCount=3 + off += 1 // colCount=1 + off += 1 + 3 + 1 // col "geo": varint(3) + "geo" + type (0x0E, no nullable flag) // Null bitmap flag: 0x01 (has nulls) if msg[off] != 0x01 { @@ -1320,6 +1194,53 @@ func TestQwpEncoderGeohashNullable(t *testing.T) { } } +func TestQwpEncoderGeohashAllNull(t *testing.T) { + // All-null geohash column: precision was never established. + // The encoder must still emit a precision in [1, 60] — the + // server validates it (QwpGeoHashColumnCursor.of) and rejects + // the whole message on 0. Mirrors the Java client clamp. + tb := newQwpTableBuffer("t") + col, _ := tb.getOrCreateColumn("g", qwpTypeGeohash, true) + col.addNull() + tb.commitRow() + col, _ = tb.getOrCreateColumn("g", qwpTypeGeohash, true) + col.addNull() + tb.commitRow() + + var enc qwpEncoder + msg := enc.encodeTable(tb) + + // Skip to column data. + off := qwpHeaderSize + 2 // +2 for empty delta symbol dictionary + off += 2 // table name "t" + off += 1 // rowCount=2 + off += 1 // colCount=1 + off += 1 + 1 + 1 // col "g": varint(1) + "g" + type + + // Null bitmap flag: 0x01 (has nulls). + if msg[off] != 0x01 { + t.Fatalf("null bitmap flag = 0x%02X, want 0x01", msg[off]) + } + off++ + + // Null bitmap: rows 0 and 1 null → bits 0,1 → 0x03. + if msg[off] != 0x03 { + t.Fatalf("null bitmap = 0x%02X, want 0x03", msg[off]) + } + off++ + + // Precision varint: must be 1 (minimum valid), never 0. + if msg[off] != 0x01 { + t.Fatalf("precision varint = 0x%02X, want 0x01 (server rejects 0)", msg[off]) + } + off++ + + // No value data: valueCount == 0 for an all-null column. 
+ if off != len(msg) { + t.Fatalf("unconsumed bytes: off=%d, len=%d", off, len(msg)) + } +} + func TestQwpEncoderGeohashPrecision8(t *testing.T) { // Precision=8 bits → exactly 1 byte per row. tb := newQwpTableBuffer("t") @@ -1330,16 +1251,14 @@ func TestQwpEncoderGeohashPrecision8(t *testing.T) { tb.commitRow() var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb) // Skip to column data. off := qwpHeaderSize + 2 // +2 for empty delta symbol dictionary - off += 2 // table name "t" - off += 1 // rowCount=1 - off += 1 // colCount=1 - off += 1 // schemaMode=FULL - off += 1 // schemaId varint (0 = 1 byte) - off += 1 + 1 + 1 // col "g": varint(1) + "g" + type + off += 2 // table name "t" + off += 1 // rowCount=1 + off += 1 // colCount=1 + off += 1 + 1 + 1 // col "g": varint(1) + "g" + type // Null bitmap flag (0x00) off++ @@ -1371,16 +1290,14 @@ func TestQwpEncoderGeohashPrecision60(t *testing.T) { tb.commitRow() var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb) // Skip to column data. 
off := qwpHeaderSize + 2 // +2 for empty delta symbol dictionary - off += 2 // table name "t" - off += 1 // rowCount=1 - off += 1 // colCount=1 - off += 1 // schemaMode - off += 1 // schemaId varint (0 = 1 byte) - off += 1 + 1 + 1 // col "g" + off += 2 // table name "t" + off += 1 // rowCount=1 + off += 1 // colCount=1 + off += 1 + 1 + 1 // col "g" // Null bitmap flag (0x00) off++ @@ -1411,7 +1328,7 @@ func TestQwpEncoderGorillaFlag(t *testing.T) { tb.commitRow() var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb) flags := msg[qwpHeaderOffsetFlags] if flags&qwpFlagGorilla == 0 { @@ -1430,7 +1347,7 @@ func TestQwpEncoderGorillaFlag(t *testing.T) { globalDict := []string{"sym0"} var enc qwpEncoder - msg := enc.encodeTableWithDeltaDict(tb, globalDict, -1, 0, qwpSchemaModeFull, 0) + msg := enc.encodeTableWithDeltaDict(tb, globalDict, -1, 0) flags := msg[qwpHeaderOffsetFlags] if flags&qwpFlagGorilla == 0 { @@ -1454,15 +1371,13 @@ func TestQwpEncoderTimestampEncodingPrefix(t *testing.T) { tb.commitRow() var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb) // Parse to column data section. 
off := qwpHeaderSize + 2 // +2 for empty delta symbol dictionary off += 2 // table name "t" (varint 1 + 't') off++ // rowCount=1 off++ // colCount=1 - off++ // schemaMode=FULL - off++ // schemaId varint (0 = 1 byte) off += 4 // column "ts": varint(2) + "ts" + typeCode TIMESTAMP (0x0A) if msg[off] != 0x00 { @@ -1499,14 +1414,12 @@ func TestQwpEncoderTimestampGorillaPath(t *testing.T) { } var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb) off := qwpHeaderSize + 2 // empty delta dict off += 2 // table name "t" off++ // rowCount=5 off++ // colCount=1 - off++ // schemaMode=FULL - off++ // schemaId=0 off += 4 // column "ts" + type TIMESTAMP off++ // null bitmap flag (0x00 no nulls) @@ -1534,14 +1447,12 @@ func TestQwpEncoderTimestampGorillaOverflowFallback(t *testing.T) { } var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb) off := qwpHeaderSize + 2 off += 2 // table name off++ // rowCount off++ // colCount - off++ // schemaMode - off++ // schemaId off += 4 // column "ts" + type off++ // null bitmap flag @@ -1573,7 +1484,7 @@ func TestQwpEncoderGorillaDisabled(t *testing.T) { var enc qwpEncoder enc.gorillaDisabled = true - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb) flags := msg[qwpHeaderOffsetFlags] if flags&qwpFlagGorilla != 0 { @@ -1587,8 +1498,6 @@ func TestQwpEncoderGorillaDisabled(t *testing.T) { off += 2 // table name "t" off++ // rowCount=5 off++ // colCount=1 - off++ // schemaMode=FULL - off++ // schemaId=0 off += 4 // column "ts" + type TIMESTAMP off++ // null bitmap flag (0x00 no nulls) @@ -1623,11 +1532,11 @@ func TestQwpEncoderMultiTable(t *testing.T) { col.addString("hello") tb3.commitRow() - tables := []qwpTableEncodeInfo{ - {tb: tb1, schemaMode: qwpSchemaModeFull, schemaId: 0}, - {tb: tb2, schemaMode: qwpSchemaModeFull, schemaId: 1}, - {tb: tb3, schemaMode: qwpSchemaModeReference, schemaId: 2}, - } + // The multi-table 
production path writes every table block with its + // full inline schema (no mode byte, no schema_id) — matching the + // c-questdb-client live path. The test verifies all three tables + // carry their inline column definitions. + tables := []*qwpTableBuffer{tb1, tb2, tb3} globalDict := []string{"sym0"} var enc qwpEncoder @@ -1690,11 +1599,6 @@ func TestQwpEncoderMultiTable(t *testing.T) { if colCount != 1 { t.Fatalf("table 1 colCount = %d, want 1", colCount) } - if msg[off] != byte(qwpSchemaModeFull) { - t.Fatalf("table 1 schemaMode = 0x%02X, want FULL", msg[off]) - } - off++ - off++ // schemaId varint (0 = 1 byte) // Skip full schema: col "x" (varint(1) + 'x' + 0x05) slen, n, _ := qwpReadVarint(msg[off:]) off += n + int(slen) + 1 @@ -1710,16 +1614,11 @@ func TestQwpEncoderMultiTable(t *testing.T) { off += int(nameLen) off++ // rowCount=1 off++ // colCount=1 - if msg[off] != byte(qwpSchemaModeFull) { - t.Fatalf("table 2 schemaMode = 0x%02X, want FULL", msg[off]) - } - off++ - off++ // schemaId varint (1 = 1 byte) slen, n, _ = qwpReadVarint(msg[off:]) off += n + int(slen) + 1 // col "y" + type off += 1 + 8 // null flag + double - // Parse table 3: "gamma" with STRING column, REFERENCE schema + // Parse table 3: "gamma" with STRING column. 
nameLen, n, _ = qwpReadVarint(msg[off:]) off += n if string(msg[off:off+int(nameLen)]) != "gamma" { @@ -1728,12 +1627,10 @@ func TestQwpEncoderMultiTable(t *testing.T) { off += int(nameLen) off++ // rowCount=1 off++ // colCount=1 - if msg[off] != byte(qwpSchemaModeReference) { - t.Fatalf("table 3 schemaMode = 0x%02X, want REFERENCE", msg[off]) - } - off++ - off++ // schemaId varint (2 = 1 byte) - off++ // null flag + // Full schema: col "z" + type byte + slen, n, _ = qwpReadVarint(msg[off:]) + off += n + int(slen) + 1 + off++ // null flag // String column: (rowCount+1) uint32 offsets + data // 2 offsets = 8 bytes + "hello" = 5 bytes off += 8 + 5 @@ -1756,7 +1653,7 @@ func TestQwpEncoderMultiTable(t *testing.T) { // verification of column encoding. func extractColumnData(tb *qwpTableBuffer) []byte { var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb) off := qwpHeaderSize + 2 // +2 for empty delta symbol dictionary // Skip table name (varint string). @@ -1768,10 +1665,6 @@ func extractColumnData(tb *qwpTableBuffer) []byte { // Skip colCount varint. _, n, _ = qwpReadVarint(msg[off:]) off += n - // Skip schemaMode (1 byte = FULL). - off++ - // Skip schemaId varint (0 = 1 byte). - off++ // Skip schema: for each column, varint string + 1 byte type code. 
for i := 0; i < len(tb.columns); i++ { sLen, sn, _ := qwpReadVarint(msg[off:]) diff --git a/qwp_error_api_conf_test.go b/qwp_error_api_conf_test.go new file mode 100644 index 00000000..6d1e446e --- /dev/null +++ b/qwp_error_api_conf_test.go @@ -0,0 +1,193 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb_test + +import ( + "strings" + "testing" + + qdb "github.com/questdb/go-questdb-client/v4" +) + +// TestErrorApiConfStringHappyPath parses each new connect-string key +// and asserts it lands on the right slot. 
+func TestErrorApiConfStringHappyPath(t *testing.T) { + cases := []struct { + conf string + wantGlobal qdb.Policy + wantSchema qdb.Policy + wantParse qdb.Policy + }{ + { + conf: "ws::addr=h:9000;on_server_error=halt;", + wantGlobal: qdb.PolicyHalt, + }, + { + conf: "ws::addr=h:9000;on_server_error=drop;", + wantGlobal: qdb.PolicyDropAndContinue, + }, + { + conf: "ws::addr=h:9000;on_server_error=auto;", + wantGlobal: qdb.PolicyAuto, + }, + { + conf: "ws::addr=h:9000;on_schema_error=halt;", + wantSchema: qdb.PolicyHalt, + }, + { + conf: "ws::addr=h:9000;on_parse_error=drop;", + wantParse: qdb.PolicyDropAndContinue, + }, + { + conf: "ws::addr=h:9000;on_internal_error=halt;on_security_error=drop;on_write_error=halt;", + wantGlobal: qdb.PolicyAuto, + }, + { + conf: "ws::addr=h:9000;error_inbox_capacity=64;", + wantGlobal: qdb.PolicyAuto, + }, + } + for _, tc := range cases { + t.Run(tc.conf, func(t *testing.T) { + _, err := qdb.ConfFromStr(tc.conf) + if err != nil { + t.Fatalf("ConfFromStr(%q) = %v, want nil", tc.conf, err) + } + }) + } +} + +// TestErrorApiConfStringInvalidValues asserts each new key rejects +// nonsense values with NewInvalidConfigStrError. 
+func TestErrorApiConfStringInvalidValues(t *testing.T) { + cases := []struct { + conf string + want string + }{ + {"ws::addr=h:9000;on_server_error=foo;", "on_server_error"}, + {"ws::addr=h:9000;on_schema_error=auto;", "on_schema_error"}, + {"ws::addr=h:9000;on_parse_error=foo;", "on_parse_error"}, + {"ws::addr=h:9000;on_internal_error=banana;", "on_internal_error"}, + {"ws::addr=h:9000;on_security_error=;", "on_security_error"}, + {"ws::addr=h:9000;on_write_error=halts;", "on_write_error"}, + {"ws::addr=h:9000;error_inbox_capacity=-1;", "error_inbox_capacity"}, + {"ws::addr=h:9000;error_inbox_capacity=0;", "error_inbox_capacity"}, + } + for _, tc := range cases { + t.Run(tc.conf, func(t *testing.T) { + _, err := qdb.ConfFromStr(tc.conf) + if err == nil { + t.Fatalf("ConfFromStr(%q) should fail", tc.conf) + } + if !strings.Contains(err.Error(), tc.want) { + t.Fatalf("error = %v, want to contain %q", err, tc.want) + } + }) + } +} + +// TestErrorApiConfStringQwpOnly asserts each new key is rejected for +// HTTP and TCP transports. +func TestErrorApiConfStringQwpOnly(t *testing.T) { + keys := []string{ + "on_server_error=halt", + "on_schema_error=halt", + "on_parse_error=halt", + "on_internal_error=halt", + "on_security_error=halt", + "on_write_error=halt", + "error_inbox_capacity=32", + } + prefixes := []string{"http", "tcp"} + for _, prefix := range prefixes { + for _, k := range keys { + conf := prefix + "::addr=h:9000;" + k + ";" + t.Run(conf, func(t *testing.T) { + _, err := qdb.ConfFromStr(conf) + if err == nil { + t.Fatalf("%s should reject %s", prefix, k) + } + if !strings.Contains(err.Error(), "QWP") { + t.Fatalf("error = %v, want to mention QWP", err) + } + }) + } + } +} + +// TestErrorApiSanitizerRejectsTinyInbox asserts the sanitizer rejects +// error_inbox_capacity values below the spec floor of 16. 
+func TestErrorApiSanitizerRejectsTinyInbox(t *testing.T) {
+	// Every rejected capacity is expected to produce an error whose text
+	// contains the same ">=" fragment.
+	const wantFragment = ">="
+	tooSmall := []string{
+		"ws::addr=h:9000;error_inbox_capacity=1;",
+		"ws::addr=h:9000;error_inbox_capacity=15;",
+	}
+	for _, connStr := range tooSmall {
+		t.Run(connStr, func(t *testing.T) {
+			_, err := qdb.ConfFromStr(connStr)
+			switch {
+			case err == nil:
+				t.Fatalf("ConfFromStr(%q) should fail", connStr)
+			case !strings.Contains(err.Error(), wantFragment):
+				t.Fatalf("error = %v, want to contain %q", err, wantFragment)
+			}
+		})
+	}
+}
+
+// TestErrorApiSanitizerAcceptsAtFloor asserts capacity=16 passes.
+func TestErrorApiSanitizerAcceptsAtFloor(t *testing.T) {
+	_, err := qdb.ConfFromStr("ws::addr=h:9000;error_inbox_capacity=16;")
+	if err != nil {
+		t.Fatalf("capacity=16 should pass, got %v", err)
+	}
+}
+
+// TestErrorApiWithErrorPolicyAutoClearsPerCatSet asserts that a
+// non-Auto override followed by PolicyAuto on the same category
+// nets out to "no per-category override set", so the HTTP/TCP
+// sanitizers do not falsely reject the build as a QWP-only API use.
+func TestErrorApiWithErrorPolicyAutoClearsPerCatSet(t *testing.T) {
+	type transportCase struct {
+		name string
+		st   qdb.SenderType
+	}
+	transports := []transportCase{
+		{name: "http", st: qdb.HttpSenderType},
+		{name: "tcp", st: qdb.TcpSenderType},
+	}
+	for _, tr := range transports {
+		t.Run(tr.name, func(t *testing.T) {
+			cfg := qdb.NewLineSenderConfig(tr.st)
+			qdb.WithAddress("h:9000")(cfg)
+			// Set a Halt override, then reset the same category to Auto.
+			setHalt := qdb.WithErrorPolicy(qdb.CategorySchemaMismatch, qdb.PolicyHalt)
+			setHalt(cfg)
+			backToAuto := qdb.WithErrorPolicy(qdb.CategorySchemaMismatch, qdb.PolicyAuto)
+			backToAuto(cfg)
+			err := qdb.SanitizeConf(cfg)
+			if err != nil {
+				t.Fatalf("sanitizer should not reject net-Auto per-cat override, got %v", err)
+			}
+		})
+	}
+}
diff --git a/qwp_error_api_integration_test.go b/qwp_error_api_integration_test.go
new file mode 100644
index 00000000..68d319c0
--- /dev/null
+++ b/qwp_error_api_integration_test.go
@@ -0,0 +1,277 @@
+/*+*****************************************************************************
+ *     ___                  _   ____  ____
+ *    / _ \ _   _  ___  ___| |_|  _ \| __ )
+ *   | | | | | | |/ _ \/ __| __| | | |  _ \
+ *   | |_| | |_| |  __/\__ \ |_| |_| | |_) |
+ *    \__\_\\__,_|\___||___/\__|____/|____/
+ *
+ *  Copyright (c) 2014-2019 Appsicle
+ *  Copyright (c) 2019-2026 QuestDB
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ ******************************************************************************/
+
+package questdb
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestErrorApiPerCategory drives every wire status byte through the
+// receiver loop and asserts the resulting Category and Policy.
+func TestErrorApiPerCategory(t *testing.T) {
+	cases := []struct {
+		name       string
+		status     QwpStatusCode
+		wantCat    Category
+		wantPolicy Policy
+		dropPath   bool // true if Policy == DropAndContinue (no terminal error)
+	}{
+		{"SchemaMismatch", QwpStatusSchemaMismatch, CategorySchemaMismatch, PolicyDropAndContinue, true},
+		{"ParseError", QwpStatusParseError, CategoryParseError, PolicyHalt, false},
+		{"InternalError", QwpStatusInternalError, CategoryInternalError, PolicyHalt, false},
+		{"SecurityError", QwpStatusSecurityError, CategorySecurityError, PolicyHalt, false},
+		{"WriteError", QwpStatusWriteError, CategoryWriteError, PolicyDropAndContinue, true},
+		{"Unknown(0xFE)", QwpStatusCode(0xFE), CategoryUnknown, PolicyHalt, false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			opts := qwpSfTestServerOpts{rejectStatus: tc.status}
+			if tc.dropPath {
+				// Reject the first frame only; subsequent frames OK.
+				// Otherwise the loop would Drop forever and we'd never
+				// observe a clean continuation.
+				opts.rejectFirstNFrames = 1
+			}
+			srv := newQwpSfTestServer(t, opts)
+			defer srv.Close()
+
+			s, engine, loop, cleanup := newCursorSenderForTest(t, srv, 0)
+			defer cleanup()
+
+			// Non-blocking send: if the buffer is full the notification
+			// is discarded rather than stalling the caller of the handler.
+			gotCh := make(chan *SenderError, 4)
+			loop.sendLoopSetErrorHandler(func(e *SenderError) {
+				select {
+				case gotCh <- e:
+				default:
+				}
+			}, qwpSfMinErrorInboxCapacity)
+
+			require.NoError(t, s.Table("t").Int64Column("v", 1).AtNow(context.Background()))
+			// Flush's result is ignored on purpose: the rejection is
+			// observed through the handler and the checks below.
+			_ = s.Flush(context.Background())
+
+			select {
+			case got := <-gotCh:
+				assert.Equal(t, tc.wantCat, got.Category, "Category mismatch")
+				assert.Equal(t, tc.wantPolicy, got.AppliedPolicy, "Policy mismatch")
+			case <-time.After(3 * time.Second):
+				t.Fatal("handler not invoked within deadline")
+			}
+
+			if tc.dropPath {
+				// Drop: ackedFsn advances past the rejected span;
+				// LastTerminalError stays nil.
+				require.Eventually(t, func() bool {
+					return engine.engineAckedFsn() >= 0
+				}, 2*time.Second, 1*time.Millisecond)
+				assert.Nil(t, s.LastTerminalError(), "Drop should not latch terminal")
+			} else {
+				// Halt: terminal latched; LastTerminalError non-nil.
+				require.Eventually(t, func() bool {
+					return s.LastTerminalError() != nil
+				}, 2*time.Second, 1*time.Millisecond)
+				se := s.LastTerminalError()
+				require.NotNil(t, se)
+				assert.Equal(t, tc.wantCat, se.Category)
+			}
+		})
+	}
+}
+
+// TestErrorApiOverridePolicyViaResolver registers a programmatic
+// resolver that flips PARSE_ERROR (default Halt) to Drop, and asserts
+// the loop drops + continues instead of latching.
+func TestErrorApiOverridePolicyViaResolver(t *testing.T) {
+	// Only the first frame is rejected, so the second flush can succeed
+	// once the first has been dropped.
+	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{
+		rejectStatus:       QwpStatusParseError,
+		rejectFirstNFrames: 1,
+	})
+	defer srv.Close()
+
+	s, engine, loop, cleanup := newCursorSenderForTest(t, srv, 0)
+	defer cleanup()
+
+	// The resolver overrides PARSE_ERROR only; every other category
+	// gets PolicyAuto back.
+	loop.sendLoopSetPolicyResolver(&qwpSfPolicyResolver{
+		resolver: func(c Category) Policy {
+			if c == CategoryParseError {
+				return PolicyDropAndContinue
+			}
+			return PolicyAuto
+		},
+	})
+
+	// Two frames: first rejected and dropped, second OK.
+	require.NoError(t, s.Table("t").Int64Column("v", 1).AtNow(context.Background()))
+	require.NoError(t, s.Flush(context.Background()))
+	require.NoError(t, s.Table("t").Int64Column("v", 2).AtNow(context.Background()))
+	require.NoError(t, s.Flush(context.Background()))
+
+	require.Eventually(t, func() bool {
+		return engine.engineAckedFsn() >= 1
+	}, 5*time.Second, 1*time.Millisecond,
+		"ackedFsn should advance past the dropped frame")
+	assert.Nil(t, s.LastTerminalError(),
+		"resolver flipped Halt to Drop; no terminal error expected")
+}
+
+// TestErrorApiOverridePolicyViaPerCategory uses the perCat slot to
+// flip SCHEMA_MISMATCH (default Drop) to Halt — mirrors the
+// connect-string on_schema_error=halt path.
+func TestErrorApiOverridePolicyViaPerCategory(t *testing.T) {
+	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{
+		rejectStatus: QwpStatusSchemaMismatch,
+	})
+	defer srv.Close()
+
+	s, _, loop, cleanup := newCursorSenderForTest(t, srv, 0)
+	defer cleanup()
+
+	// Only the per-category slot is populated; no resolver func is set.
+	r := &qwpSfPolicyResolver{}
+	r.perCat[CategorySchemaMismatch] = PolicyHalt
+	loop.sendLoopSetPolicyResolver(r)
+
+	require.NoError(t, s.Table("t").Int64Column("v", 1).AtNow(context.Background()))
+	// Flush's result is ignored on purpose; the latch is polled below.
+	_ = s.Flush(context.Background())
+
+	require.Eventually(t, func() bool {
+		return s.LastTerminalError() != nil
+	}, 2*time.Second, 1*time.Millisecond,
+		"Halt override should latch terminal")
+	se := s.LastTerminalError()
+	require.NotNil(t, se)
+	assert.Equal(t, CategorySchemaMismatch, se.Category)
+	assert.Equal(t, PolicyHalt, se.AppliedPolicy)
+}
+
+// TestErrorApiFsnSpanCorrelation drives a HALT rejection and asserts
+// the [FromFsn, ToFsn] span on the SenderError matches the engine's
+// publishedFsn at the time the rejection was classified. Useful as a
+// sanity check that producer-side FSN and SenderError FSN line up.
+func TestErrorApiFsnSpanCorrelation(t *testing.T) {
+	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{
+		rejectStatus: QwpStatusParseError,
+	})
+	defer srv.Close()
+
+	s, engine, loop, cleanup := newCursorSenderForTest(t, srv, 0)
+	defer cleanup()
+
+	require.NoError(t, s.Table("t").Int64Column("v", 1).AtNow(context.Background()))
+	// Flush may return either nil (rejection not yet classified) or
+	// the typed *SenderError (if the receiver beat us to it). Either
+	// is fine for FSN correlation — we only need the engine's view
+	// of the published FSN.
+	_ = s.Flush(context.Background())
+
+	// Wait for the loop to report an error before reading the span.
+	require.Eventually(t, func() bool {
+		return loop.sendLoopCheckError() != nil
+	}, 2*time.Second, 1*time.Millisecond)
+
+	se := s.LastTerminalError()
+	require.NotNil(t, se)
+	// The rejected frame's FSN must equal the engine's publishedFsn:
+	// only one frame was sent, and the receiver saw it.
+	assert.Equal(t, engine.enginePublishedFsn(), se.FromFsn,
+		"FromFsn should equal publishedFsn for a single-frame batch")
+	assert.Equal(t, se.FromFsn, se.ToFsn,
+		"single-frame span: FromFsn == ToFsn")
+}
+
+// The HALT-vs-concurrent-Flush contract ("every Flush after the latch
+// surfaces the typed *SenderError; never 'callback fired but Flush
+// passed'") is pinned by TestErrorApiResilience_HaltVsConcurrentFlushStress
+// in qwp_error_resilience_test.go, which asserts all-of-N (every
+// hammering goroutine observes the error) after confirming the latch —
+// the quiescent state the LineSender contract actually guarantees.
+
+// TestErrorApiHaltLatchedBeforeHandlerInvoked pins the ordering
+// invariant called out in qwp-cursor-error-api.md §120: on a HALT
+// rejection, the I/O loop must set the lastError /
+// lastTerminalServerError latch BEFORE handing the SenderError to the
+// dispatcher. Otherwise a handler that synchronously probes the
+// terminal state races the latch and may observe "no error" even
+// though the sender just halted.
+//
+// The test registers a handler that probes sendLoopCheckError() and
+// sendLoopLastTerminalServerError() — both are atomic-pointer reads,
+// so they're safe to call from the dispatcher goroutine while the
+// producer is parked. Over many iterations the handler must NEVER
+// see either probe return nil. The previous offer-before-latch
+// ordering would fail this assertion intermittently.
+func TestErrorApiHaltLatchedBeforeHandlerInvoked(t *testing.T) {
+	if testing.Short() {
+		t.Skip("race test skipped in short mode")
+	}
+	// Each iteration runs in its own helper call so the per-iteration
+	// defers (server close, cleanup) fire before the next one starts.
+	const iters = 200
+	for i := 0; i < iters; i++ {
+		runHaltLatchedBeforeHandlerOnce(t, i)
+	}
+}
+
+// runHaltLatchedBeforeHandlerOnce runs a single iteration of the
+// latch-ordering check: it starts a server that rejects with
+// PARSE_ERROR, sends one row, and requires that both terminal-state
+// probes taken inside the error handler are non-nil. iter is used as
+// the row value and in failure messages.
+func runHaltLatchedBeforeHandlerOnce(t *testing.T, iter int) {
+	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{rejectStatus: QwpStatusParseError})
+	defer srv.Close()
+
+	s, _, loop, cleanup := newCursorSenderForTest(t, srv, 0)
+	defer cleanup()
+
+	// handlerObservation captures what the handler saw at invocation time.
+	type handlerObservation struct {
+		checkErr error
+		terminal *SenderError
+	}
+	gotCh := make(chan handlerObservation, 1)
+	loop.sendLoopSetErrorHandler(func(e *SenderError) {
+		// Read-only probes: atomic pointer loads, no race against
+		// the producer. With correct ordering, both must reflect
+		// the terminal state by the time we get here.
+		obs := handlerObservation{
+			checkErr: loop.sendLoopCheckError(),
+			terminal: loop.sendLoopLastTerminalServerError(),
+		}
+		// Non-blocking send: only the first observation is kept.
+		select {
+		case gotCh <- obs:
+		default:
+		}
+	}, qwpSfMinErrorInboxCapacity)
+
+	require.NoError(t, s.Table("t").Int64Column("v", int64(iter)).AtNow(context.Background()))
+	_ = s.Flush(context.Background())
+
+	select {
+	case obs := <-gotCh:
+		require.NotNil(t, obs.checkErr,
+			"iter %d: sendLoopCheckError() must be non-nil inside handler "+
+				"(latch must be set BEFORE dispatch)", iter)
+		require.NotNil(t, obs.terminal,
+			"iter %d: lastTerminalServerError must be non-nil inside handler "+
+				"(latch must be set BEFORE dispatch)", iter)
+	case <-time.After(3 * time.Second):
+		t.Fatalf("iter %d: handler not invoked within deadline", iter)
+	}
+}
diff --git a/qwp_error_resilience_test.go b/qwp_error_resilience_test.go
new file mode 100644
index 00000000..8522ecc5
--- /dev/null
+++ b/qwp_error_resilience_test.go
@@ -0,0 +1,1130 @@
+/*+*****************************************************************************
+ *     ___                  _   ____  ____
+ *    / _ \ _   _  ___  ___| |_|  _ \| __ )
+ *   | | | | | | |/ _ \/ __| __| | | |  _ \
+ *   | |_| |
|_| |  __/\__ \ |_| |_| | |_) |
+ *    \__\_\\__,_|\___||___/\__|____/|____/
+ *
+ *  Copyright (c) 2014-2019 Appsicle
+ *  Copyright (c) 2019-2026 QuestDB
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ ******************************************************************************/
+
+package questdb
+
+// This file holds error-resilience tests that go beyond the unit-style
+// classification / dispatcher / payload tests in
+// qwp_sender_error_api_test.go and qwp_error_api_integration_test.go.
+//
+// Coverage focus:
+//   - Public-API end-to-end: every WithError* builder option and every
+//     on_*_error connect-string key is exercised through
+//     LineSenderFromConf / NewLineSender, so a wiring bug between
+//     conf.* and the running send loop's resolver/dispatcher is caught.
+//   - Reconnect × error: rejections that surface after a reconnect
+//     boundary, with FSN-span correlation against post-reconnect
+//     fsnAtZero.
+//   - SF disk × error: HALT survives close + reopen on the same slot
+//     (matches the spec's "no resumeAfterHalt; close + rebuild =
+//     recovery"); DROP-acked frames are unlinked and don't replay.
+//   - Strict per-category payload assertions: every field of
+//     *SenderError is checked (not just Category + Policy).
+//   - Concurrent halt-vs-flush stress: many iterations, no pre-check,
+//     all hammering goroutines must observe the typed error.
+//   - Dispatcher swap mid-flight: the atomic.Pointer guarantee that a
+//     concurrent WithErrorHandler swap doesn't lose the old handler's
+//     in-flight notifications below the dropped-counter line.
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/coder/websocket"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// addrOf strips the http:// prefix from an httptest.Server URL so the
+// result is suitable as the addr= value in a QWP connect string. A URL
+// without that prefix is returned unchanged.
+func addrOf(srv *qwpSfTestServer) string {
+	return strings.TrimPrefix(srv.URL, "http://")
+}
+
+// asQwp type-asserts to QwpSender (the superset interface that exposes
+// LastTerminalError, TotalServerErrors, etc). Every QWP sender does
+// implement this — the assertion is purely to surface the extra
+// methods on the LineSender returned by LineSenderFromConf. The test
+// fails immediately if the assertion does not hold.
+func asQwp(t *testing.T, ls LineSender) QwpSender {
+	t.Helper()
+	qs, ok := ls.(QwpSender)
+	require.True(t, ok, "LineSender did not implement QwpSender: %T", ls)
+	return qs
+}
+
+// =============================================================================
+// Public-API end-to-end: builder options
+// =============================================================================
+
+// TestErrorApiBuilderOption_WithErrorHandlerInvoked drives a HALT
+// rejection through a sender built via NewLineSender + WithQwp +
+// WithErrorHandler, and asserts the user-supplied handler is invoked.
+// Closes a gap that the unit tests previously left wide open: there
+// was no test that the public option actually wired the handler into
+// the running dispatcher.
+func TestErrorApiBuilderOption_WithErrorHandlerInvoked(t *testing.T) {
+	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{rejectStatus: QwpStatusParseError})
+	defer srv.Close()
+
+	gotCh := make(chan *SenderError, 4)
+	ls, err := NewLineSender(context.Background(),
+		WithQwp(),
+		WithAddress(addrOf(srv)),
+		WithErrorHandler(func(e *SenderError) { gotCh <- e }),
+		WithErrorInboxCapacity(qwpSfMinErrorInboxCapacity),
+	)
+	require.NoError(t, err)
+	defer func() { _ = ls.Close(context.Background()) }()
+
+	require.NoError(t, ls.Table("t").Int64Column("v", 1).AtNow(context.Background()))
+	_ = ls.Flush(context.Background()) // expected to surface the rejection
+
+	select {
+	case got := <-gotCh:
+		assert.Equal(t, CategoryParseError, got.Category)
+		assert.Equal(t, PolicyHalt, got.AppliedPolicy)
+	case <-time.After(3 * time.Second):
+		t.Fatal("user-supplied error handler was not invoked")
+	}
+}
+
+// TestErrorApiBuilderOption_WithErrorPolicyOverride uses
+// WithErrorPolicy(SchemaMismatch, Halt) to flip the spec default
+// (Drop) to Halt, and asserts the next row after the latch surfaces
+// *SenderError.
+func TestErrorApiBuilderOption_WithErrorPolicyOverride(t *testing.T) {
+	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{rejectStatus: QwpStatusSchemaMismatch})
+	defer srv.Close()
+
+	ls, err := NewLineSender(context.Background(),
+		WithQwp(),
+		WithAddress(addrOf(srv)),
+		WithErrorPolicy(CategorySchemaMismatch, PolicyHalt),
+	)
+	require.NoError(t, err)
+	defer func() { _ = ls.Close(context.Background()) }()
+
+	require.NoError(t, ls.Table("t").Int64Column("v", 1).AtNow(context.Background()))
+	// Drive the rejection. Flush may race the receiver, so its result
+	// is ignored and the terminal latch is polled instead; the latch
+	// is what proves the override took effect.
+	_ = ls.Flush(context.Background())
+	require.Eventually(t, func() bool {
+		return asQwp(t, ls).LastTerminalError() != nil
+	}, 3*time.Second, 1*time.Millisecond,
+		"override SchemaMismatch=Halt should latch, but LastTerminalError stayed nil")
+
+	// AtNow surfaces the latched terminal error now that Table()
+	// polls the I/O loop's HALT latch on entry.
+	err = ls.Table("t").Int64Column("v", 2).AtNow(context.Background())
+	require.Error(t, err)
+	var se *SenderError
+	require.True(t, errors.As(err, &se))
+	assert.Equal(t, CategorySchemaMismatch, se.Category)
+	assert.Equal(t, PolicyHalt, se.AppliedPolicy)
+}
+
+// TestErrorApiBuilderOption_WithErrorPolicyResolver registers a
+// programmatic resolver that flips PARSE_ERROR (default Halt) to
+// Drop, and asserts the loop drops + continues past the rejection
+// instead of latching.
+func TestErrorApiBuilderOption_WithErrorPolicyResolver(t *testing.T) {
+	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{
+		rejectStatus:       QwpStatusParseError,
+		rejectFirstNFrames: 1,
+	})
+	defer srv.Close()
+
+	gotCh := make(chan *SenderError, 4)
+	ls, err := NewLineSender(context.Background(),
+		WithQwp(),
+		WithAddress(addrOf(srv)),
+		WithErrorPolicyResolver(func(c Category) Policy {
+			if c == CategoryParseError {
+				return PolicyDropAndContinue
+			}
+			return PolicyAuto
+		}),
+		WithErrorHandler(func(e *SenderError) { gotCh <- e }),
+		WithErrorInboxCapacity(qwpSfMinErrorInboxCapacity),
+	)
+	require.NoError(t, err)
+	qs := asQwp(t, ls)
+	defer func() { _ = ls.Close(context.Background()) }()
+
+	// Two flushes: first rejected and dropped, second OK.
+	require.NoError(t, ls.Table("t").Int64Column("v", 1).AtNow(context.Background()))
+	require.NoError(t, ls.Flush(context.Background()))
+	require.NoError(t, ls.Table("t").Int64Column("v", 2).AtNow(context.Background()))
+	require.NoError(t, ls.Flush(context.Background()))
+
+	select {
+	case got := <-gotCh:
+		assert.Equal(t, CategoryParseError, got.Category)
+		assert.Equal(t, PolicyDropAndContinue, got.AppliedPolicy,
+			"resolver should have flipped Halt → Drop")
+	case <-time.After(3 * time.Second):
+		t.Fatal("handler not invoked: resolver may not have wired through")
+	}
+	assert.Nil(t, qs.LastTerminalError(),
+		"resolver flipped Halt→Drop; no terminal error expected")
+}
+
+// TestErrorApiBuilderOption_WithErrorInboxCapacity sets a small
+// capacity and floods a slow handler, asserting the drop counter
+// rises (i.e., the option actually sized the inbox).
+func TestErrorApiBuilderOption_WithErrorInboxCapacity(t *testing.T) {
+	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{
+		rejectStatus: QwpStatusSchemaMismatch, // Drop policy → no halt
+	})
+	defer srv.Close()
+
+	// The handler blocks until release is closed, so notifications pile
+	// up behind it while the loop below keeps producing rejections.
+	release := make(chan struct{})
+	ls, err := NewLineSender(context.Background(),
+		WithQwp(),
+		WithAddress(addrOf(srv)),
+		WithErrorHandler(func(e *SenderError) { <-release }),
+		WithErrorInboxCapacity(qwpSfMinErrorInboxCapacity),
+	)
+	require.NoError(t, err)
+	qs := asQwp(t, ls)
+	defer func() {
+		// Unblock the handler before closing the sender.
+		close(release)
+		_ = ls.Close(context.Background())
+	}()
+
+	for i := 0; i < 200; i++ {
+		require.NoError(t, ls.Table("t").Int64Column("v", int64(i)).AtNow(context.Background()))
+		require.NoError(t, ls.Flush(context.Background()))
+	}
+	// Arguments to a call are evaluated before the call runs, so passing
+	// the counters as message args to Eventually would report the values
+	// from before the wait. Read them only after the wait has failed.
+	if !assert.Eventually(t, func() bool {
+		return qs.DroppedErrorNotifications() > 0
+	}, 5*time.Second, 10*time.Millisecond) {
+		t.Fatalf("DroppedErrorNotifications never increased: dropped=%d delivered=%d",
+			qs.DroppedErrorNotifications(), qs.TotalErrorNotificationsDelivered())
+	}
+}
+
+// TestErrorApiBuilderOption_ProtocolViolationOverrideIgnored asserts
+// that
WithErrorPolicy(ProtocolViolation, DropAndContinue) is
+// silently ignored — ProtocolViolation is forced HALT regardless.
+// The forced behavior protects users who would otherwise lose
+// connection-gone errors; this matches the spec contract documented
+// on the Policy enum.
+func TestErrorApiBuilderOption_ProtocolViolationOverrideIgnored(t *testing.T) {
+	srv := closeFrameTestServer(t, websocket.StatusProtocolError, "bad framing")
+	defer srv.Close()
+
+	// addrOf is not used here: it takes a *qwpSfTestServer, and srv comes
+	// from closeFrameTestServer instead.
+	addr := strings.TrimPrefix(srv.URL, "http://")
+	ls, err := NewLineSender(context.Background(),
+		WithQwp(),
+		WithAddress(addr),
+		// Try to flip ProtocolViolation to Drop. Should be ignored.
+		WithErrorPolicy(CategoryProtocolViolation, PolicyDropAndContinue),
+	)
+	require.NoError(t, err)
+	qs := asQwp(t, ls)
+	defer func() { _ = ls.Close(context.Background()) }()
+
+	require.NoError(t, ls.Table("t").Int64Column("v", 1).AtNow(context.Background()))
+	// Flush's result is ignored on purpose; the latch is polled below.
+	_ = ls.Flush(context.Background())
+	require.Eventually(t, func() bool {
+		return qs.LastTerminalError() != nil
+	}, 3*time.Second, 1*time.Millisecond,
+		"ProtocolViolation must HALT regardless of user override")
+	se := qs.LastTerminalError()
+	require.NotNil(t, se)
+	assert.Equal(t, CategoryProtocolViolation, se.Category)
+	assert.Equal(t, PolicyHalt, se.AppliedPolicy,
+		"forced HALT for ProtocolViolation should ignore user override")
+}
+
+// =============================================================================
+// Public-API end-to-end: connect-string keys
+// =============================================================================
+
+// TestErrorApiConfString_OnParseErrorDrop builds a sender from a
+// connect string with on_parse_error=drop and asserts the loop
+// continues past PARSE_ERROR rejections instead of latching.
+// End-to-end test of the conf-string → resolver wiring path.
+func TestErrorApiConfString_OnParseErrorDrop(t *testing.T) {
+	// Only the first frame is rejected; later frames are accepted.
+	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{
+		rejectStatus:       QwpStatusParseError,
+		rejectFirstNFrames: 1,
+	})
+	defer srv.Close()
+
+	conf := "ws::addr=" + addrOf(srv) + ";on_parse_error=drop;"
+	ls, err := LineSenderFromConf(context.Background(), conf)
+	require.NoError(t, err)
+	qs := asQwp(t, ls)
+	defer func() { _ = ls.Close(context.Background()) }()
+
+	require.NoError(t, ls.Table("t").Int64Column("v", 1).AtNow(context.Background()))
+	require.NoError(t, ls.Flush(context.Background()))
+	require.NoError(t, ls.Table("t").Int64Column("v", 2).AtNow(context.Background()))
+	require.NoError(t, ls.Flush(context.Background()),
+		"second Flush should succeed because on_parse_error=drop continued past the rejection")
+	// Flush no longer blocks on the server ACK (cursor path, commit
+	// 29a6f12), so the PARSE_ERROR rejection is processed by the send
+	// loop asynchronously. Wait for the counter to reflect it before
+	// asserting; checking the no-latch invariant only afterwards makes
+	// it meaningful (the rejection is known to have been handled).
+	require.Eventually(t, func() bool {
+		return qs.TotalServerErrors() >= 1
+	}, 3*time.Second, 1*time.Millisecond,
+		"the rejection must still bump the server-error counter")
+	assert.Nil(t, qs.LastTerminalError(),
+		"on_parse_error=drop must not latch terminal")
+}
+
+// TestErrorApiConfString_OnSchemaErrorHalt builds a sender from a
+// connect string with on_schema_error=halt and asserts that a
+// SchemaMismatch (Drop by default) instead halts. End-to-end test of
+// the conf-string → resolver wiring path going the other direction
+// (default-Drop flipped to Halt).
+func TestErrorApiConfString_OnSchemaErrorHalt(t *testing.T) {
+	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{rejectStatus: QwpStatusSchemaMismatch})
+	defer srv.Close()
+
+	conf := "ws::addr=" + addrOf(srv) + ";on_schema_error=halt;"
+	ls, err := LineSenderFromConf(context.Background(), conf)
+	require.NoError(t, err)
+	qs := asQwp(t, ls)
+	defer func() { _ = ls.Close(context.Background()) }()
+
+	require.NoError(t, ls.Table("t").Int64Column("v", 1).AtNow(context.Background()))
+	// Flush's result is ignored on purpose; the latch is polled below.
+	_ = ls.Flush(context.Background())
+	require.Eventually(t, func() bool {
+		return qs.LastTerminalError() != nil
+	}, 3*time.Second, 1*time.Millisecond,
+		"on_schema_error=halt should latch the SchemaMismatch as terminal")
+	// Safe to dereference: require.Eventually aborts the test on timeout.
+	assert.Equal(t, CategorySchemaMismatch, qs.LastTerminalError().Category)
+}
+
+// TestErrorApiConfString_OnServerErrorHaltGlobal sets the global
+// override on_server_error=halt and asserts a SchemaMismatch (default
+// Drop) latches as terminal — the global override takes effect since
+// no per-category override is set.
+func TestErrorApiConfString_OnServerErrorHaltGlobal(t *testing.T) {
+	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{rejectStatus: QwpStatusSchemaMismatch})
+	defer srv.Close()
+
+	conf := "ws::addr=" + addrOf(srv) + ";on_server_error=halt;"
+	ls, err := LineSenderFromConf(context.Background(), conf)
+	require.NoError(t, err)
+	qs := asQwp(t, ls)
+	defer func() { _ = ls.Close(context.Background()) }()
+
+	require.NoError(t, ls.Table("t").Int64Column("v", 1).AtNow(context.Background()))
+	// Flush's result is ignored on purpose; the latch is polled below.
+	_ = ls.Flush(context.Background())
+	require.Eventually(t, func() bool {
+		return qs.LastTerminalError() != nil
+	}, 3*time.Second, 1*time.Millisecond)
+	// Safe to dereference: require.Eventually aborts the test on timeout.
+	assert.Equal(t, PolicyHalt, qs.LastTerminalError().AppliedPolicy)
+}
+
+// TestErrorApiConfString_PerCategoryBeatsGlobal asserts the
+// precedence: per-category on_*_error overrides on_server_error.
+func TestErrorApiConfString_PerCategoryBeatsGlobal(t *testing.T) {
+	// Only the first frame is rejected; later frames are accepted.
+	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{
+		rejectStatus:       QwpStatusSchemaMismatch,
+		rejectFirstNFrames: 1,
+	})
+	defer srv.Close()
+
+	// Global=halt, per-category=drop. Per-category must win.
+	conf := "ws::addr=" + addrOf(srv) + ";on_server_error=halt;on_schema_error=drop;"
+	ls, err := LineSenderFromConf(context.Background(), conf)
+	require.NoError(t, err)
+	qs := asQwp(t, ls)
+	defer func() { _ = ls.Close(context.Background()) }()
+
+	require.NoError(t, ls.Table("t").Int64Column("v", 1).AtNow(context.Background()))
+	require.NoError(t, ls.Flush(context.Background()))
+	require.NoError(t, ls.Table("t").Int64Column("v", 2).AtNow(context.Background()))
+	require.NoError(t, ls.Flush(context.Background()),
+		"per-category drop must beat global halt")
+	// Flush does not wait for the server's verdict, so the rejection is
+	// handled asynchronously by the send loop. Wait until it has been
+	// counted before checking the latch; otherwise the no-latch check
+	// could pass simply because the rejection has not been seen yet.
+	require.Eventually(t, func() bool {
+		return qs.TotalServerErrors() >= 1
+	}, 3*time.Second, 1*time.Millisecond,
+		"the rejected frame must be counted as a server error")
+	assert.Nil(t, qs.LastTerminalError())
+}
+
+// =============================================================================
+// Reconnect × error interaction
+// =============================================================================
+
+// TestErrorApiResilience_ReconnectThenHaltFsnCorrelation drives a
+// reconnect followed by a HALT, and asserts the SenderError's
+// FromFsn matches the engine-side publishedFsn at rejection time —
+// specifically, that fsnAtZero advanced correctly across the
+// reconnect boundary so wireSeq=0 on the new connection maps to
+// FSN >= 1 (the first frame ACK'd on connection 1).
+func TestErrorApiResilience_ReconnectThenHaltFsnCorrelation(t *testing.T) {
+	// Connection 1: ACKs the first frame, then closes after reading
+	// frame 1 (without ACKing it). One ACK seen so the loop's
+	// silent-drop guard does not fire, and we get a clean reconnect.
+	// Connection 2: rejects everything with PARSE_ERROR.
+	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{
+		closeAfterFrames: 2,
+		rejectStatus:     QwpStatusParseError,
+		rejectFromConn:   2,
+	})
+	defer srv.Close()
+
+	s, engine, loop, cleanup := newCursorSenderForTest(t, srv, 0)
+	defer cleanup()
+
+	// Frame 0: ACK'd by conn 1.
+	require.NoError(t, s.Table("t").Int64Column("v", 0).AtNow(context.Background()))
+	require.NoError(t, s.Flush(context.Background()))
+	require.Eventually(t, func() bool { return engine.engineAckedFsn() >= 0 },
+		2*time.Second, 1*time.Millisecond, "frame 0 should be ACK'd on conn 1")
+
+	// Frame 1: conn 1 reads it then closes (no ACK). The loop
+	// reconnects to conn 2, which rejects the replayed frame 1.
+	require.NoError(t, s.Table("t").Int64Column("v", 1).AtNow(context.Background()))
+	_ = s.Flush(context.Background())
+
+	require.Eventually(t, func() bool {
+		return loop.sendLoopCheckError() != nil
+	}, 5*time.Second, 1*time.Millisecond, "expected HALT after reconnect")
+
+	se := s.LastTerminalError()
+	require.NotNil(t, se)
+	assert.Equal(t, CategoryParseError, se.Category)
+	assert.Equal(t, PolicyHalt, se.AppliedPolicy)
+	// The rejected frame's FSN must be 1 — the second frame in the
+	// publish order. This is the entire point of FSN-correlation
+	// across reconnect: even though wireSeq on conn 2 starts at 0,
+	// fsnAtZero=1 maps it back to the right global FSN.
+	assert.Equal(t, int64(1), se.FromFsn,
+		"FromFsn must reflect post-reconnect fsnAtZero (=1), not raw wireSeq (=0)")
+	assert.Equal(t, se.FromFsn, se.ToFsn,
+		"single-frame rejection: FromFsn == ToFsn")
+
+	// Reconnect actually happened.
+	assert.GreaterOrEqual(t, loop.sendLoopTotalReconnects(), int64(1))
+}
+
+// TestErrorApiResilience_DropAcrossReconnect: drop frame 0 on conn 1,
+// reconnect, then drop frame 1 on conn 2. Assert ackedFsn advances
+// to 1 (both drops counted as "resolved by server") and no terminal
+// error is latched.
+func TestErrorApiResilience_DropAcrossReconnect(t *testing.T) {
+	// Connection 1: drop frame 0 (rejectFirstNFrames=1), then close
+	// after reading frame 1 (closeAfterFrames=2). One ACK delivered,
+	// so the silent-drop guard does not fire and reconnect kicks in.
+	// Connection 2: rejectFromConn=2 means reject all frames on conn ≥ 2.
+	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{
+		rejectStatus:       QwpStatusSchemaMismatch,
+		rejectFirstNFrames: 1,
+		closeAfterFrames:   2,
+		rejectFromConn:     2,
+	})
+	defer srv.Close()
+
+	s, engine, loop, cleanup := newCursorSenderForTest(t, srv, 0)
+	defer cleanup()
+
+	// Frame 0: dropped on conn 1 (Drop policy → ackedFsn advances to 0).
+	require.NoError(t, s.Table("t").Int64Column("v", 0).AtNow(context.Background()))
+	require.NoError(t, s.Flush(context.Background()))
+	require.Eventually(t, func() bool { return engine.engineAckedFsn() >= 0 },
+		2*time.Second, 1*time.Millisecond, "frame 0 must be drop-acked on conn 1")
+
+	// Frame 1: conn 1 reads it then closes (no ACK). The loop reconnects
+	// and replays frame 1 on conn 2, which drops it (ackedFsn → 1).
+	require.NoError(t, s.Table("t").Int64Column("v", 1).AtNow(context.Background()))
+	require.NoError(t, s.Flush(context.Background()))
+
+	// Arguments to a call are evaluated before the call runs, so passing
+	// engineAckedFsn() as a message arg to Eventually would report the
+	// value from before the wait. Read it only after the wait has failed.
+	if !assert.Eventually(t, func() bool {
+		return engine.engineAckedFsn() >= 1
+	}, 5*time.Second, 1*time.Millisecond) {
+		t.Fatalf("engineAckedFsn = %d, expected >= 1 (frame 0 + frame 1 both dropped)",
+			engine.engineAckedFsn())
+	}
+	assert.Nil(t, s.LastTerminalError(),
+		"Drop across reconnect should not latch terminal")
+	assert.GreaterOrEqual(t, loop.sendLoopTotalServerErrors(), int64(2),
+		"two drops should each bump the server-error counter")
+	assert.GreaterOrEqual(t, loop.sendLoopTotalReconnects(), int64(1),
+		"reconnect must have happened between the two drops")
+}
+
+// TestErrorApiResilience_ReconnectThenAuthFailure exercises the
+// auth-on-reconnect terminal: the live conn gets killed mid-stream,
+// the reconnect factory points at an auth-rejecting server, and the
+// loop must surface CategorySecurityError + PolicyHalt without
+// retrying past the auth wall.
+func TestErrorApiResilience_ReconnectThenAuthFailure(t *testing.T) {
+	authSrv := newQwpSfTestServer(t, qwpSfTestServerOpts{upgradeStatus: 401})
+	defer authSrv.Close()
+	dataSrv := newQwpSfTestServer(t, qwpSfTestServerOpts{})
+	defer dataSrv.Close()
+
+	engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second)
+	require.NoError(t, err)
+	defer func() { _ = engine.engineClose() }()
+
+	// The initial transport is dialed against dataSrv, while the loop's
+	// dial function passed below targets authSrv.
+	transport, err := qwpSfDialFor(dataSrv)(context.Background(), 0)
+	require.NoError(t, err)
+
+	loop := qwpSfNewSendLoop(engine, transport, qwpSfDialAt(authSrv.URL),
+		100*time.Microsecond, time.Second, 10*time.Millisecond, 100*time.Millisecond)
+	loop.sendLoopStart()
+	defer func() { _ = loop.sendLoopClose() }()
+
+	// Warm up: get an OK ACK on dataSrv.
+	_, err = engine.engineAppendBlocking(context.Background(), []byte("warmup"))
+	require.NoError(t, err)
+	require.Eventually(t, func() bool {
+		return loop.sendLoopTotalAcks() >= 1
+	}, time.Second, time.Millisecond, "dataSrv should have ACK'd the warm-up frame")
+
+	// Tear the live WS so the loop falls into reconnect against authSrv.
+	close(dataSrv.kill)
+
+	require.Eventually(t, func() bool {
+		return loop.sendLoopLastTerminalServerError() != nil
+	}, 2*time.Second, 1*time.Millisecond)
+
+	se := loop.sendLoopLastTerminalServerError()
+	require.NotNil(t, se)
+	assert.Equal(t, CategorySecurityError, se.Category)
+	assert.Equal(t, PolicyHalt, se.AppliedPolicy)
+	assert.Equal(t, NoStatusByte, se.ServerStatusByte,
+		"upgrade failures carry no QWP status byte")
+	assert.Equal(t, NoMessageSequence, se.MessageSequence)
+	assert.Contains(t, se.ServerMessage, "401")
+}
+
+// =============================================================================
+// SF disk-mode × error interaction
+// =============================================================================
+
+// TestErrorApiResilience_SfDiskHaltCloseReopenReplays exercises the
+// "close + rebuild" recovery path the spec mandates in lieu of
+// resumeAfterHalt. Sender 1 hits a HALT-inducing rejection, closes
+// (the unacked frame stays on disk under the slot), Sender 2 opens
+// the same slot and replays the same frame — server rejects again,
+// HALT latches again. This is the contract that makes "client
+// restart" deterministic for HALT scenarios.
+func TestErrorApiResilience_SfDiskHaltCloseReopenReplays(t *testing.T) {
+	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{rejectStatus: QwpStatusParseError})
+	defer srv.Close()
+
+	tmp := t.TempDir()
+	conf := strings.Join([]string{
+		"ws::addr=" + addrOf(srv),
+		"sf_dir=" + tmp,
+		"sender_id=halt-replay",
+		"sf_max_bytes=4096",
+		"close_flush_timeout_millis=100;", // short — the loop will halt, not drain
+	}, ";")
+
+	// === Sender 1: induce HALT, close.
=== + ls1, err := LineSenderFromConf(context.Background(), conf) + require.NoError(t, err) + qs1 := asQwp(t, ls1) + + require.NoError(t, ls1.Table("t").Int64Column("v", 1).AtNow(context.Background())) + _ = ls1.Flush(context.Background()) + require.Eventually(t, func() bool { + return qs1.LastTerminalError() != nil + }, 3*time.Second, 1*time.Millisecond, "sender 1 should HALT") + se1 := qs1.LastTerminalError() + require.NotNil(t, se1) + assert.Equal(t, CategoryParseError, se1.Category) + + // Close — drain will time out (HALT keeps ackedFsn behind + // publishedFsn), so Close returns the timeout error. We don't + // care about that, only that it returns. + _ = ls1.Close(context.Background()) + + // === Sender 2: open same slot, expect the unacked frame to + // replay and trigger a fresh HALT. === + ls2, err := LineSenderFromConf(context.Background(), conf) + require.NoError(t, err) + qs2 := asQwp(t, ls2) + defer func() { _ = ls2.Close(context.Background()) }() + + require.Eventually(t, func() bool { + return qs2.LastTerminalError() != nil + }, 5*time.Second, 1*time.Millisecond, + "sender 2 should replay the on-disk frame and re-HALT against the same server") + se2 := qs2.LastTerminalError() + require.NotNil(t, se2) + assert.Equal(t, CategoryParseError, se2.Category, + "replayed rejection should classify the same way") + assert.Equal(t, PolicyHalt, se2.AppliedPolicy) +} + +// TestErrorApiResilience_SfDiskDropPersistsAckedAcrossRestart drives +// a Drop-policy rejection through SF disk mode, closes cleanly, then +// reopens the slot and asserts a NEW frame goes through normally — +// the dropped frame must NOT replay (it was acked-via-drop, so the +// segment file should be unlinked). This is the SF flip side of the +// HALT replay test: drops are durable, halts are durable, but the +// persistence semantics differ. 
+func TestErrorApiResilience_SfDiskDropPersistsAckedAcrossRestart(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{ + rejectStatus: QwpStatusSchemaMismatch, // default Drop + rejectFirstNFrames: 1, + }) + defer srv.Close() + + tmp := t.TempDir() + conf := strings.Join([]string{ + "ws::addr=" + addrOf(srv), + "sf_dir=" + tmp, + "sender_id=drop-restart", + "sf_max_bytes=4096", + "close_flush_timeout_millis=2000;", + }, ";") + + // === Sender 1: send frame 0 (rejected → dropped), close cleanly. === + ls1, err := LineSenderFromConf(context.Background(), conf) + require.NoError(t, err) + qs1 := asQwp(t, ls1) + + require.NoError(t, ls1.Table("t").Int64Column("v", 0).AtNow(context.Background())) + require.NoError(t, ls1.Flush(context.Background())) + + // Wait for the drop to propagate so ackedFsn catches up to + // publishedFsn — only then does Close drain successfully. + require.Eventually(t, func() bool { + return qs1.AckedFsn() >= 0 + }, 2*time.Second, 1*time.Millisecond, + "frame 0 should be acked-via-drop on sender 1") + assert.Nil(t, qs1.LastTerminalError(), "Drop should not latch terminal") + // Clean close — drain should complete because everything's + // acked-via-drop. + require.NoError(t, ls1.Close(context.Background())) + + // Server frame counter saw the rejected frame. + frames1 := srv.totalFramesReceived.Load() + require.GreaterOrEqual(t, frames1, int64(1)) + + // === Sender 2: same slot, send a fresh frame. The dropped frame + // must NOT replay (would surface as a duplicate frame on the + // server side). 
=== + ls2, err := LineSenderFromConf(context.Background(), conf) + require.NoError(t, err) + qs2 := asQwp(t, ls2) + defer func() { _ = ls2.Close(context.Background()) }() + + require.NoError(t, ls2.Table("t").Int64Column("v", 1).AtNow(context.Background())) + require.NoError(t, ls2.Flush(context.Background())) + require.Eventually(t, func() bool { + return qs2.AckedFsn() >= 0 + }, 2*time.Second, 1*time.Millisecond) + + // Server should have seen exactly one additional frame on + // sender 2 — the new one — not a replay of the dropped frame. + frames2 := srv.totalFramesReceived.Load() + assert.Equal(t, frames1+1, frames2, + "sender 2 should send only the new frame; dropped frame should NOT replay") +} + +// ============================================================================= +// Strict per-category payload assertions +// ============================================================================= + +// TestErrorApiPerCategoryStrict extends TestErrorApiPerCategory with +// strict assertions on every field of *SenderError. Catches bugs +// like "ServerStatusByte set to the wrong byte" or "DetectedAt left +// at zero" that the loose Category+Policy check would miss. 
+func TestErrorApiPerCategoryStrict(t *testing.T) { + cases := []struct { + name string + status QwpStatusCode + wantCat Category + wantPolicy Policy + dropPath bool + }{ + {"SchemaMismatch", QwpStatusSchemaMismatch, CategorySchemaMismatch, PolicyDropAndContinue, true}, + {"ParseError", QwpStatusParseError, CategoryParseError, PolicyHalt, false}, + {"InternalError", QwpStatusInternalError, CategoryInternalError, PolicyHalt, false}, + {"SecurityError", QwpStatusSecurityError, CategorySecurityError, PolicyHalt, false}, + {"WriteError", QwpStatusWriteError, CategoryWriteError, PolicyDropAndContinue, true}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + opts := qwpSfTestServerOpts{rejectStatus: tc.status} + if tc.dropPath { + opts.rejectFirstNFrames = 1 + } + srv := newQwpSfTestServer(t, opts) + defer srv.Close() + + s, _, loop, cleanup := newCursorSenderForTest(t, srv, 0) + defer cleanup() + + gotCh := make(chan *SenderError, 4) + loop.sendLoopSetErrorHandler(func(e *SenderError) { + select { + case gotCh <- e: + default: + } + }, qwpSfMinErrorInboxCapacity) + + before := time.Now() + require.NoError(t, s.Table("t").Int64Column("v", 1).AtNow(context.Background())) + _ = s.Flush(context.Background()) + + var got *SenderError + select { + case got = <-gotCh: + case <-time.After(3 * time.Second): + t.Fatal("handler not invoked within deadline") + } + after := time.Now() + + assert.Equal(t, tc.wantCat, got.Category, "Category") + assert.Equal(t, tc.wantPolicy, got.AppliedPolicy, "AppliedPolicy") + assert.Equal(t, int(tc.status), got.ServerStatusByte, "ServerStatusByte") + assert.Contains(t, got.ServerMessage, "rejected", "ServerMessage carries server text") + assert.Equal(t, int64(0), got.MessageSequence, + "single-frame batch starts at MessageSequence 0") + assert.Equal(t, int64(0), got.FromFsn, "single-frame batch FromFsn=0") + assert.Equal(t, got.FromFsn, got.ToFsn, "single-frame span") + assert.Equal(t, "", got.TableName, + "server doesn't 
attribute single-table batches yet (forward-compat)") + assert.False(t, got.DetectedAt.IsZero(), "DetectedAt populated") + assert.True(t, !got.DetectedAt.Before(before) && !got.DetectedAt.After(after), + "DetectedAt within [before, after] window: detected=%v before=%v after=%v", + got.DetectedAt, before, after) + + // Assert the Error() string contains the expected + // human-readable bits — the producer side relies on this + // when logging. + s2 := got.Error() + assert.Contains(t, s2, tc.wantCat.String()) + assert.Contains(t, s2, tc.wantPolicy.String()) + assert.Contains(t, s2, fmt.Sprintf("0x%02X", byte(tc.status))) + assert.Contains(t, s2, "rejected") + }) + } +} + +// TestErrorApiResilience_LastTerminalErrorSurvivesClose latches a HALT, +// closes the sender, and asserts LastTerminalError still returns the +// snapshot afterward. Useful for diagnostics that want to inspect +// the error after Close() has returned. +func TestErrorApiResilience_LastTerminalErrorSurvivesClose(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{rejectStatus: QwpStatusInternalError}) + defer srv.Close() + + s, _, loop, _ := newCursorSenderForTest(t, srv, 0) + + require.NoError(t, s.Table("t").Int64Column("v", 1).AtNow(context.Background())) + _ = s.Flush(context.Background()) + + require.Eventually(t, func() bool { + return loop.sendLoopCheckError() != nil + }, 2*time.Second, 1*time.Millisecond) + beforeClose := s.LastTerminalError() + require.NotNil(t, beforeClose) + + _ = s.Close(context.Background()) + + afterClose := s.LastTerminalError() + require.NotNil(t, afterClose, "LastTerminalError should still return the snapshot after Close") + assert.Equal(t, beforeClose, afterClose, + "LastTerminalError snapshot must not change across Close") +} + +// TestErrorApiResilience_TotalServerErrorsCounterStrict drives 3 +// drop-policy rejections back-to-back and asserts the counter is +// exactly 3 (not >=3, exactly). 
Catches off-by-one and +// double-counting bugs that the looser >= assertions in the existing +// suite would miss. +func TestErrorApiResilience_TotalServerErrorsCounterStrict(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{ + rejectStatus: QwpStatusSchemaMismatch, + rejectFirstNFrames: 3, + }) + defer srv.Close() + + s, _, _, cleanup := newCursorSenderForTest(t, srv, 0) + defer cleanup() + + for i := 0; i < 3; i++ { + require.NoError(t, s.Table("t").Int64Column("v", int64(i)).AtNow(context.Background())) + require.NoError(t, s.Flush(context.Background())) + } + // Send a 4th frame that should NOT be rejected — bookmarks the + // fact that the 3 prior rejections settled. + require.NoError(t, s.Table("t").Int64Column("v", 99).AtNow(context.Background())) + require.NoError(t, s.Flush(context.Background())) + + require.Eventually(t, func() bool { + return s.AckedFsn() >= 3 + }, 5*time.Second, 1*time.Millisecond, "all four frames should be acked") + + assert.Equal(t, int64(3), s.TotalServerErrors(), + "exactly three drops should have happened, not more, not fewer") + assert.Nil(t, s.LastTerminalError(), "Drops should not latch terminal") +} + +// ============================================================================= +// Concurrent halt-vs-flush stress +// ============================================================================= + +// TestErrorApiResilience_HaltVsConcurrentFlushStress pins the +// HALT-is-terminal contract under load: many iterations and a strict +// "every hammering goroutine must observe *SenderError" assertion. A +// weaker any-of-N assertion can hide a race where only one goroutine +// observes the latched state. Hammering happens AFTER the latch is +// confirmed, so the sender is quiescent (no concurrent producer) — +// matches the LineSender contract that production code must +// serialize calls. 
+func TestErrorApiResilience_HaltVsConcurrentFlushStress(t *testing.T) { + if testing.Short() { + t.Skip("stress test skipped in short mode") + } + const iters = 500 + const goroutines = 8 + for i := 0; i < iters; i++ { + runHaltStressOnce(t, i, goroutines) + } +} + +func runHaltStressOnce(t *testing.T, iter, goroutines int) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{rejectStatus: QwpStatusParseError}) + defer srv.Close() + + s, _, loop, cleanup := newCursorSenderForTest(t, srv, 0) + defer cleanup() + + // Single producer Flush triggers the rejection. The server + // rejects every frame with PARSE_ERROR, so one Flush is enough + // to latch HALT. + require.NoError(t, s.Table("t").Int64Column("v", int64(iter)).AtNow(context.Background())) + _ = s.Flush(context.Background()) + + // Wait for the latch to be observable. After this, the sender + // is quiescent (no concurrent producer) and Flush from many + // goroutines is safe — each just samples the latched error. + require.Eventually(t, func() bool { + return loop.sendLoopCheckError() != nil + }, 2*time.Second, time.Microsecond, "iter %d: loop must latch", iter) + + // Hammer Flush from N goroutines. Every Flush MUST surface the + // typed *SenderError. 
+ var hammerWg sync.WaitGroup + var observed atomic.Int32 + for j := 0; j < goroutines; j++ { + hammerWg.Add(1) + go func() { + defer hammerWg.Done() + err := s.Flush(context.Background()) + if err == nil { + return + } + var se *SenderError + if errors.As(err, &se) && se.Category == CategoryParseError { + observed.Add(1) + } + }() + } + hammerWg.Wait() + + assert.Equal(t, int32(goroutines), observed.Load(), + "iter %d: every hammering goroutine must observe *SenderError, got %d/%d", + iter, observed.Load(), goroutines) +} + +// ============================================================================= +// Dispatcher mid-flight swap +// ============================================================================= + +// TestErrorApiResilience_DispatcherSwapMidFlight: enqueue errors +// against a counting handler, then swap the handler via +// sendLoopSetErrorHandler. The atomic.Pointer machinery should make +// this race-free: the swap is observed by the next offer; the old +// dispatcher's drain delivers any remaining queued items (subject to +// its drain timeout) before exiting. Asserts that +// - the new handler receives notifications offered after the swap; +// - the counters (TotalErrorNotificationsDelivered and +// DroppedErrorNotifications) sum consistently with TotalServerErrors. +func TestErrorApiResilience_DispatcherSwapMidFlight(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{ + rejectStatus: QwpStatusSchemaMismatch, + rejectFirstNFrames: 50, // 50 drops on conn 1 + }) + defer srv.Close() + + s, _, loop, cleanup := newCursorSenderForTest(t, srv, 0) + defer cleanup() + + // First handler: counts deliveries. + var oldDelivered atomic.Int64 + loop.sendLoopSetErrorHandler(func(e *SenderError) { + oldDelivered.Add(1) + }, qwpSfMinErrorInboxCapacity) + + // Drive 25 rejections, then swap the handler.
+ for i := 0; i < 25; i++ { + require.NoError(t, s.Table("t").Int64Column("v", int64(i)).AtNow(context.Background())) + require.NoError(t, s.Flush(context.Background())) + } + require.Eventually(t, func() bool { + return s.TotalServerErrors() >= 25 + }, 5*time.Second, 1*time.Millisecond) + + // Swap to a new handler. + var newDelivered atomic.Int64 + loop.sendLoopSetErrorHandler(func(e *SenderError) { + newDelivered.Add(1) + }, qwpSfMinErrorInboxCapacity) + + // Drive 25 more rejections — these must reach the new handler. + for i := 25; i < 50; i++ { + require.NoError(t, s.Table("t").Int64Column("v", int64(i)).AtNow(context.Background())) + require.NoError(t, s.Flush(context.Background())) + } + require.Eventually(t, func() bool { + return s.TotalServerErrors() >= 50 + }, 5*time.Second, 1*time.Millisecond) + + // Wait briefly for the new dispatcher to drain. + require.Eventually(t, func() bool { + return newDelivered.Load() > 0 + }, 2*time.Second, 1*time.Millisecond, + "new handler must receive at least some notifications after swap") + + // Old + new together should account for at most TotalServerErrors. + // The strict bound is harder because (a) the old dispatcher's + // drain may discard items still in its inbox at swap time, and + // (b) some notifications may end up in DroppedErrorNotifications + // if the inboxes filled up. Sanity bound: deliveries <= server + // errors observed. + totalDelivered := oldDelivered.Load() + newDelivered.Load() + totalErrors := s.TotalServerErrors() + dropped := s.DroppedErrorNotifications() + assert.LessOrEqual(t, totalDelivered, totalErrors, + "deliveries (%d) must not exceed total server errors (%d)", + totalDelivered, totalErrors) + assert.Equal(t, totalErrors, totalDelivered+dropped+0 /* lost-to-old-drain unaccounted */, + "every server error should be either delivered or dropped (or lost to old-dispatcher drain)") + + // The new handler should have received SOMETHING (otherwise the + // swap didn't take effect). 
+ assert.Greater(t, newDelivered.Load(), int64(0), + "new handler received zero deliveries — swap did not take effect") +} + +// ============================================================================= +// Server restart simulation +// ============================================================================= + +// TestErrorApiResilience_ServerRestartReplaysCorrectly models a full +// server restart: the first transport dial lands on srv1; srv1 ACKs +// frame 0 then closes after reading frame 1; the next dial (i.e. the +// reconnect) lands on srv2 — a fresh server with zero state about +// the client's prior frames. Replay must succeed and frames 1, 2 +// must arrive at srv2. This is the canonical "server restart" +// scenario the SF design targets. +func TestErrorApiResilience_ServerRestartReplaysCorrectly(t *testing.T) { + // srv1 ACKs frame 0 (closeAfterFrames=2: ACK seq 0, then on the + // 2nd frame returns without ACK). + srv1 := newQwpSfTestServer(t, qwpSfTestServerOpts{closeAfterFrames: 2}) + defer srv1.Close() + srv2 := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer srv2.Close() + + // Factory returns srv1 on the first call, srv2 thereafter. + // Models "the old server died; a new one is now responsible for + // the address" — fresh state on the server side, but the client + // re-replays its on-disk tail. 
+ var attempt atomic.Int32 + factory := func(ctx context.Context, _ int) (*qwpTransport, error) { + var t qwpTransport + var url string + if attempt.Add(1) == 1 { + url = srv1.URL + } else { + url = srv2.URL + } + wsURL := "ws" + strings.TrimPrefix(url, "http") + if err := t.connect(ctx, wsURL, qwpTransportOpts{endpointPath: qwpWritePath}); err != nil { + return nil, err + } + return &t, nil + } + + engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + defer func() { _ = engine.engineClose() }() + + transport, err := factory(context.Background(), 0) + require.NoError(t, err) + + loop := qwpSfNewSendLoop(engine, transport, factory, + 100*time.Microsecond, 5*time.Second, 10*time.Millisecond, 100*time.Millisecond) + loop.sendLoopStart() + defer func() { _ = loop.sendLoopClose() }() + + // Push 3 frames. srv1 ACKs frame 0; srv1 closes on reading frame + // 1. Loop reconnects, factory returns srv2 transport. srv2 sees + // frames 1 and 2 (replays of unacked tail). + for i := 0; i < 3; i++ { + _, err := engine.engineAppendBlocking(context.Background(), + []byte(fmt.Sprintf("f%d", i))) + require.NoError(t, err) + } + + require.Eventually(t, func() bool { + return engine.engineAckedFsn() >= 2 + }, 10*time.Second, 1*time.Millisecond, + "after server restart, all frames should be ACK'd (acked=%d)", + engine.engineAckedFsn()) + + // srv1 only saw frames 0 and 1 (ACK'd 0, dropped before ACKing 1). + // srv2 must have seen frames 1 and 2 — the unacked tail replayed. 
+ assert.GreaterOrEqual(t, srv2.totalFramesReceived.Load(), int64(2), + "server 2 should have received the replayed unacked tail (got %d)", + srv2.totalFramesReceived.Load()) + assert.GreaterOrEqual(t, loop.sendLoopTotalReconnects(), int64(1), + "reconnect must have happened across the server restart") + assert.Nil(t, loop.sendLoopLastTerminalServerError(), + "server restart with healthy new server should not produce a terminal error") +} + +// ============================================================================= +// Drain timeout boundary +// ============================================================================= + +// TestErrorApiResilience_DispatcherDrainTimeoutCap verifies that +// closing a sender with many queued errors + slow handler completes +// within a bounded time (the dispatcher's drain timeout caps the +// wait). Without this cap, a malicious or buggy handler could stall +// shutdown indefinitely. The cap is currently 100 ms; the test +// asserts < 2 s for headroom. +func TestErrorApiResilience_DispatcherDrainTimeoutCap(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{ + rejectStatus: QwpStatusSchemaMismatch, + rejectFirstNFrames: 100, + }) + defer srv.Close() + + s, _, loop, _ := newCursorSenderForTest(t, srv, 0) + + // Slow handler: each call takes 50 ms. With 100 queued items, + // processing them all would take 5 s; the drain timeout (100 ms) + // must cap that. + loop.sendLoopSetErrorHandler(func(e *SenderError) { + time.Sleep(50 * time.Millisecond) + }, 256) // generous capacity so most drops queue rather than getting dropped + + // Drive 100 drops as fast as possible.
+ for i := 0; i < 100; i++ { + require.NoError(t, s.Table("t").Int64Column("v", int64(i)).AtNow(context.Background())) + require.NoError(t, s.Flush(context.Background())) + } + require.Eventually(t, func() bool { + return s.TotalServerErrors() >= 100 + }, 5*time.Second, 1*time.Millisecond) + + // Now close — the drain timeout must fire before the slow + // handler chews through all 100 queued items. + start := time.Now() + _ = s.Close(context.Background()) + elapsed := time.Since(start) + + // Allow generous headroom but assert we're not blocked for the + // full 5 s the slow handler would otherwise need. + assert.Less(t, elapsed, 2*time.Second, + "close should not wait for a slow handler past the drain timeout") +} + +// ============================================================================= +// HALT after partial Drop streak +// ============================================================================= + +// TestErrorApiResilience_DropStreakThenHalt models a realistic +// scenario: many rows fail with WriteError (Drop policy), the loop +// keeps draining, then a row hits ParseError (Halt policy) and the +// loop latches. The Drop counter and Halt latch should be +// independent; the FSN on the Halt should be > the FSNs of the +// Drops. +func TestErrorApiResilience_DropStreakThenHalt(t *testing.T) { + // Custom server: WriteError for first 3 frames, ParseError on + // frame 4. Switch by adding a custom handler — the existing + // fixture only supports one rejectStatus per server. 
+ var nFrames atomic.Int32 + srv := &qwpSfTestServer{kill: make(chan struct{})} + srv.Server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set(qwpHeaderVersion, "1") + conn, err := websocket.Accept(w, r, nil) + if err != nil { + return + } + defer conn.CloseNow() + var localSeq int64 + for { + _, _, err := conn.Read(context.Background()) + if err != nil { + return + } + n := nFrames.Add(1) + srv.totalFramesReceived.Add(1) + var status QwpStatusCode + if n <= 3 { + status = QwpStatusWriteError // Drop + } else { + status = QwpStatusParseError // Halt + } + _ = conn.Write(context.Background(), websocket.MessageBinary, + buildAckError(status, localSeq, "rejected")) + localSeq++ + } + })) + defer srv.Close() + + s, _, loop, cleanup := newCursorSenderForTest(t, srv, 0) + defer cleanup() + + for i := 0; i < 4; i++ { + require.NoError(t, s.Table("t").Int64Column("v", int64(i)).AtNow(context.Background())) + _ = s.Flush(context.Background()) + } + require.Eventually(t, func() bool { + return loop.sendLoopCheckError() != nil + }, 5*time.Second, 1*time.Millisecond) + + se := s.LastTerminalError() + require.NotNil(t, se) + assert.Equal(t, CategoryParseError, se.Category, "last terminal should be the Halt, not a Drop") + assert.Equal(t, PolicyHalt, se.AppliedPolicy) + assert.Equal(t, int64(3), se.FromFsn, "the Halted frame is FSN 3 (after 3 Drops at 0..2)") + + // 4 server errors total: 3 drops + 1 halt. + assert.Equal(t, int64(4), s.TotalServerErrors()) +} diff --git a/qwp_errors.go b/qwp_errors.go index 47ba4d64..cb173d89 100644 --- a/qwp_errors.go +++ b/qwp_errors.go @@ -24,67 +24,143 @@ package questdb -import "fmt" +import ( + "fmt" + "strings" + "time" +) + +// QwpUpgradeRejectError is returned by qwpTransport.connect when the +// server completes the HTTP exchange with a non-101 status. 
Construction +// captures the response status and the failover-relevant headers so the +// reconnect loop can classify the host without re-parsing strings: +// +// - StatusCode is the HTTP response status (e.g. 421 for a misdirected +// request). +// - Role is the trimmed X-QuestDB-Role header value (empty if absent). +// The spec admits STANDALONE / PRIMARY / REPLICA / PRIMARY_CATCHUP; +// unrecognised tokens are surfaced verbatim and classified by the +// reconnect loop. +// - Zone is the trimmed X-QuestDB-Zone header value (empty if absent). +// Used to record host zone tier ahead of any successful upgrade. +// - RetryAfter is the Retry-After header (given in seconds) parsed +// into a time.Duration; 0 if absent or unparseable. Hint only — the +// failover loop's outage budget still bounds the wait. +// - Body is up to qwpUpgradeBodySnippetCap bytes of the response body, +// captured for error formatting. Truncation is signalled by a +// trailing "…" in the Error() output. +type QwpUpgradeRejectError struct { + StatusCode int + Role string + Zone string + RetryAfter time.Duration + Body string + // cause is the underlying websocket.Dial error. connect builds this + // type only on a dial failure, so it is non-nil in practice. It is + // the real reason the upgrade failed when StatusCode is 101: the + // HTTP exchange reached the handshake-complete status but the + // WebSocket upgrade itself was rejected (e.g. a bad + // Sec-WebSocket-Accept). Exposed via Unwrap. + cause error +} + +// qwpUpgradeBodySnippetCap bounds how many response-body bytes the +// transport captures into QwpUpgradeRejectError.Body. Keeps error +// messages bounded when a misconfigured server returns a large HTML +// payload on a 4xx/5xx upgrade rejection. +const qwpUpgradeBodySnippetCap = 512 + +// Error implements the error interface.
The format leads with the +// HTTP status and tag (Role / Zone / Retry-After) so the failover +// loop can include the message verbatim in its budget-exhaustion +// report without losing the structured fields. +func (e *QwpUpgradeRejectError) Error() string { + var b strings.Builder + fmt.Fprintf(&b, "qwp: upgrade rejected with HTTP %d", e.StatusCode) + if e.Role != "" { + fmt.Fprintf(&b, " (role=%s)", e.Role) + } + if e.Zone != "" { + fmt.Fprintf(&b, " (zone=%s)", e.Zone) + } + if e.RetryAfter > 0 { + fmt.Fprintf(&b, " (retry-after=%s)", e.RetryAfter) + } + if e.Body != "" { + fmt.Fprintf(&b, ": %s", e.Body) + } + // A 101 status means the HTTP handshake completed but the WebSocket + // upgrade was still rejected, so "rejected with HTTP 101" is + // misleading on its own — surface the underlying dial error that + // actually explains the failure. + if e.StatusCode == 101 && e.cause != nil { + fmt.Fprintf(&b, ": %v", e.cause) + } + return b.String() +} + +// Unwrap returns the underlying websocket.Dial error so errors.Is / +// errors.As can reach the transport-level cause. Classification keys +// off StatusCode via a top-level type assertion, so unwrapping does +// not affect host-role classification. +func (e *QwpUpgradeRejectError) Unwrap() error { + return e.cause +} + +// IsRoleReject reports whether the upgrade was rejected with the +// failover-spec "topology hint" combination: HTTP 421 plus a non-empty +// X-QuestDB-Role header. The reconnect loop classifies the host as +// TransientReject (Role == PRIMARY_CATCHUP, case-insensitive) or +// TopologyReject (any other non-empty role). +func (e *QwpUpgradeRejectError) IsRoleReject() bool { + return e.StatusCode == 421 && e.Role != "" +} + +// IsCatchupRole reports whether the role tag is PRIMARY_CATCHUP +// (case-insensitive). Only meaningful when IsRoleReject() is true. 
+func (e *QwpUpgradeRejectError) IsCatchupRole() bool { + return strings.EqualFold(e.Role, "PRIMARY_CATCHUP") +} // qwpStatusName returns a human-readable name for a QWP status code. -func qwpStatusName(status qwpStatusCode) string { +// Used by (*SenderError).Error() to format the wire-byte component of +// rejection messages. +func qwpStatusName(status QwpStatusCode) string { switch status { - case qwpStatusOK: + case QwpStatusOK: return "OK" - case qwpStatusSchemaMismatch: + case QwpStatusDurableAck: + return "DURABLE_ACK" + case QwpStatusSchemaMismatch: return "SCHEMA_MISMATCH" - case qwpStatusParseError: + case QwpStatusParseError: return "PARSE_ERROR" - case qwpStatusInternalError: + case QwpStatusInternalError: return "INTERNAL_ERROR" - case qwpStatusSecurityError: + case QwpStatusSecurityError: return "SECURITY_ERROR" - case qwpStatusWriteError: + case QwpStatusWriteError: return "WRITE_ERROR" + case qwpStatusCancelled: + return "CANCELLED" + case qwpStatusLimitExceeded: + return "LIMIT_EXCEEDED" default: return fmt.Sprintf("UNKNOWN(%d)", status) } } -// QwpError represents an error returned by the QuestDB server in -// a QWP ACK response. It contains the status code, the -// sequence number from the response, and an optional error message. -type QwpError struct { - // Status is the status code from the ACK response. - Status qwpStatusCode - - // Sequence is the cumulative sequence number from the ACK, used - // to correlate responses with requests in async mode. - Sequence int64 - - // Message is the server's error description, or empty if - // no error message was included in the response. - Message string -} - -// Error implements the error interface. 
-func (e *QwpError) Error() string { - name := qwpStatusName(e.Status) - if e.Message != "" { - return fmt.Sprintf("qwp: server error %s (0x%02X): %s", name, byte(e.Status), e.Message) - } - return fmt.Sprintf("qwp: server error %s (0x%02X)", name, byte(e.Status)) -} - -// newQwpErrorFromAck creates a QwpError from a raw ACK payload. -// Returns nil if the status is OK. +// parseAckErrorPayload extracts the status code, cumulative sequence +// number, and server error message from a non-OK ACK frame. Used by +// the SF send loop's receiver to assemble a *SenderError with the +// surrounding FSN-span context. // // Precondition: data has already been validated by readAck, which -// guarantees qwpAckOKSize bytes for OK status and at least -// qwpAckErrorHeaderSize + msg_len bytes for non-OK statuses. -func newQwpErrorFromAck(data []byte) *QwpError { - status := qwpStatusCode(data[0]) - if status == qwpStatusOK { - return nil - } - return &QwpError{ - Status: status, - Sequence: parseAckSequence(data), - Message: parseAckError(data), +// guarantees the layout invariants documented on readAck. 
+func parseAckErrorPayload(data []byte) (status QwpStatusCode, seq int64, msg string) { + status = QwpStatusCode(data[0]) + if status == QwpStatusOK || status == QwpStatusDurableAck { + return status, 0, "" } + return status, parseAckSequence(data), parseAckError(data) } diff --git a/qwp_errors_test.go b/qwp_errors_test.go deleted file mode 100644 index f62546c5..00000000 --- a/qwp_errors_test.go +++ /dev/null @@ -1,141 +0,0 @@ -/*+***************************************************************************** - * ___ _ ____ ____ - * / _ \ _ _ ___ ___| |_| _ \| __ ) - * | | | | | | |/ _ \/ __| __| | | | _ \ - * | |_| | |_| | __/\__ \ |_| |_| | |_) | - * \__\_\\__,_|\___||___/\__|____/|____/ - * - * Copyright (c) 2014-2019 Appsicle - * Copyright (c) 2019-2026 QuestDB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - ******************************************************************************/ - -package questdb - -import ( - "encoding/binary" - "strings" - "testing" -) - -func TestQwpErrorInterface(t *testing.T) { - e := &QwpError{ - Status: qwpStatusParseError, - Sequence: 42, - Message: "bad column type", - } - - // Verify it implements error interface. 
- var err error = e - s := err.Error() - if !strings.Contains(s, "PARSE_ERROR") { - t.Fatalf("error string should contain PARSE_ERROR, got: %s", s) - } - if !strings.Contains(s, "bad column type") { - t.Fatalf("error string should contain message, got: %s", s) - } - if !strings.Contains(s, "0x05") { - t.Fatalf("error string should contain hex status, got: %s", s) - } -} - -func TestQwpErrorNoMessage(t *testing.T) { - e := &QwpError{ - Status: qwpStatusWriteError, - Sequence: 1, - } - s := e.Error() - if !strings.Contains(s, "WRITE_ERROR") { - t.Fatalf("error string should contain WRITE_ERROR, got: %s", s) - } -} - -func TestQwpStatusName(t *testing.T) { - tests := []struct { - status qwpStatusCode - want string - }{ - {qwpStatusOK, "OK"}, - {qwpStatusSchemaMismatch, "SCHEMA_MISMATCH"}, - {qwpStatusParseError, "PARSE_ERROR"}, - {qwpStatusInternalError, "INTERNAL_ERROR"}, - {qwpStatusSecurityError, "SECURITY_ERROR"}, - {qwpStatusWriteError, "WRITE_ERROR"}, - {qwpStatusCode(42), "UNKNOWN(42)"}, - } - for _, tc := range tests { - got := qwpStatusName(tc.status) - if got != tc.want { - t.Fatalf("qwpStatusName(0x%02X) = %q, want %q", - byte(tc.status), got, tc.want) - } - } -} - -func TestNewQwpErrorFromAck(t *testing.T) { - t.Run("OK", func(t *testing.T) { - data := make([]byte, 9) - data[0] = byte(qwpStatusOK) - err := newQwpErrorFromAck(data) - if err != nil { - t.Fatalf("expected nil for OK status, got: %v", err) - } - }) - - t.Run("ParseError", func(t *testing.T) { - errMsg := "invalid column" - data := make([]byte, 11+len(errMsg)) - data[0] = byte(qwpStatusParseError) - binary.LittleEndian.PutUint64(data[1:9], 7) - binary.LittleEndian.PutUint16(data[9:11], uint16(len(errMsg))) - copy(data[11:], errMsg) - - e := newQwpErrorFromAck(data) - if e == nil { - t.Fatal("expected error, got nil") - } - if e.Status != qwpStatusParseError { - t.Fatalf("status = %d, want %d", e.Status, qwpStatusParseError) - } - if e.Sequence != 7 { - t.Fatalf("sequence = %d, want 7", 
e.Sequence) - } - if e.Message != errMsg { - t.Fatalf("message = %q, want %q", e.Message, errMsg) - } - }) - - t.Run("WriteErrorNoMessage", func(t *testing.T) { - // 11 bytes: status + sequence + msg_len(0), no trailing message. - data := make([]byte, 11) - data[0] = byte(qwpStatusWriteError) - binary.LittleEndian.PutUint64(data[1:9], 99) - - e := newQwpErrorFromAck(data) - if e == nil { - t.Fatal("expected error, got nil") - } - if e.Status != qwpStatusWriteError { - t.Fatalf("status = %d, want %d", e.Status, qwpStatusWriteError) - } - if e.Sequence != 99 { - t.Fatalf("sequence = %d, want 99", e.Sequence) - } - if e.Message != "" { - t.Fatalf("message = %q, want empty", e.Message) - } - }) - -} diff --git a/qwp_failover_test.go b/qwp_failover_test.go new file mode 100644 index 00000000..7bcfa933 --- /dev/null +++ b/qwp_failover_test.go @@ -0,0 +1,1579 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ******************************************************************************/ + +package questdb + +import ( + "context" + "encoding/binary" + "errors" + "fmt" + "net/http" + "net/http/httptest" + "strings" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/coder/websocket" +) + +// mockClusterNode is one entry in a multi-server failover test +// fixture. Each node has its own httptest.Server and tags itself with +// a role / nodeId / clusterId that flow into the SERVER_INFO frame +// it emits to incoming clients. +type mockClusterNode struct { + t *testing.T + // srv is the underlying httptest.Server. + srv *httptest.Server + // role is the SERVER_INFO.role byte. PRIMARY / REPLICA / etc. + role byte + // nodeId / clusterId are echoed in SERVER_INFO. nodeId is unique + // per node so observeConnectedIdx can match the binding back to + // the node. + nodeId string + clusterId string + + // alive gates whether the server accepts new connections. + alive atomic.Bool + // onConnectCount counts successful upgrades for diagnostics. + onConnectCount atomic.Int64 + // suppressServerInfo, when true, completes the WebSocket upgrade + // but never writes the SERVER_INFO frame, so the client's + // SERVER_INFO read times out at serverInfoTimeout. Used by tests + // that need a slow but reachable endpoint. + suppressServerInfo atomic.Bool +} + +// addr returns the host:port for connection-string assembly. +func (n *mockClusterNode) addr() string { + return strings.TrimPrefix(n.srv.URL, "http://") +} + +// mockCluster aggregates N httptest.Server fakes — one per simulated +// QuestDB node. Use newMockCluster to build the cluster, then access +// nodes[i] to drive selective failure / role assertions. +type mockCluster struct { + t *testing.T + nodes []*mockClusterNode +} + +// addrList joins the node host:port pairs for use in the addr= conf +// string or WithQwpQueryEndpoints option. 
Honours the order passed to +// newMockCluster so target-filter tests can assert which node bound. +func (c *mockCluster) addrList() string { + parts := make([]string, 0, len(c.nodes)) + for _, n := range c.nodes { + parts = append(parts, n.addr()) + } + return strings.Join(parts, ",") +} + +// newMockCluster spins up n in-process WebSocket servers, each tagged +// with a role / nodeId / clusterId provided by tag(). The returned +// cluster is automatically torn down via t.Cleanup; tests can also +// kill individual nodes mid-test via node.kill(). +// +// Each node's handler is responsible for the post-SERVER_INFO +// choreography. Nil handler defaults to "send a QUERY_ERROR(internal) +// to every QUERY_REQUEST" — useful for transport-failure simulations +// that don't otherwise produce events. +func newMockCluster(t *testing.T, n int, tag func(idx int) (role byte, nodeId, clusterId string), handler func(idx int, m *qwpMockEgressConn)) *mockCluster { + t.Helper() + cluster := &mockCluster{t: t, nodes: make([]*mockClusterNode, 0, n)} + for i := 0; i < n; i++ { + role, nodeId, clusterId := tag(i) + mn := &mockClusterNode{ + t: t, + role: role, + nodeId: nodeId, + clusterId: clusterId, + } + mn.alive.Store(true) + idx := i + mn.srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if !mn.alive.Load() { + w.WriteHeader(http.StatusServiceUnavailable) + return + } + w.Header().Set(qwpHeaderVersion, fmt.Sprintf("%d", qwpVersion)) + conn, err := websocket.Accept(w, r, nil) + if err != nil { + t.Logf("mock node %d: accept: %v", idx, err) + return + } + defer conn.CloseNow() + mn.onConnectCount.Add(1) + if mn.suppressServerInfo.Load() { + // Hold the upgraded connection open without writing + // SERVER_INFO so the client's read times out. 
+ <-r.Context().Done() + return + } + frame := buildServerInfoFrame(qwpVersion, 0, + mn.role, uint64(idx+1), 0, time.Now().UnixNano(), + mn.clusterId, mn.nodeId) + if err := conn.Write(r.Context(), websocket.MessageBinary, frame); err != nil { + t.Logf("mock node %d: SERVER_INFO write: %v", idx, err) + return + } + // Stamp the negotiated version on every frame the mock + // writes — the cluster advertises qwpVersion in + // X-QWP-Version (see above), and the decoder's + // strict-equality version check rejects frames whose header + // version byte does not match the negotiated version. + mc := &qwpMockEgressConn{t: t, conn: conn, version: qwpVersion} + if handler != nil { + handler(idx, mc) + } else { + // Default: stay alive until the connection drops. + for { + if _, _, err := conn.Read(r.Context()); err != nil { + return + } + } + } + })) + cluster.nodes = append(cluster.nodes, mn) + } + t.Cleanup(func() { + for _, n := range cluster.nodes { + n.alive.Store(false) + n.srv.Close() + } + }) + return cluster +} + +// rolesPrimaryReplicaReplica produces the standard tag closure for +// failover tests where the first node is the primary and the rest +// are replicas. Mirrors the typical QuestDB cluster topology. +func rolesPrimaryReplicaReplica() func(int) (byte, string, string) { + return func(idx int) (byte, string, string) { + if idx == 0 { + return qwpRolePrimary, fmt.Sprintf("node-%d", idx), "test-cluster" + } + return qwpRoleReplica, fmt.Sprintf("node-%d", idx), "test-cluster" + } +} + +// rolesAllReplicas tags every node REPLICA — used to test +// QwpRoleMismatchError when target=primary cannot find a match. +func rolesAllReplicas() func(int) (byte, string, string) { + return func(idx int) (byte, string, string) { + return qwpRoleReplica, fmt.Sprintf("replica-%d", idx), "test-cluster" + } +} + +// --- Tests --- + +// TestQwpClientConnectsToFirstMatchingTarget verifies that the +// connect walk binds to the first endpoint whose role passes the +// filter. 
With target=primary and a primary-then-replicas cluster, +// the client picks node 0. +func TestQwpClientConnectsToFirstMatchingTarget(t *testing.T) { + cluster := newMockCluster(t, 3, rolesPrimaryReplicaReplica(), nil) + + cfg := qwpQueryDefaultConfig() + eps, err := parseEndpointList(cluster.addrList(), qwpDefaultPort) + if err != nil { + t.Fatalf("parseEndpointList: %v", err) + } + cfg.endpoints = eps + cfg.target = qwpTargetPrimary + cfg.serverInfoTimeout = 2 * time.Second + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + c, err := newQwpQueryClient(ctx, cfg) + if err != nil { + t.Fatalf("newQwpQueryClient: %v", err) + } + defer c.Close(ctx) + + if c.CurrentEndpoint() != cluster.nodes[0].addr() { + t.Errorf("currentEndpoint = %s, want %s", + c.CurrentEndpoint(), cluster.nodes[0].addr()) + } + info := c.ServerInfo() + if info == nil { + t.Fatal("ServerInfo nil after v2 connect") + } + if info.Role != qwpRolePrimary { + t.Errorf("role = %s, want PRIMARY", info.RoleName()) + } + if info.NodeId != "node-0" { + t.Errorf("nodeId = %q, want node-0", info.NodeId) + } +} + +// TestQwpClientWalksPastReplicasToPrimary verifies that the walk +// skips role-mismatched endpoints and lands on the first matching +// one further down the list. +func TestQwpClientWalksPastReplicasToPrimary(t *testing.T) { + // Two replicas first, then a primary at index 2. 
+ cluster := newMockCluster(t, 3, func(idx int) (byte, string, string) { + role := qwpRoleReplica + if idx == 2 { + role = qwpRolePrimary + } + return role, fmt.Sprintf("node-%d", idx), "test-cluster" + }, nil) + + cfg := qwpQueryDefaultConfig() + eps, _ := parseEndpointList(cluster.addrList(), qwpDefaultPort) + cfg.endpoints = eps + cfg.target = qwpTargetPrimary + cfg.serverInfoTimeout = 2 * time.Second + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + c, err := newQwpQueryClient(ctx, cfg) + if err != nil { + t.Fatalf("newQwpQueryClient: %v", err) + } + defer c.Close(ctx) + + if c.CurrentEndpoint() != cluster.nodes[2].addr() { + t.Errorf("bound to %s, want %s (node-2 is the primary)", + c.CurrentEndpoint(), cluster.nodes[2].addr()) + } +} + +// TestQwpClientRoleMismatchSurfacesTypedError verifies that the walk +// returns *QwpRoleMismatchError with the last observed SERVER_INFO +// when target=primary but every endpoint reports REPLICA. +func TestQwpClientRoleMismatchSurfacesTypedError(t *testing.T) { + cluster := newMockCluster(t, 2, rolesAllReplicas(), nil) + + cfg := qwpQueryDefaultConfig() + eps, _ := parseEndpointList(cluster.addrList(), qwpDefaultPort) + cfg.endpoints = eps + cfg.target = qwpTargetPrimary + cfg.serverInfoTimeout = 2 * time.Second + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _, err := newQwpQueryClient(ctx, cfg) + if err == nil { + t.Fatal("expected QwpRoleMismatchError") + } + var rme *QwpRoleMismatchError + if !errors.As(err, &rme) { + t.Fatalf("err = %v (%T), want *QwpRoleMismatchError", err, err) + } + if rme.Target != "primary" { + t.Errorf("Target = %q, want primary", rme.Target) + } + if rme.LastObserved == nil { + t.Fatal("LastObserved should be populated") + } + if rme.LastObserved.Role != qwpRoleReplica { + t.Errorf("LastObserved.Role = %s, want REPLICA", + rme.LastObserved.RoleName()) + } + if !strings.Contains(rme.Error(), "primary") { + 
t.Errorf("Error string %q missing target", rme.Error()) + } +} + +// TestQwpClientInitialConnectProbesEachEndpointOnce pins the +// initial-connect contract after the host-tracker rewrite: the single +// fall-through BeginRound(forgetClassifications=true) re-sweep is +// reconnect-only (allowFallthroughReset). On initial connect a +// uniformly role-rejecting cluster must probe each endpoint exactly +// once and then fail — not double every probe by re-sweeping the same +// just-rejected hosts. Go analog of Java +// QwpQueryClientMultiHostFailoverTest.testConnectDoesNotDoubleWalkOnFirstFailure. +func TestQwpClientInitialConnectProbesEachEndpointOnce(t *testing.T) { + cluster := newMockCluster(t, 3, rolesAllReplicas(), nil) + + cfg := qwpQueryDefaultConfig() + eps, _ := parseEndpointList(cluster.addrList(), qwpDefaultPort) + cfg.endpoints = eps + cfg.target = qwpTargetPrimary // every endpoint REPLICA → all role-reject + cfg.serverInfoTimeout = 2 * time.Second + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _, err := newQwpQueryClient(ctx, cfg) + if err == nil { + t.Fatal("expected role-mismatch error when all endpoints are replicas") + } + var rme *QwpRoleMismatchError + if !errors.As(err, &rme) { + t.Fatalf("err = %v (%T), want *QwpRoleMismatchError", err, err) + } + for i := range cluster.nodes { + if got := cluster.nodes[i].onConnectCount.Load(); got != 1 { + t.Errorf("endpoint idx=%d upgraded %d times, want exactly 1 "+ + "(no fall-through re-sweep on initial connect)", i, got) + } + } +} + +// TestQwpClientRoleMismatchPreservesTransportError verifies that when +// the connect walk encounters a mix of transport failures and role +// mismatches under target=primary, the returned QwpRoleMismatchError +// carries both the last observed SERVER_INFO and the last underlying +// transport error so callers can tell network problems from pure role +// mismatch and reach the dial error via errors.As / Unwrap. 
+func TestQwpClientRoleMismatchPreservesTransportError(t *testing.T) { + // Endpoint A: refuses the WebSocket upgrade with 503 — generates a + // transport-level dial error. + srvFail := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusServiceUnavailable) + })) + defer srvFail.Close() + // Endpoint B: a healthy REPLICA — accepted at the transport layer + // but rejected by the target=primary filter, so it lands as a role + // mismatch with an observed SERVER_INFO. + srvReplica := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set(qwpHeaderVersion, fmt.Sprintf("%d", qwpVersion)) + conn, err := websocket.Accept(w, r, nil) + if err != nil { + return + } + defer conn.CloseNow() + info := buildServerInfoFrame(qwpVersion, 0, qwpRoleReplica, + 1, 0, time.Now().UnixNano(), "test-cluster", "node-replica") + if err := conn.Write(r.Context(), websocket.MessageBinary, info); err != nil { + return + } + for { + if _, _, err := conn.Read(r.Context()); err != nil { + return + } + } + })) + defer srvReplica.Close() + + addrList := strings.TrimPrefix(srvFail.URL, "http://") + "," + + strings.TrimPrefix(srvReplica.URL, "http://") + cfg := qwpQueryDefaultConfig() + eps, err := parseEndpointList(addrList, qwpDefaultPort) + if err != nil { + t.Fatalf("parseEndpointList: %v", err) + } + cfg.endpoints = eps + cfg.target = qwpTargetPrimary + cfg.serverInfoTimeout = 500 * time.Millisecond + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _, err = newQwpQueryClient(ctx, cfg) + if err == nil { + t.Fatal("expected QwpRoleMismatchError") + } + var rme *QwpRoleMismatchError + if !errors.As(err, &rme) { + t.Fatalf("err = %v (%T), want *QwpRoleMismatchError", err, err) + } + if rme.LastObserved == nil || rme.LastObserved.Role != qwpRoleReplica { + t.Errorf("LastObserved = %+v, want the REPLICA endpoint's SERVER_INFO", + rme.LastObserved) + } + 
if rme.LastTransportError == nil { + t.Fatal("LastTransportError = nil, want the dial failure from the 503 endpoint") + } + if !errors.Is(err, rme.LastTransportError) { + t.Errorf("errors.Is(err, LastTransportError) = false, want true via Unwrap") + } + if !strings.Contains(rme.Error(), "last transport error") { + t.Errorf("Error string %q missing transport-error hint", rme.Error()) + } +} + +// TestQwpClientPrimaryAcceptsStandalone verifies the OSS-friendly +// rule that target=primary also accepts STANDALONE — the role v1 +// servers report when replication is not configured. Without this, +// every single-node OSS deployment would refuse target=primary. +func TestQwpClientPrimaryAcceptsStandalone(t *testing.T) { + cluster := newMockCluster(t, 1, func(int) (byte, string, string) { + return qwpRoleStandalone, "solo", "oss" + }, nil) + cfg := qwpQueryDefaultConfig() + eps, _ := parseEndpointList(cluster.addrList(), qwpDefaultPort) + cfg.endpoints = eps + cfg.target = qwpTargetPrimary + cfg.serverInfoTimeout = 2 * time.Second + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + c, err := newQwpQueryClient(ctx, cfg) + if err != nil { + t.Fatalf("newQwpQueryClient: %v", err) + } + defer c.Close(ctx) + if c.ServerInfo().Role != qwpRoleStandalone { + t.Errorf("role = %s, want STANDALONE", c.ServerInfo().RoleName()) + } +} + +// TestQwpFailoverYieldsResetThenResumes drives the full transparent- +// failover happy path: the first server emits a transport failure +// mid-query, the client reconnects to the second server and replays +// the QUERY_REQUEST, and the iterator yields *QwpFailoverReset +// followed by the new generation's batches. 
+func TestQwpFailoverYieldsResetThenResumes(t *testing.T) { + type nodeState struct { + failOnce atomic.Bool + } + states := make([]*nodeState, 2) + for i := range states { + states[i] = &nodeState{} + } + // Node 0 fails the first connection's query (closes the conn + // after reading QUERY_REQUEST). Node 1 serves successfully. + cluster := newMockCluster(t, 2, rolesPrimaryReplicaReplica(), + func(idx int, m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _, frame, err := m.conn.Read(ctx) + _ = frame + if err != nil { + return + } + if idx == 0 && states[0].failOnce.CompareAndSwap(false, true) { + // Force a transport-terminal failure. + m.conn.Close(websocket.StatusInternalError, "simulated fault") + return + } + // Node 1: respond with one batch then RESULT_END. + frameBytes := buildOneRowInt64Batch(t, 1, 0, "v", 99) + m.sendBinary(ctx, frameBytes) + m.sendBinary(ctx, writeQwpFrame(0, + buildResultEndBody(1, 0, 1))) + // Hold open so client can close cleanly. 
+ for { + if _, _, err := m.conn.Read(ctx); err != nil { + return + } + } + }) + + cfg := qwpQueryDefaultConfig() + eps, _ := parseEndpointList(cluster.addrList(), qwpDefaultPort) + cfg.endpoints = eps + cfg.target = qwpTargetAny + cfg.serverInfoTimeout = 2 * time.Second + cfg.failoverEnabled = true + cfg.failoverMaxAttempts = 3 + cfg.failoverBackoffInitial = 1 * time.Millisecond + cfg.failoverBackoffMax = 10 * time.Millisecond + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + c, err := newQwpQueryClient(ctx, cfg) + if err != nil { + t.Fatalf("newQwpQueryClient: %v", err) + } + defer c.Close(ctx) + + q := c.Query(ctx, "select v from t") + defer q.Close() + + var ( + gotReset bool + gotBatch bool + ) + for batch, err := range q.Batches() { + if err != nil { + var reset *QwpFailoverReset + if errors.As(err, &reset) { + gotReset = true + if reset.NewNode == nil || reset.NewNode.NodeId != "node-1" { + t.Errorf("reset.NewNode = %+v, want node-1", reset.NewNode) + } + continue + } + t.Fatalf("unexpected error: %v", err) + } + gotBatch = true + if got := batch.Int64(0, 0); got != 99 { + t.Errorf("batch value = %d, want 99", got) + } + } + + if !gotReset { + t.Error("expected *QwpFailoverReset yield, got none") + } + if !gotBatch { + t.Error("expected to receive a batch from the new generation") + } + if c.CurrentEndpoint() != cluster.nodes[1].addr() { + t.Errorf("after failover bound to %s, want %s", + c.CurrentEndpoint(), cluster.nodes[1].addr()) + } +} + +// TestQwpFailoverRetriesSoleTargetMatchInsteadOfRoleMismatch pins the +// host-tracker reconnect contract (failover.md §2 / wire-egress.md +// §11.9.3, Java reconnectViaTracker): RecordMidStreamFailure demotes +// the just-failed endpoint to TransportError but it stays a candidate. 
+// With three endpoints where only the middle one passes target=primary +// and that sole primary flaps, the reconnect walk prefers the +// healthier/role-rejected peers first but, finding none of them +// target-acceptable, rebinds the demoted-but-only primary rather than +// declaring a role mismatch — a primary demonstrably exists, it is +// just dropping the connection. The query therefore yields +// *QwpFailoverReset events and finally exhausts the attempt budget as +// *QwpFailoverExhaustedError, NOT *QwpRoleMismatchError. +// +// This replaces the pre-failover-spec TestQwpFailoverSkipsJustFailed- +// Endpoint, whose "skip the failed index for one walk, then surface a +// role mismatch" assertion described the (failedIdx+1)%n modulo walk +// that the tracker rewrite removed. +func TestQwpFailoverRetriesSoleTargetMatchInsteadOfRoleMismatch(t *testing.T) { + // idx=0 REPLICA, idx=1 PRIMARY, idx=2 REPLICA. Only the primary + // passes target=primary, so initial bind lands on idx=1. + cluster := newMockCluster(t, 3, func(idx int) (byte, string, string) { + role := qwpRoleReplica + if idx == 1 { + role = qwpRolePrimary + } + return role, fmt.Sprintf("node-%d", idx), "test-cluster" + }, + func(idx int, m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + // Drain the QUERY_REQUEST then close the socket to simulate + // a transport-terminal fault. 
+ _, _, _ = m.conn.Read(ctx) + m.conn.Close(websocket.StatusInternalError, "simulated fault") + }) + + cfg := qwpQueryDefaultConfig() + eps, _ := parseEndpointList(cluster.addrList(), qwpDefaultPort) + cfg.endpoints = eps + cfg.target = qwpTargetPrimary + cfg.serverInfoTimeout = 2 * time.Second + cfg.failoverEnabled = true + cfg.failoverMaxAttempts = 5 + cfg.failoverBackoffInitial = 1 * time.Millisecond + cfg.failoverBackoffMax = 10 * time.Millisecond + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + c, err := newQwpQueryClient(ctx, cfg) + if err != nil { + t.Fatalf("newQwpQueryClient: %v", err) + } + defer c.Close(ctx) + + if c.CurrentEndpoint() != cluster.nodes[1].addr() { + t.Fatalf("initial bind = %s, want %s (the only primary)", + c.CurrentEndpoint(), cluster.nodes[1].addr()) + } + + q := c.Query(ctx, "select v from t") + defer q.Close() + + var ( + resets int + terminalErr error + ) + for _, err := range q.Batches() { + if err == nil { + t.Errorf("unexpected non-error batch from a poisoned connection") + continue + } + var reset *QwpFailoverReset + if errors.As(err, &reset) { + resets++ + // Every reconnect must rebind the sole primary, not a + // replica — the (state, zone) lattice keeps idx=1 the only + // target-acceptable candidate. + if reset.NewNode == nil || reset.NewNode.NodeId != "node-1" { + t.Errorf("reset.NewNode = %+v, want node-1 (the sole primary)", + reset.NewNode) + } + continue + } + terminalErr = err + } + + // The sole primary flaps, so the loop must keep rebinding it and + // yield a reset each time until the attempt budget is spent. 
+ if resets == 0 { + t.Error("expected the sole primary to be rebound (>=1 *QwpFailoverReset)") + } + if terminalErr == nil { + t.Fatal("expected a terminal error after the attempt budget is spent") + } + // A primary exists and was rebound every time, so this is budget + // exhaustion — NOT a role mismatch (the old modulo walk wrongly + // reported "no endpoint matches target=primary" here). + var rme *QwpRoleMismatchError + if errors.As(terminalErr, &rme) { + t.Errorf("terminal err = %v, must NOT be *QwpRoleMismatchError: "+ + "a primary exists and was rebound", terminalErr) + } + var exhausted *QwpFailoverExhaustedError + if !errors.As(terminalErr, &exhausted) { + t.Fatalf("terminal err = %v (%T), want errors.As to match "+ + "*QwpFailoverExhaustedError", terminalErr, terminalErr) + } + if exhausted.Attempts != cfg.failoverMaxAttempts { + t.Errorf("exhausted.Attempts = %d, want %d (failoverMaxAttempts)", + exhausted.Attempts, cfg.failoverMaxAttempts) + } + + // The sole primary is rebound on every reconnect (initial bind + + // one upgrade per failover attempt), proving the tracker demotes + // but does NOT permanently skip it. + if got := cluster.nodes[1].onConnectCount.Load(); got != int64(cfg.failoverMaxAttempts) { + t.Errorf("primary at idx=1 upgraded %d times, want %d "+ + "(rebound every reconnect, not skipped)", + got, cfg.failoverMaxAttempts) + } + // The replicas never become the bound endpoint; they are only ever + // role-rejected, so each is probed at most once. + for _, ri := range []int{0, 2} { + if got := cluster.nodes[ri].onConnectCount.Load(); got > 1 { + t.Errorf("replica at idx=%d upgraded %d times, want <=1 "+ + "(role-rejected, never preferred again)", ri, got) + } + } +} + +// TestQwpFailoverDisabledSurfacesTransportError verifies that with +// failoverEnabled=false, a transport-terminal failure mid-query +// surfaces directly through Batches() instead of triggering replay. 
+func TestQwpFailoverDisabledSurfacesTransportError(t *testing.T) { + cluster := newMockCluster(t, 2, rolesPrimaryReplicaReplica(), + func(idx int, m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _, _, _ = m.conn.Read(ctx) + m.conn.Close(websocket.StatusInternalError, "simulated fault") + }) + + cfg := qwpQueryDefaultConfig() + eps, _ := parseEndpointList(cluster.addrList(), qwpDefaultPort) + cfg.endpoints = eps + cfg.target = qwpTargetAny + cfg.serverInfoTimeout = 2 * time.Second + cfg.failoverEnabled = false + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + c, err := newQwpQueryClient(ctx, cfg) + if err != nil { + t.Fatalf("newQwpQueryClient: %v", err) + } + defer c.Close(ctx) + + q := c.Query(ctx, "select 1") + defer q.Close() + + var sawErr bool + for _, err := range q.Batches() { + if err != nil { + var reset *QwpFailoverReset + if errors.As(err, &reset) { + t.Errorf("got reset with failover disabled") + continue + } + sawErr = true + } + } + if !sawErr { + t.Error("expected transport error to surface, got none") + } +} + +// TestQwpFailoverRespectsMaxAttempts verifies that after exhausting +// failoverMaxAttempts the iterator surfaces a typed +// *QwpFailoverExhaustedError rather than the underlying transport +// error and rather than looping forever. The exhaustion error must +// carry the attempt count and unwrap to the most recent transport +// failure so callers can errors.As against both shapes. +func TestQwpFailoverRespectsMaxAttempts(t *testing.T) { + // Both nodes always fail; max_attempts = 3 means we get 3 + // connect attempts total before giving up. 
+ cluster := newMockCluster(t, 2, rolesPrimaryReplicaReplica(), + func(idx int, m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _, _, _ = m.conn.Read(ctx) + m.conn.Close(websocket.StatusInternalError, "always fail") + }) + + cfg := qwpQueryDefaultConfig() + eps, _ := parseEndpointList(cluster.addrList(), qwpDefaultPort) + cfg.endpoints = eps + cfg.target = qwpTargetAny + cfg.serverInfoTimeout = 1 * time.Second + cfg.failoverEnabled = true + cfg.failoverMaxAttempts = 3 + cfg.failoverBackoffInitial = 1 * time.Millisecond + cfg.failoverBackoffMax = 5 * time.Millisecond + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + c, err := newQwpQueryClient(ctx, cfg) + if err != nil { + t.Fatalf("newQwpQueryClient: %v", err) + } + defer c.Close(ctx) + + q := c.Query(ctx, "select 1") + defer q.Close() + + var ( + resets int + terminalErrs []error + ) + for _, err := range q.Batches() { + if err == nil { + continue + } + var reset *QwpFailoverReset + if errors.As(err, &reset) { + resets++ + continue + } + terminalErrs = append(terminalErrs, err) + } + if len(terminalErrs) != 1 { + t.Fatalf("terminalErrors = %d, want 1: %v", len(terminalErrs), terminalErrs) + } + // Resets should be < failoverMaxAttempts because the budget + // includes the initial submission. + if resets >= cfg.failoverMaxAttempts { + t.Errorf("resets = %d, expected < failoverMaxAttempts (%d)", + resets, cfg.failoverMaxAttempts) + } + // Exhaustion must surface as a typed *QwpFailoverExhaustedError + // so callers can distinguish "ran out of retries" from "first + // attempt failed". The message MUST identify exhaustion and + // SHOULD carry the attempt count and the most recent + // transport-failure message — assert all three. 
+ terminalErr := terminalErrs[0] + var exhausted *QwpFailoverExhaustedError + if !errors.As(terminalErr, &exhausted) { + t.Fatalf("terminal err = %v (%T), want errors.As to match *QwpFailoverExhaustedError", + terminalErr, terminalErr) + } + if exhausted.Attempts != cfg.failoverMaxAttempts { + t.Errorf("exhausted.Attempts = %d, want %d (failoverMaxAttempts)", + exhausted.Attempts, cfg.failoverMaxAttempts) + } + if exhausted.LastError == nil { + t.Error("exhausted.LastError = nil, want the underlying transport error") + } + if !strings.Contains(terminalErr.Error(), "failover exhausted") { + t.Errorf("terminal err = %q, want it to identify failover exhaustion", + terminalErr.Error()) + } + if !strings.Contains(terminalErr.Error(), "last error:") { + t.Errorf("terminal err = %q, want it to include the last transport-failure message", + terminalErr.Error()) + } +} + +// TestQwpFailoverRespectsMaxDuration verifies that the wall-clock +// failover budget (failover_max_duration_ms) ends the loop even when +// failoverMaxAttempts is set high enough that the attempt cap would +// never fire. Exhaustion must still surface as a typed +// *QwpFailoverExhaustedError, and the attempt count must be far below +// the attempt cap — proving the duration budget, not the attempt cap, +// was the binding constraint. Mirrors Java's combined give-up test +// (attempt >= max || now >= deadline) at QwpQueryClient.java:1541. +func TestQwpFailoverRespectsMaxDuration(t *testing.T) { + // Both nodes always fail; the attempt cap is set absurdly high so + // only the wall-clock budget can end the loop. 
+ cluster := newMockCluster(t, 2, rolesPrimaryReplicaReplica(), + func(idx int, m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _, _, _ = m.conn.Read(ctx) + m.conn.Close(websocket.StatusInternalError, "always fail") + }) + + cfg := qwpQueryDefaultConfig() + eps, _ := parseEndpointList(cluster.addrList(), qwpDefaultPort) + cfg.endpoints = eps + cfg.target = qwpTargetAny + cfg.serverInfoTimeout = 1 * time.Second + cfg.failoverEnabled = true + cfg.failoverMaxAttempts = 100000 // never the binding constraint + cfg.failoverBackoffInitial = 5 * time.Millisecond + cfg.failoverBackoffMax = 20 * time.Millisecond + cfg.failoverMaxDuration = 80 * time.Millisecond + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + c, err := newQwpQueryClient(ctx, cfg) + if err != nil { + t.Fatalf("newQwpQueryClient: %v", err) + } + defer c.Close(ctx) + + q := c.Query(ctx, "select 1") + defer q.Close() + + start := time.Now() + var terminalErrs []error + for _, err := range q.Batches() { + if err == nil { + continue + } + var reset *QwpFailoverReset + if errors.As(err, &reset) { + continue + } + terminalErrs = append(terminalErrs, err) + } + elapsed := time.Since(start) + + if len(terminalErrs) != 1 { + t.Fatalf("terminalErrors = %d, want 1: %v", len(terminalErrs), terminalErrs) + } + terminalErr := terminalErrs[0] + var exhausted *QwpFailoverExhaustedError + if !errors.As(terminalErr, &exhausted) { + t.Fatalf("terminal err = %v (%T), want errors.As to match *QwpFailoverExhaustedError", + terminalErr, terminalErr) + } + // The duration cap, not the attempt cap, must have ended the loop: + // attempts must be >= 1 and nowhere near failoverMaxAttempts. 
+ if exhausted.Attempts < 1 || exhausted.Attempts >= cfg.failoverMaxAttempts { + t.Errorf("exhausted.Attempts = %d, want in [1, %d) — duration budget should bind first", + exhausted.Attempts, cfg.failoverMaxAttempts) + } + if exhausted.LastError == nil { + t.Error("exhausted.LastError = nil, want the underlying transport error") + } + if !strings.Contains(terminalErr.Error(), "failover exhausted") { + t.Errorf("terminal err = %q, want it to identify failover exhaustion", + terminalErr.Error()) + } + if !strings.Contains(terminalErr.Error(), "last error:") { + t.Errorf("terminal err = %q, want it to include the last transport-failure message", + terminalErr.Error()) + } + // Sanity: giving up on the wall-clock budget must be prompt, not a + // run through 100000 attempts. Generous bound to stay non-flaky on + // loaded CI while still catching a broken/missing deadline check. + if elapsed > 3*time.Second { + t.Errorf("failover took %v, want prompt give-up on the ~80ms budget", elapsed) + } +} + +// TestQwpQueryErrorIsNotRetried verifies the kind-split contract: +// a server-emitted QUERY_ERROR (e.g. a SQL parse error) surfaces +// directly to the user without any failover attempt, even with +// failover enabled. Only client-side transport-terminal events +// trigger the reconnect path. +func TestQwpQueryErrorIsNotRetried(t *testing.T) { + connectCount := atomic.Int64{} + cluster := newMockCluster(t, 2, rolesPrimaryReplicaReplica(), + func(idx int, m *qwpMockEgressConn) { + connectCount.Add(1) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _, frame, err := m.conn.Read(ctx) + _ = frame + if err != nil { + return + } + // Send QUERY_ERROR with status=ParseError. The kind- + // split routes this to the user, not to the failover + // loop. 
+ body := []byte{byte(qwpMsgKindQueryError)} + body = appendInt64LE(body, 1) // requestId + body = append(body, byte(QwpStatusParseError)) + msg := "syntax error" + body = appendUint16LE(body, uint16(len(msg))) + body = append(body, msg...) + m.sendBinary(ctx, writeQwpFrame(0, body)) + // Hold open. + for { + if _, _, err := m.conn.Read(ctx); err != nil { + return + } + } + }) + + cfg := qwpQueryDefaultConfig() + eps, _ := parseEndpointList(cluster.addrList(), qwpDefaultPort) + cfg.endpoints = eps + cfg.target = qwpTargetAny + cfg.serverInfoTimeout = 2 * time.Second + cfg.failoverEnabled = true + cfg.failoverMaxAttempts = 5 + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + c, err := newQwpQueryClient(ctx, cfg) + if err != nil { + t.Fatalf("newQwpQueryClient: %v", err) + } + defer c.Close(ctx) + + q := c.Query(ctx, "select bogus") + defer q.Close() + + var qe *QwpQueryError + var resetCount int + for _, err := range q.Batches() { + if err == nil { + continue + } + var r *QwpFailoverReset + if errors.As(err, &r) { + resetCount++ + continue + } + errors.As(err, &qe) + } + // Initial connect = 1, no replay attempts. + if got := connectCount.Load(); got != 1 { + t.Errorf("connectCount = %d, want 1 (no failover for QUERY_ERROR)", got) + } + if resetCount != 0 { + t.Errorf("resetCount = %d, want 0", resetCount) + } + if qe == nil { + t.Fatal("expected *QwpQueryError, got none") + } + if qe.Status != QwpStatusParseError { + t.Errorf("status = 0x%02X, want PARSE_ERROR", byte(qe.Status)) + } +} + +// TestQwpExecDefaultDoesNotReplayOnTransportDrop verifies that with +// replayExec=false (the default), a transport drop mid-Exec does NOT +// reconnect-and-resubmit: the (possibly already-applied) statement is +// never silently re-executed on a fresh connection. The caller gets a +// raw transport error (not *QwpFailoverReset) and the second node is +// never contacted. 
+func TestQwpExecDefaultDoesNotReplayOnTransportDrop(t *testing.T) { + first := atomic.Bool{} + cluster := newMockCluster(t, 2, rolesPrimaryReplicaReplica(), + func(idx int, m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _, _, _ = m.conn.Read(ctx) + if idx == 0 && first.CompareAndSwap(false, true) { + // Simulate the server having committed the INSERT, then + // the transport dropping before the EXEC_DONE ack lands. + m.conn.Close(websocket.StatusInternalError, "fault") + return + } + // Reaching any node other than node 0's first connection + // means the client reconnected and re-sent the INSERT — + // exactly the silent double-execution replay_exec=off must + // prevent. Fail loudly from the server goroutine. + t.Errorf("node %d received a connection: Exec replayed a "+ + "non-idempotent statement with replay_exec=off", idx) + for { + if _, _, err := m.conn.Read(ctx); err != nil { + return + } + } + }) + + cfg := qwpQueryDefaultConfig() + eps, _ := parseEndpointList(cluster.addrList(), qwpDefaultPort) + cfg.endpoints = eps + cfg.target = qwpTargetAny + cfg.serverInfoTimeout = 2 * time.Second + cfg.failoverEnabled = true + cfg.failoverMaxAttempts = 3 + cfg.failoverBackoffInitial = 1 * time.Millisecond + cfg.failoverBackoffMax = 5 * time.Millisecond + cfg.replayExec = false + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + c, err := newQwpQueryClient(ctx, cfg) + if err != nil { + t.Fatalf("newQwpQueryClient: %v", err) + } + defer c.Close(ctx) + + _, err = c.Exec(ctx, "INSERT INTO t VALUES (1)") + if err == nil { + t.Fatal("expected a transport error from Exec with replayExec=false") + } + // The error must NOT be a failover reset: surfacing one would imply + // a successful reconnect-and-replay happened. 
+ var reset *QwpFailoverReset + if errors.As(err, &reset) { + t.Fatalf("err is *QwpFailoverReset (%v): replay_exec=off must "+ + "not reconnect-and-replay a non-idempotent Exec", err) + } + // Nor a failover-exhausted error: we must bail before any retry + // budget is consumed, not after exhausting it. + var exhausted *QwpFailoverExhaustedError + if errors.As(err, &exhausted) { + t.Fatalf("err is *QwpFailoverExhaustedError (%v): replay_exec=off "+ + "must not enter the retry loop at all", err) + } + // Proof the statement was not re-sent: node 0 was connected exactly + // once (initial connect, then faulted) and node 1 was never reached. + if got := cluster.nodes[0].onConnectCount.Load(); got != 1 { + t.Errorf("node 0 connectCount = %d, want 1 (single submit, no replay)", got) + } + if got := cluster.nodes[1].onConnectCount.Load(); got != 0 { + t.Errorf("node 1 connectCount = %d, want 0 (no reconnect)", got) + } +} + +// TestQwpExecOptInReplaysTransparently verifies that with +// replayExec=true, Exec retries transparently on transport drop and +// returns the new generation's ExecResult to the caller. 
+func TestQwpExecOptInReplaysTransparently(t *testing.T) { + first := atomic.Bool{} + cluster := newMockCluster(t, 2, rolesPrimaryReplicaReplica(), + func(idx int, m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _, _, _ = m.conn.Read(ctx) + if idx == 0 && first.CompareAndSwap(false, true) { + m.conn.Close(websocket.StatusInternalError, "fault") + return + } + body := []byte{byte(qwpMsgKindExecDone)} + body = appendInt64LE(body, 2) // replay requestId + body = append(body, QwpOpTypeInsert) // op_type + body = append(body, 1) // rowsAffected varint = 1 + m.sendBinary(ctx, writeQwpFrame(0, body)) + for { + if _, _, err := m.conn.Read(ctx); err != nil { + return + } + } + }) + + cfg := qwpQueryDefaultConfig() + eps, _ := parseEndpointList(cluster.addrList(), qwpDefaultPort) + cfg.endpoints = eps + cfg.target = qwpTargetAny + cfg.serverInfoTimeout = 2 * time.Second + cfg.failoverEnabled = true + cfg.failoverMaxAttempts = 3 + cfg.failoverBackoffInitial = 1 * time.Millisecond + cfg.failoverBackoffMax = 5 * time.Millisecond + cfg.replayExec = true + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + c, err := newQwpQueryClient(ctx, cfg) + if err != nil { + t.Fatalf("newQwpQueryClient: %v", err) + } + defer c.Close(ctx) + + res, err := c.Exec(ctx, "INSERT INTO t VALUES (1)") + if err != nil { + t.Fatalf("Exec failed unexpectedly: %v", err) + } + // target=any binds endpoint 0 (the primary) first, and the mock + // faults its first connection — so a successful Exec here can only + // mean the replay path actually ran. Assert the fault fired, + // otherwise the test would pass vacuously if the reconnect logic + // regressed into never hitting the faulted node. 
+ if !first.Load() { + t.Fatal("primary's first connection was never faulted; replay path not exercised") + } + // The replayed EXEC_DONE must decode into the result Exec returns — + // distinctive values prove the frame flowed through, not a zero + // value from some short-circuit. + if res.OpType != QwpOpTypeInsert { + t.Errorf("OpType = %d, want %d (QwpOpTypeInsert)", res.OpType, QwpOpTypeInsert) + } + if res.RowsAffected != 1 { + t.Errorf("RowsAffected = %d, want 1", res.RowsAffected) + } +} + +// TestQwpFailoverCancelDuringBackoff verifies that Cancel during the +// failover backoff sleep aborts the replay rather than completing +// the wait. Uses a small but non-trivial backoff so the cancel +// observably interrupts the sleep. +func TestQwpFailoverCancelDuringBackoff(t *testing.T) { + cluster := newMockCluster(t, 2, rolesPrimaryReplicaReplica(), + func(idx int, m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _, _, _ = m.conn.Read(ctx) + m.conn.Close(websocket.StatusInternalError, "always fail") + }) + + cfg := qwpQueryDefaultConfig() + eps, _ := parseEndpointList(cluster.addrList(), qwpDefaultPort) + cfg.endpoints = eps + cfg.target = qwpTargetAny + cfg.serverInfoTimeout = 1 * time.Second + cfg.failoverEnabled = true + cfg.failoverMaxAttempts = 5 + cfg.failoverBackoffInitial = 200 * time.Millisecond + cfg.failoverBackoffMax = 200 * time.Millisecond + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + c, err := newQwpQueryClient(ctx, cfg) + if err != nil { + t.Fatalf("newQwpQueryClient: %v", err) + } + defer c.Close(ctx) + + q := c.Query(ctx, "select 1") + defer q.Close() + + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + // Wait for the first reset to occur, then cancel. 
+ time.Sleep(50 * time.Millisecond) + q.Cancel() + }() + + start := time.Now() + for _, err := range q.Batches() { + _ = err + } + elapsed := time.Since(start) + wg.Wait() + + // Without cancel interruption, the test would burn through the + // full failover budget (5 * 200ms = 1s+). With interruption it + // should exit much faster. + if elapsed > 800*time.Millisecond { + t.Errorf("elapsed = %v, expected fast cancel exit", elapsed) + } +} + +// TestQwpFailoverCancelDuringWalk verifies that Cancel during the +// reconnect's connectWalk phase short-circuits at the next endpoint +// boundary instead of burning a full timeout per remaining endpoint. +// Node 0 succeeds initially and then drops the connection on the +// query; nodes 1..3 hang at SERVER_INFO so each attempted bind costs +// one serverInfoTimeout. Without the boundary cancel poll the walk +// would cost 3 × serverInfoTimeout; with it, the walk exits after one +// timeout once the cancel flag is observed. +func TestQwpFailoverCancelDuringWalk(t *testing.T) { + cluster := newMockCluster(t, 4, rolesPrimaryReplicaReplica(), + func(idx int, m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + if idx == 0 { + // Drain the QUERY_REQUEST then close to simulate a + // transport-terminal fault. + _, _, _ = m.conn.Read(ctx) + m.conn.Close(websocket.StatusInternalError, "simulated fault") + return + } + // Nodes 1..3 never reach the handler — suppressServerInfo + // holds them at the upgrade barrier. Defensive idle loop. 
+ for { + if _, _, err := m.conn.Read(ctx); err != nil { + return + } + } + }) + for i := 1; i < 4; i++ { + cluster.nodes[i].suppressServerInfo.Store(true) + } + + cfg := qwpQueryDefaultConfig() + eps, _ := parseEndpointList(cluster.addrList(), qwpDefaultPort) + cfg.endpoints = eps + cfg.target = qwpTargetAny + cfg.serverInfoTimeout = 500 * time.Millisecond + cfg.failoverEnabled = true + cfg.failoverMaxAttempts = 3 + cfg.failoverBackoffInitial = 1 * time.Millisecond + cfg.failoverBackoffMax = 10 * time.Millisecond + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + c, err := newQwpQueryClient(ctx, cfg) + if err != nil { + t.Fatalf("newQwpQueryClient: %v", err) + } + defer c.Close(ctx) + + q := c.Query(ctx, "select 1") + defer q.Close() + + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + // Cancel well before the first slow endpoint's timeout fires + // so the boundary poll has the flag set when the walk + // progresses to the second slow endpoint. + time.Sleep(100 * time.Millisecond) + q.Cancel() + }() + + start := time.Now() + for _, err := range q.Batches() { + _ = err + } + elapsed := time.Since(start) + wg.Wait() + + // Without the boundary poll the first walk visits all three slow + // endpoints (3 × 500ms = 1.5s); with it the walk exits after the + // first endpoint's timeout (~500ms) plus negligible overhead. Use + // 1s as the threshold to give CI machines headroom while still + // distinguishing the two regimes. + if elapsed > 1*time.Second { + t.Errorf("elapsed = %v, expected boundary cancel after one endpoint timeout", elapsed) + } +} + +// TestQwpComputeBackoffFullJitter verifies the egress backoff is +// full-jitter [0, base) per failover.md §3.1 (Java reference +// QwpQueryClient.java:1557-1568): the 1-based double-on-each-step +// schedule, capped at max, sets the ceiling; the returned sleep is +// drawn uniformly below it so co-tenants don't dial in lockstep. 
+// Sampling-based — it asserts the [0, base) envelope and that the +// draw genuinely spans it, which rules out a regression to a +// deterministic schedule (old behaviour: always == base) or to the +// ingress equal-jitter shape [base, 2·base). +func TestQwpComputeBackoffFullJitter(t *testing.T) { + cfg := &qwpQueryClientConfig{ + failoverBackoffInitial: 50 * time.Millisecond, + failoverBackoffMax: 1 * time.Second, + } + // base is the pre-jitter ceiling: initial doubled per step, + // capped at max. computeBackoff must return a draw in [0, base). + bases := []struct { + attempt int + base time.Duration + }{ + {1, 50 * time.Millisecond}, + {2, 100 * time.Millisecond}, + {3, 200 * time.Millisecond}, + {4, 400 * time.Millisecond}, + {5, 800 * time.Millisecond}, + {6, 1 * time.Second}, // capped + {20, 1 * time.Second}, // capped + } + const samples = 4000 + for _, tc := range bases { + minSeen := tc.base + maxSeen := time.Duration(-1) + for i := 0; i < samples; i++ { + got := computeBackoff(cfg, tc.attempt) + if got < 0 || got >= tc.base { + t.Fatalf("computeBackoff(attempt=%d) = %v, want [0, %v)", + tc.attempt, got, tc.base) + } + if got < minSeen { + minSeen = got + } + if got > maxSeen { + maxSeen = got + } + } + // Full-jitter spans [0, base): across thousands of draws the + // minimum must dip below base/2 and the maximum must rise + // above it. This is the signature that separates full-jitter + // from a deterministic return (min==max==base, also caught by + // the envelope check) and from ingress equal-jitter + // [base, 2·base) (every draw would be >= base). P(all draws + // land on one side of base/2) ≈ 2·2^-4000, so neither bound + // is flaky. 
+ half := tc.base / 2 + if minSeen >= half { + t.Errorf("attempt=%d: min sample %v >= base/2 %v; "+ + "expected full-jitter to dip into [0, base/2)", + tc.attempt, minSeen, half) + } + if maxSeen < half { + t.Errorf("attempt=%d: max sample %v < base/2 %v; "+ + "expected full-jitter to reach into [base/2, base)", + tc.attempt, maxSeen, half) + } + } + + // attempt < 1 means "no sleep before the very first try" — the + // caller has not yet failed an attempt, so there is nothing to + // back off from. Zero, never jittered. + for _, attempt := range []int{0, -1, -100} { + if got := computeBackoff(cfg, attempt); got != 0 { + t.Errorf("computeBackoff(attempt=%d) = %v, want 0 "+ + "(no pre-first-try sleep)", attempt, got) + } + } + + // initial=0 disables backoff entirely, mirroring Java's + // `if (failoverInitialBackoffMs > 0L)` guard. Without the + // early return, the `d <= 0` overflow branch would fall + // through to max for every attempt >= 1. + zeroInitial := &qwpQueryClientConfig{ + failoverBackoffInitial: 0, + failoverBackoffMax: 1 * time.Second, + } + for _, attempt := range []int{0, 1, 2, 5, 100} { + if got := computeBackoff(zeroInitial, attempt); got != 0 { + t.Errorf("computeBackoff(initial=0, attempt=%d) = %v, want 0", + attempt, got) + } + } + + // A non-positive cap collapses the schedule before the jitter + // draw: rand.Int63n(0) panics, so the d <= 0 guard must + // short-circuit to zero. With initial>0 but max=0 the doubling + // result always exceeds max, forcing d to the non-positive cap. + zeroMax := &qwpQueryClientConfig{ + failoverBackoffInitial: 50 * time.Millisecond, + failoverBackoffMax: 0, + } + for _, attempt := range []int{1, 2, 5, 100} { + if got := computeBackoff(zeroMax, attempt); got != 0 { + t.Errorf("computeBackoff(max=0, attempt=%d) = %v, want 0", + attempt, got) + } + } +} + +// gatedQwpServer stands up an httptest WebSocket server that negotiates +// qwpVersion and emits SERVER_INFO only after `release` is +// closed. 
// onReached is closed (once) the moment a connection has been
// upgraded and is parked waiting for the gate — i.e. the client is now
// blocked inside transport.connect()'s SERVER_INFO read, which (on the
// failover path) means reconnectAndReplay is inside connectWalk holding
// c.genMu. After the gate opens it answers every QUERY_REQUEST with a
// RESULT_END so the consumer terminates cleanly, then signals onClosed
// (once) when the client tears the connection down. The onClosed signal
// is the leak probe: only a generation that something calls shutdown()
// on ever closes its WebSocket.
//
// onReached / onClosed are the sync.Once guards for closing the
// reached / closed channels respectively, so repeated connections to
// the same server never double-close a channel.
func gatedQwpServer(t *testing.T, nodeId string, release <-chan struct{},
	onReached, onClosed *sync.Once, reached, closed chan struct{}) *httptest.Server {
	t.Helper()
	return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set(qwpHeaderVersion, fmt.Sprintf("%d", qwpVersion))
		conn, err := websocket.Accept(w, r, nil)
		if err != nil {
			return
		}
		defer conn.CloseNow()
		// Upgrade done: announce that a client is parked at the gate.
		onReached.Do(func() { close(reached) })
		// Hold SERVER_INFO back until the test opens the gate; bail out
		// without signalling onClosed if the request context ends first.
		select {
		case <-release:
		case <-r.Context().Done():
			return
		}
		info := buildServerInfoFrame(qwpVersion, 0, qwpRolePrimary,
			2, 0, time.Now().UnixNano(), "test-cluster", nodeId)
		if err := conn.Write(r.Context(), websocket.MessageBinary, info); err != nil {
			onClosed.Do(func() { close(closed) })
			return
		}
		for {
			typ, frame, err := conn.Read(r.Context())
			if err != nil {
				// Client tore the connection down — the generation that
				// owns this socket had shutdown() called on it.
				onClosed.Do(func() { close(closed) })
				return
			}
			// Ignore anything that is not a binary QUERY_REQUEST long
			// enough to carry the kind byte plus the 8-byte request id
			// read below.
			if typ != websocket.MessageBinary || len(frame) < 9 ||
				frame[0] != byte(qwpMsgKindQueryRequest) {
				continue
			}
			reqId := int64(binary.LittleEndian.Uint64(frame[1:9]))
			end := writeQwpFrame(0, buildResultEndBody(reqId, 0, 0))
			end[4] = qwpVersion // match negotiated version
			if err := conn.Write(r.Context(), websocket.MessageBinary, end); err != nil {
				onClosed.Do(func() { close(closed) })
				return
			}
		}
	}))
}

// TestQwpQueryCloseRacingFailoverDoesNotLeakGeneration is a regression
// test for the close-vs-reconnect leak: Close() running while
// reconnectAndReplay is mid connectWalk used to consume closeOnce
// against the dying generation, after which reconnectAndReplay
// published a fresh generation (reader + dispatcher + waiter goroutines
// + a live WebSocket) that nothing ever called shutdown() on — leaked
// for the process lifetime.
//
// Node A binds initially then drops the connection on the query,
// forcing failover. Node B is the only other candidate and gates its
// SERVER_INFO write, so the test can call Close() with the failover
// provably parked inside connectWalk (holding c.genMu). The fix makes
// Close take c.genMu to set closed + snapshot the bound pair, and makes
// reconnectAndReplay refuse to publish (and self-tear-down) a
// generation built while closing. Either way the failover target's
// WebSocket must end up closed by the client; pre-fix it never was.
func TestQwpQueryCloseRacingFailoverDoesNotLeakGeneration(t *testing.T) {
	// bReleaseGate opens node B's SERVER_INFO gate; bReached / bClosed
	// are closed by node B's handler, guarded by bReachedOnce / bClosed1
	// (the latter is the sync.Once for bClosed, despite its name).
	var (
		bReleaseGate           = make(chan struct{})
		bReached               = make(chan struct{})
		bClosed                = make(chan struct{})
		bReachedOnce, bClosed1 sync.Once
	)

	// Node A: emits SERVER_INFO, reads the QUERY_REQUEST, then drops the
	// socket to simulate a transport-terminal fault and trigger failover.
	nodeA := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set(qwpHeaderVersion, fmt.Sprintf("%d", qwpVersion))
		conn, err := websocket.Accept(w, r, nil)
		if err != nil {
			return
		}
		defer conn.CloseNow()
		info := buildServerInfoFrame(qwpVersion, 0, qwpRolePrimary,
			1, 0, time.Now().UnixNano(), "test-cluster", "node-a")
		if err := conn.Write(r.Context(), websocket.MessageBinary, info); err != nil {
			return
		}
		_, _, _ = conn.Read(r.Context()) // the QUERY_REQUEST
		conn.Close(websocket.StatusInternalError, "simulated fault")
	}))
	defer nodeA.Close()

	nodeB := gatedQwpServer(t, "node-b", bReleaseGate,
		&bReachedOnce, &bClosed1, bReached, bClosed)
	defer nodeB.Close()

	cfg := qwpQueryDefaultConfig()
	// Endpoint order matters: node A first so it wins the initial bind.
	eps, err := parseEndpointList(
		strings.TrimPrefix(nodeA.URL, "http://")+","+
			strings.TrimPrefix(nodeB.URL, "http://"), qwpDefaultPort)
	if err != nil {
		t.Fatalf("parseEndpointList: %v", err)
	}
	cfg.endpoints = eps
	cfg.target = qwpTargetAny
	cfg.serverInfoTimeout = 5 * time.Second
	cfg.failoverEnabled = true
	cfg.failoverMaxAttempts = 3
	cfg.failoverBackoffInitial = 1 * time.Millisecond
	cfg.failoverBackoffMax = 5 * time.Millisecond

	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	c, err := newQwpQueryClient(ctx, cfg)
	if err != nil {
		t.Fatalf("newQwpQueryClient: %v", err)
	}
	// Safety net for early t.Fatal exits; the test also calls Close
	// explicitly further down.
	defer c.Close(ctx)

	if c.CurrentEndpoint() != strings.TrimPrefix(nodeA.URL, "http://") {
		t.Fatalf("initial bind = %s, want node A", c.CurrentEndpoint())
	}

	// Run the query on its own goroutine so the test goroutine stays
	// free to call Close while the failover is in flight.
	var qwg sync.WaitGroup
	qwg.Add(1)
	go func() {
		defer qwg.Done()
		qctx, qcancel := context.WithTimeout(context.Background(), 8*time.Second)
		defer qcancel()
		q := c.Query(qctx, "select 1")
		defer q.Close()
		for _, err := range q.Batches() {
			if err == nil {
				continue
			}
			var reset *QwpFailoverReset
			if errors.As(err, &reset) {
				continue // consume the new generation's frames
			}
			// Any terminal error (incl. the close-during-failover
			// transport error) ends iteration — that is expected here.
			break
		}
	}()

	// Wait until the failover reconnect is provably parked inside
	// connectWalk on node B (holding c.genMu), then Close from another
	// goroutine — the exact interleaving that used to leak.
	select {
	case <-bReached:
	case <-time.After(10 * time.Second):
		t.Fatal("failover did not reach node B")
	}

	// Buffered so the closing goroutine can finish even if the test has
	// already failed and nobody receives the result.
	closeDone := make(chan error, 1)
	go func() {
		cctx, ccancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer ccancel()
		closeDone <- c.Close(cctx)
	}()
	// Best-effort nudge so Close() is blocked on c.genMu while
	// reconnectAndReplay still holds it (the most interesting
	// interleaving). Not a correctness requirement — every interleaving
	// is leak-free post-fix.
	time.Sleep(75 * time.Millisecond)
	close(bReleaseGate)

	// The leak probe: post-fix the freshly built generation is torn
	// down (by Close's snapshot, or by reconnectAndReplay's self-
	// teardown), so node B's WebSocket is closed by the client. Pre-fix
	// nothing ever calls shutdown() on it and this never fires.
	select {
	case <-bClosed:
	case <-time.After(6 * time.Second):
		t.Fatal("regression: failover-target connection was never closed " +
			"by the client — reconnectAndReplay published a generation " +
			"that Close() leaked")
	}

	select {
	case err := <-closeDone:
		if err != nil {
			t.Errorf("Close returned %v, want nil", err)
		}
	case <-time.After(6 * time.Second):
		t.Fatal("Close did not return")
	}

	// Bridge the WaitGroup to a channel so the wait can be bounded.
	done := make(chan struct{})
	go func() { qwg.Wait(); close(done) }()
	select {
	case <-done:
	case <-time.After(8 * time.Second):
		t.Fatal("query goroutine did not unwind after Close")
	}

	if !c.closed.Load() {
		t.Error("client closed flag not set after Close")
	}
	// A query issued on the closed client must fail with an error whose
	// text mentions "closed".
	q := c.Query(ctx, "select 1")
	var sawClosed bool
	for _, err := range q.Batches() {
		if err != nil && strings.Contains(err.Error(), "closed") {
			sawClosed = true
		}
	}
	q.Close()
	if !sawClosed {
		t.Error("Query after Close did not surface a closed-client error")
	}
}
diff --git a/qwp_fuzz_fixture_test.go b/qwp_fuzz_fixture_test.go
new file mode 100644
index 00000000..66db1a00
--- /dev/null
+++ b/qwp_fuzz_fixture_test.go
@@ -0,0 +1,815 @@
/*+*****************************************************************************
 *     ___                  _   ____  ____
 *    / _ \ _   _  ___  ___| |_|  _ \| __ )
 *   | | | | | | |/ _ \/ __| __| | | |  _ \
 *   | |_| | |_| |  __/\__ \ |_| |_| | |_) |
 *    \__\_\\__,_|\___||___/\__|____/|____/
 *
 *  Copyright (c) 2014-2019 Appsicle
 *  Copyright (c) 2019-2026 QuestDB
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 ******************************************************************************/

//go:build !windows

package questdb

// QuestDB server fixture for the QWP fuzz tests. Locates a QuestDB
// distribution, launches it on freshly discovered ports, waits until
// the HTTP service answers /ping, and exposes Stop/Start/Bounce plus
// an /exec SQL helper so the fuzz tests can drive a real server end
// to end.
//
// Server resolution order (first hit wins):
//
//  1. QDB_FUZZ_ADDR=host:httpPort — talk to an already-running server.
//     The fixture does not own its lifecycle, so Bounce is unavailable
//     (bounce-dependent tests skip themselves).
//  2. QDB_JAR=/path/to/questdb-*.jar — launch this jar.
//  3. QDB_REPO=/path/to/questdb — glob core/target for the built
//     questdb-*-SNAPSHOT.jar.
//  4. A sibling ../questdb (or ../../questdb) checkout, same glob.
//
// When none of these resolve (and no JDK is found) the fuzz tests skip,
// so the normal `go test ./...` run on a box without QuestDB stays green.
// The dedicated CI job builds QuestDB from source and sets QDB_REPO.

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"io/fs"
	"net"
	"net/http"
	"net/url"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"syscall"
	"testing"
	"time"
)

// Timing knobs for the fixture. fuzzServerStartTimeout bounds the wait
// for the HTTP endpoint to become ready; the stop timeout and ping
// period are presumably used by stop() and the readiness poll — confirm
// against those helpers.
const (
	fuzzServerStartTimeout = 180 * time.Second
	fuzzServerStopTimeout  = 30 * time.Second
	fuzzServerPingPeriod   = 200 * time.Millisecond
)

// qwpFuzzServer is a launched (or externally provided) QuestDB instance
// shared by every fuzz test in a `go test` run.
type qwpFuzzServer struct {
	// owns is false in QDB_FUZZ_ADDR mode: we connect but never manage
	// the process, so Bounce returns an error.
	owns bool

	// Resolved launcher inputs; left empty when the server is external.
	javaPath string
	jarPath  string

	baseDir string // temp root, removed on stop()
	dataDir string // QuestDB -d directory
	confDir string // dataDir/conf
	logPath string

	// Network coordinates. host and httpPort are always set; the other
	// ports are only populated for fixture-launched servers.
	// NOTE(review): presumably filled in by discoverPorts — confirm.
	host       string
	httpPort   int
	lineTCPort int
	pgPort     int

	// envOverrides is appended to the JVM child's environment so a
	// per-instance fixture can flip server config keys at boot (e.g.
	// QDB_HTTP_RECV_BUFFER_SIZE for the small-buffer fuzz test).
	// nil/empty on the shared singleton; populated by bootSidecarServer.
	envOverrides map[string]string

	// Child-process bookkeeping.
	// NOTE(review): presumably mu guards the fields below it — confirm
	// against start/stop.
	mu      sync.Mutex
	cmd     *exec.Cmd
	waitCh  chan struct{}
	waitErr error
	logFile *os.File
}

// Package-level state for the shared server: fuzzServerOnce makes the
// launch happen at most once, and the other three hold the results of
// that single launchFuzzServer call (instance, skip reason, error).
var (
	fuzzServerOnce   sync.Once
	fuzzServerShared *qwpFuzzServer
	fuzzServerSkip   string
	fuzzServerErr    error
)

// TestMain guarantees the shared fuzz server is torn down once the whole
// package test run finishes. Without this the launched JVM would leak
// past `go test` since Go has no other per-package teardown hook.
func TestMain(m *testing.M) {
	code := m.Run()
	// nil when no test asked for the server (or it never resolved).
	if fuzzServerShared != nil {
		fuzzServerShared.stop()
	}
	// os.Exit skips deferred calls, so teardown must happen above.
	os.Exit(code)
}

// fuzzStrict reports whether an unavailable server must FAIL the test
// instead of skipping it. The dedicated qwp-fuzz CI workflow sets
// QDB_FUZZ_STRICT=1 so a misconfigured build (no jar produced, server
// won't boot, wrong path) is a loud red failure instead of a silent
// green skip that never actually fuzzes anything. The regular
// `go test ./...` run leaves it unset and skips cleanly. It is a
// dedicated opt-in env var, NOT derived from CI=true, because the
// ordinary build.yml job also runs in CI but has no jar and must skip.
//
// The value is trimmed and compared case-insensitively; empty, "0",
// "false" and "no" mean off, anything else means on.
func fuzzStrict() bool {
	v := strings.TrimSpace(strings.ToLower(os.Getenv("QDB_FUZZ_STRICT")))
	return v != "" && v != "0" && v != "false" && v != "no"
}

// fuzzServer returns the shared QuestDB instance, booting it on first
// use. A resolved-but-unstartable server always fails the test.
An +// unresolvable server skips — unless QDB_FUZZ_STRICT is set, in which +// case it fails (see fuzzStrict). +func fuzzServer(t *testing.T) *qwpFuzzServer { + t.Helper() + fuzzServerOnce.Do(func() { + fuzzServerShared, fuzzServerSkip, fuzzServerErr = launchFuzzServer() + }) + if fuzzServerErr != nil { + t.Fatalf("fuzz server failed to start: %v", fuzzServerErr) + } + if fuzzServerSkip != "" { + if fuzzStrict() { + t.Fatalf("QDB_FUZZ_STRICT is set but the fuzz server is unavailable "+ + "(this must run in CI, not skip): %s", fuzzServerSkip) + } + t.Skip(fuzzServerSkip) + } + return fuzzServerShared +} + +// launchFuzzServer resolves and starts a server. The string return is a +// non-empty skip reason when the environment simply isn't set up for +// fuzzing (no error, just nothing to run against). +func launchFuzzServer() (*qwpFuzzServer, string, error) { + if addr := strings.TrimSpace(os.Getenv("QDB_FUZZ_ADDR")); addr != "" { + host, portStr, err := net.SplitHostPort(addr) + if err != nil { + return nil, "", fmt.Errorf("QDB_FUZZ_ADDR %q: %w", addr, err) + } + port, err := strconv.Atoi(portStr) + if err != nil { + return nil, "", fmt.Errorf("QDB_FUZZ_ADDR %q: bad port: %w", addr, err) + } + s := &qwpFuzzServer{owns: false, host: host, httpPort: port} + if err := s.waitHTTPReady(fuzzServerStartTimeout); err != nil { + return nil, "", fmt.Errorf("QDB_FUZZ_ADDR %q not reachable: %w", addr, err) + } + return s, "", nil + } + + javaPath, err := findJava() + if err != nil { + return nil, "no JDK found (set JAVA_HOME or PATH); set QDB_FUZZ_ADDR to use a running server", nil + } + jarPath, err := findQuestDBJar() + if err != nil { + return nil, "no QuestDB jar found (set QDB_JAR, QDB_REPO, or build a sibling ../questdb); or set QDB_FUZZ_ADDR", nil + } + + baseDir, err := os.MkdirTemp("", "qwpfuzz-") + if err != nil { + return nil, "", fmt.Errorf("mkdtemp: %w", err) + } + s := &qwpFuzzServer{ + owns: true, + javaPath: javaPath, + jarPath: jarPath, + baseDir: baseDir, + 
dataDir: filepath.Join(baseDir, "data"), + host: "127.0.0.1", + } + s.confDir = filepath.Join(s.dataDir, "conf") + s.logPath = filepath.Join(s.dataDir, "log", "log.txt") + for _, d := range []string{s.confDir, filepath.Dir(s.logPath)} { + if err := os.MkdirAll(d, 0o755); err != nil { + os.RemoveAll(baseDir) + return nil, "", fmt.Errorf("mkdir %s: %w", d, err) + } + } + // Best-effort: QuestDB serves /exec and QWP without mime.types, but + // copying it (as fixture.py does) silences a startup warning and + // matches the proven layout. + copyMimeTypes(jarPath, s.confDir) + + if err := s.discoverPorts(); err != nil { + os.RemoveAll(baseDir) + return nil, "", err + } + if err := s.start(); err != nil { + log := s.tailLog(4000) + s.stop() + return nil, "", fmt.Errorf("%w\n--- QuestDB log tail ---\n%s", err, log) + } + return s, "", nil +} + +// bootSidecarServer launches a private QuestDB instance for ONE test, +// independent of the shared singleton, with the given env overrides +// applied to the JVM child. Used by fuzz tests that need a server-side +// config knob the singleton doesn't expose (e.g. small recv buffer, +// forced wire fragmentation). The instance is torn down via t.Cleanup. +// +// Requires fixture-launched mode (Java + jar resolvable). Skips in +// QDB_FUZZ_ADDR mode (external server we can't restart with custom +// env). Honours QDB_FUZZ_STRICT exactly like the shared fixture: a +// resolved-but-unstartable server always fails; an unresolved one +// fails under STRICT and skips otherwise. 
+func bootSidecarServer(t *testing.T, envOverrides map[string]string) *qwpFuzzServer { + t.Helper() + if strings.TrimSpace(os.Getenv("QDB_FUZZ_ADDR")) != "" { + t.Skip("sidecar server requires fixture-launched mode (QDB_FUZZ_ADDR is set — external server we can't restart with custom env)") + } + javaPath, err := findJava() + if err != nil { + if fuzzStrict() { + t.Fatalf("QDB_FUZZ_STRICT is set but no JDK is available for sidecar boot: %v", err) + } + t.Skip("no JDK found for sidecar boot") + } + jarPath, err := findQuestDBJar() + if err != nil { + if fuzzStrict() { + t.Fatalf("QDB_FUZZ_STRICT is set but no QuestDB jar is available for sidecar boot: %v", err) + } + t.Skip("no QuestDB jar found for sidecar boot") + } + baseDir, err := os.MkdirTemp("", "qwpfuzz-sidecar-") + if err != nil { + t.Fatalf("sidecar mkdtemp: %v", err) + } + s := &qwpFuzzServer{ + owns: true, + javaPath: javaPath, + jarPath: jarPath, + baseDir: baseDir, + dataDir: filepath.Join(baseDir, "data"), + host: "127.0.0.1", + envOverrides: envOverrides, + } + s.confDir = filepath.Join(s.dataDir, "conf") + s.logPath = filepath.Join(s.dataDir, "log", "log.txt") + for _, d := range []string{s.confDir, filepath.Dir(s.logPath)} { + if err := os.MkdirAll(d, 0o755); err != nil { + os.RemoveAll(baseDir) + t.Fatalf("sidecar mkdir %s: %v", d, err) + } + } + copyMimeTypes(jarPath, s.confDir) + if err := s.discoverPorts(); err != nil { + os.RemoveAll(baseDir) + t.Fatalf("sidecar ports: %v", err) + } + if err := s.start(); err != nil { + log := s.tailLog(4000) + s.stop() + t.Fatalf("sidecar start: %v\n--- QuestDB log tail ---\n%s", err, log) + } + t.Cleanup(s.stop) + return s +} + +// findJava mirrors fixture.py:_find_java — prefer $JAVA_HOME/bin/java, +// fall back to PATH. 
func findJava() (string, error) {
	if home := strings.TrimSpace(os.Getenv("JAVA_HOME")); home != "" {
		candidate := filepath.Join(home, "bin", "java")
		if fi, err := os.Stat(candidate); err == nil && !fi.IsDir() {
			return candidate, nil
		}
	}
	return exec.LookPath("java")
}

// findQuestDBJar mirrors fixture.py:install_questdb_from_repo's jar
// discovery (core/target/**/questdb*-SNAPSHOT.jar), plus direct QDB_JAR.
//
// Resolution order: QDB_JAR (must name an existing non-directory, else
// an error — no fallback), then QDB_REPO, then the sibling checkouts
// ../questdb and ../../questdb relative to the working directory.
//
// The returned path is always absolute. The fixture launches the JVM
// with its working directory set to the data dir, so a relative jar
// path that resolves from the test's working directory would not
// resolve for the child process. An already-absolute QDB_JAR is
// returned unchanged.
func findQuestDBJar() (string, error) {
	if jar := strings.TrimSpace(os.Getenv("QDB_JAR")); jar != "" {
		fi, err := os.Stat(jar)
		if err != nil || fi.IsDir() {
			return "", fmt.Errorf("QDB_JAR=%q does not exist", jar)
		}
		if filepath.IsAbs(jar) {
			return jar, nil
		}
		abs, err := filepath.Abs(jar)
		if err != nil {
			return "", fmt.Errorf("QDB_JAR=%q: %w", jar, err)
		}
		return abs, nil
	}

	var repos []string
	if r := strings.TrimSpace(os.Getenv("QDB_REPO")); r != "" {
		repos = append(repos, r)
	}
	// Sibling checkouts relative to the test working directory (the
	// package dir, e.g. .../go-questdb-client).
	repos = append(repos, filepath.Join("..", "questdb"), filepath.Join("..", "..", "questdb"))

	for _, repo := range repos {
		jar := pickNewestServerJar(filepath.Join(repo, "core", "target"))
		if jar == "" {
			continue
		}
		// Surface an Abs failure instead of returning ("", nil), which
		// callers would treat as a successfully resolved jar.
		abs, err := filepath.Abs(jar)
		if err != nil {
			return "", fmt.Errorf("resolve %q: %w", jar, err)
		}
		return abs, nil
	}
	return "", errors.New("questdb-*-SNAPSHOT.jar not found")
}

// isServerJar matches the QuestDB server jar and rejects the sibling
// -tests / -sources / -javadoc jars (the glob "questdb*-SNAPSHOT.jar"
// already excludes "-SNAPSHOT-tests.jar" by suffix, but be explicit).
// name is a bare file name, not a path.
func isServerJar(name string) bool {
	if !strings.HasPrefix(name, "questdb") || !strings.HasSuffix(name, "-SNAPSHOT.jar") {
		return false
	}
	for _, bad := range []string{"-tests", "-sources", "-javadoc"} {
		if strings.Contains(name, bad) {
			return false
		}
	}
	return true
}

// pickNewestServerJar returns the most recently modified server jar
// under dir (glob + a deeper walk for nested layouts), or "". Newest
// wins so a stale jar from an older build/version never shadows a fresh
// one — CI clones fresh so there is only one, but local dev trees
// accumulate multiple SNAPSHOT versions. On equal modification times
// the first candidate visited is kept. A missing or unreadable dir
// yields "".
func pickNewestServerJar(dir string) string {
	seen := map[string]struct{}{}
	var best string
	var bestMod time.Time
	consider := func(path string) {
		// The walk below revisits everything the glob matched; dedup so
		// each path is stat'ed once.
		if _, dup := seen[path]; dup {
			return
		}
		seen[path] = struct{}{}
		if !isServerJar(filepath.Base(path)) {
			return
		}
		fi, err := os.Stat(path)
		if err != nil || fi.IsDir() {
			return
		}
		if best == "" || fi.ModTime().After(bestMod) {
			best, bestMod = path, fi.ModTime()
		}
	}
	// Glob only fails on a malformed pattern, which this constant
	// pattern is not; an empty match list is the normal "nothing here".
	matches, _ := filepath.Glob(filepath.Join(dir, "questdb*-SNAPSHOT.jar"))
	for _, m := range matches {
		consider(m)
	}
	// Walk errors are deliberately swallowed: discovery is best-effort
	// and an unreadable subtree must not abort the search. The err check
	// comes first because d is nil when the root itself cannot be read.
	_ = filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return nil
		}
		if !d.IsDir() {
			consider(path)
		}
		return nil
	})
	return best
}

// copyMimeTypes copies classes/io/questdb/site/conf/mime.types (located
// next to the jar) into destConfDir. Strictly best-effort: a missing
// source, an uncreatable destination, or a failed copy is ignored.
func copyMimeTypes(jarPath, destConfDir string) {
	src := filepath.Join(filepath.Dir(jarPath), "classes", "io", "questdb", "site", "conf", "mime.types")
	in, err := os.Open(src)
	if err != nil {
		return
	}
	defer in.Close()
	out, err := os.Create(filepath.Join(destConfDir, "mime.types"))
	if err != nil {
		return
	}
	defer out.Close()
	// Copy errors are ignored on purpose (see the function comment).
	_, _ = io.Copy(out, in)
}

// discoverPorts grabs three free TCP ports for http / line.tcp / pg. Same
// bind-then-close hack as fixture.py:discover_avail_ports — racy but fine
// for tests, and reused verbatim across a Bounce so the server rebinds
// the same ports.
+func (s *qwpFuzzServer) discoverPorts() error { + ports := make([]int, 0, 3) + listeners := make([]net.Listener, 0, 3) + for i := 0; i < 3; i++ { + l, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + for _, x := range listeners { + x.Close() + } + return fmt.Errorf("discover ports: %w", err) + } + listeners = append(listeners, l) + ports = append(ports, l.Addr().(*net.TCPAddr).Port) + } + for _, l := range listeners { + l.Close() + } + s.httpPort, s.lineTCPort, s.pgPort = ports[0], ports[1], ports[2] + return nil +} + +// serverConf mirrors fixture.py's generated server.conf for the non-auth, +// non-UDP fuzz path. QWP-over-WebSocket rides the HTTP port with no extra +// config. +func (s *qwpFuzzServer) serverConf() string { + return fmt.Sprintf(`http.bind.to=0.0.0.0:%d +line.tcp.net.bind.to=0.0.0.0:%d +pg.net.bind.to=0.0.0.0:%d +http.min.enabled=false +line.udp.enabled=false +qwp.udp.enabled=false +line.tcp.maintenance.job.interval=100 +line.tcp.min.idle.ms.before.writer.release=300 +telemetry.enabled=false +cairo.commit.lag=100 +cairo.writer.data.append.page.size=64k +cairo.writer.data.index.value.append.page.size=64k +line.tcp.commit.interval.fraction=0.1 +`, s.httpPort, s.lineTCPort, s.pgPort) +} + +// start writes the config and launches the JVM, blocking until /ping +// answers 204 or the process dies / times out. Idempotent: if a JVM is +// already managed by this fixture, returns nil immediately (so a +// defensive t.Cleanup(start) is safe regardless of test state). 
+func (s *qwpFuzzServer) start() error { + if !s.owns { + return nil + } + s.mu.Lock() + already := s.cmd != nil + s.mu.Unlock() + if already { + return nil + } + if err := os.WriteFile(filepath.Join(s.confDir, "server.conf"), []byte(s.serverConf()), 0o644); err != nil { + return fmt.Errorf("write server.conf: %w", err) + } + f, err := os.OpenFile(s.logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o644) + if err != nil { + return fmt.Errorf("open log: %w", err) + } + + // Verbatim from fixture.py:launch_args. "-Dnoebug" is QuestDB's own + // (deliberately misspelled) debug-off switch — do not "fix" it. + cmd := exec.Command(s.javaPath, + "-DQuestDB-Runtime-0", + "-ea", + "-Dnoebug", + "-XX:+UnlockExperimentalVMOptions", + "-XX:+AlwaysPreTouch", + "-p", s.jarPath, + "-m", "io.questdb/io.questdb.ServerMain", + "-d", s.dataDir, + ) + cmd.Dir = s.dataDir + cmd.Stdout = f + cmd.Stderr = f + if len(s.envOverrides) > 0 { + // Strip any pre-existing values for the override keys before + // appending ours. POSIX leaves the behaviour of duplicate names + // in execve's envp unspecified, and getenv() in some libc + // implementations returns the FIRST entry — so an inherited + // QDB_=... would silently win over our override. Dedup is + // load-bearing for correctness, not stylistic. 
+ cmd.Env = make([]string, 0, len(os.Environ())+len(s.envOverrides)) + for _, kv := range os.Environ() { + eq := strings.IndexByte(kv, '=') + if eq < 0 { + cmd.Env = append(cmd.Env, kv) + continue + } + if _, override := s.envOverrides[kv[:eq]]; override { + continue + } + cmd.Env = append(cmd.Env, kv) + } + for k, v := range s.envOverrides { + cmd.Env = append(cmd.Env, k+"="+v) + } + } + + s.mu.Lock() + if err := cmd.Start(); err != nil { + s.mu.Unlock() + f.Close() + return fmt.Errorf("start java: %w", err) + } + s.cmd = cmd + s.logFile = f + s.waitCh = make(chan struct{}) + waitCh := s.waitCh + s.mu.Unlock() + + go func() { + err := cmd.Wait() + s.mu.Lock() + s.waitErr = err + s.mu.Unlock() + close(waitCh) + }() + + // Make the server launch visible in CI logs (the point of the + // qwp-fuzz job is that it actually starts a server — a silent skip + // would be a false green). + fmt.Fprintf(os.Stderr, "[qwp-fuzz] launched QuestDB pid=%d jar=%s http=127.0.0.1:%d; waiting for /ping\n", + cmd.Process.Pid, s.jarPath, s.httpPort) + + deadline := time.Now().Add(fuzzServerStartTimeout) + for { + select { + case <-waitCh: + return fmt.Errorf("QuestDB exited during startup: %v", s.waitErr) + default: + } + if s.pingOK() { + fmt.Fprintf(os.Stderr, "[qwp-fuzz] QuestDB ready on 127.0.0.1:%d\n", s.httpPort) + return nil + } + if time.Now().After(deadline) { + return fmt.Errorf("timed out after %s waiting for QuestDB /ping", fuzzServerStartTimeout) + } + time.Sleep(fuzzServerPingPeriod) + } +} + +// waitHTTPReady is the external-mode (QDB_FUZZ_ADDR) readiness probe. 
+func (s *qwpFuzzServer) waitHTTPReady(timeout time.Duration) error { + deadline := time.Now().Add(timeout) + for { + if s.pingOK() { + return nil + } + if time.Now().After(deadline) { + return fmt.Errorf("timed out after %s waiting for /ping", timeout) + } + time.Sleep(fuzzServerPingPeriod) + } +} + +func (s *qwpFuzzServer) pingOK() bool { + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + req, err := http.NewRequestWithContext(ctx, http.MethodGet, + fmt.Sprintf("http://%s:%d/ping", s.host, s.httpPort), nil) + if err != nil { + return false + } + resp, err := http.DefaultClient.Do(req) + if err != nil { + return false + } + _ = resp.Body.Close() + return resp.StatusCode == http.StatusNoContent +} + +// pause stops the JVM (SIGTERM with kill fallback) without touching the +// data directory or discovered ports — a subsequent start() boots a +// fresh JVM that adopts the same dataDir and rebinds the same ports. +// Idempotent and a no-op in QDB_FUZZ_ADDR mode. Underlies bounce() (the +// bouncer primitive for the ingress-oracle bounce-torture and +// restart-replay ports) and stop() (which additionally rm's the data +// dir on teardown), and is the primitive the ingress-oracle +// async-connect port calls to arrange a "server not listening yet" +// state. +func (s *qwpFuzzServer) pause() { + if !s.owns { + return + } + s.mu.Lock() + cmd, waitCh, logFile := s.cmd, s.waitCh, s.logFile + s.cmd, s.waitCh, s.logFile = nil, nil, nil + s.mu.Unlock() + + if cmd != nil && cmd.Process != nil { + _ = cmd.Process.Signal(syscall.SIGTERM) + select { + case <-waitCh: + case <-time.After(fuzzServerStopTimeout): + _ = cmd.Process.Kill() + <-waitCh + } + } + if logFile != nil { + logFile.Close() + } +} + +// kill abruptly terminates the JVM with SIGKILL — no graceful shutdown +// hooks run, no in-flight WS ACKs leave the worker pool, and the OS +// tears down listening + accepted sockets via RST. 
The abrupt +// counterpart to pause()'s SIGTERM. Required by the restart-replay +// tests in qwp_ingress_server_restart_fuzz_test.go that need to leave +// the client's SF disk holding genuinely-unacked frames across Close: +// SIGTERM lets the JVM's shutdown hooks ack everything before exit, +// which empirically full-drains those tests' 500-5000-row batches +// every time and skips the "frames stay on disk through close" code +// path. Idempotent and a no-op in QDB_FUZZ_ADDR mode. +func (s *qwpFuzzServer) kill() { + if !s.owns { + return + } + s.mu.Lock() + cmd, waitCh, logFile := s.cmd, s.waitCh, s.logFile + s.cmd, s.waitCh, s.logFile = nil, nil, nil + s.mu.Unlock() + + if cmd != nil && cmd.Process != nil { + _ = cmd.Process.Kill() + <-waitCh + } + if logFile != nil { + logFile.Close() + } +} + +// stop terminates the JVM (via pause()) and removes the temp data dir. +// Called once at TestMain teardown; not re-entry-safe with start(). +func (s *qwpFuzzServer) stop() { + s.pause() + if s.owns && s.baseDir != "" { + os.RemoveAll(s.baseDir) + } +} + +// bounce restarts the server on the same ports and data dir, exercising +// the client's reconnect/replay path: SIGTERM, ~500ms down, then a fresh +// JVM rebinds the identical ports and dataDir (no data loss — only stop() +// removes baseDir). Consumed by the ingress-oracle bounce-torture test. +// Returns an error in QDB_FUZZ_ADDR mode (the fixture does not own that +// process, so bounce-dependent tests skip themselves). +func (s *qwpFuzzServer) bounce() error { + if !s.owns { + return errors.New("cannot bounce a server in QDB_FUZZ_ADDR mode") + } + s.pause() + // Brief settle so the OS can release the listening sockets before + // the new JVM rebinds the same ports (fixture.py BounceThread does + // the same with a short randomized sleep). 
+ time.Sleep(500 * time.Millisecond) + return s.start() +} + +func (s *qwpFuzzServer) tailLog(n int) string { + if s.logPath == "" { + return "(no log)" + } + b, err := os.ReadFile(s.logPath) + if err != nil { + return fmt.Sprintf("(log unreadable: %v)", err) + } + if len(b) > n { + b = b[len(b)-n:] + } + return string(b) +} + +// connConf is the QWP connection string for senders / query clients. +func (s *qwpFuzzServer) connConf() string { + return fmt.Sprintf("ws::addr=%s:%d;", s.host, s.httpPort) +} + +// wsAddr is the host:port for QWP senders that assemble their own +// connection string (sf_dir / reconnect / auto_flush tuning) instead of +// using connConf — used by the ingress-oracle bounce-torture test. +func (s *qwpFuzzServer) wsAddr() string { + return fmt.Sprintf("%s:%d", s.host, s.httpPort) +} + +// execSQL runs SQL via the HTTP /exec endpoint (used for DDL/DML setup +// and oracle read-back), returning the parsed result or the server's +// error message. +func (s *qwpFuzzServer) execSQL(sql string) (qwpTableResult, error) { + u, err := url.Parse(fmt.Sprintf("http://%s:%d/exec", s.host, s.httpPort)) + if err != nil { + return qwpTableResult{}, err + } + q := url.Values{} + q.Set("query", sql) + u.RawQuery = q.Encode() + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil) + if err != nil { + return qwpTableResult{}, err + } + resp, err := http.DefaultClient.Do(req) + if err != nil { + return qwpTableResult{}, err + } + defer resp.Body.Close() + body, err := io.ReadAll(resp.Body) + if err != nil { + return qwpTableResult{}, err + } + var withErr struct { + Error string `json:"error"` + } + if json.Unmarshal(body, &withErr) == nil && withErr.Error != "" { + return qwpTableResult{}, fmt.Errorf("server error: %s (sql=%q)", withErr.Error, sql) + } + var result qwpTableResult + if err := json.Unmarshal(body, &result); err != nil { + return 
qwpTableResult{}, fmt.Errorf("parse /exec response: %w (body=%s)", err, string(body)) + } + return result, nil +} + +func (s *qwpFuzzServer) mustExec(t *testing.T, sql string) qwpTableResult { + t.Helper() + r, err := s.execSQL(sql) + if err != nil { + t.Fatalf("execSQL: %v", err) + } + return r +} + +// dropAllTables clears the database (the _fuzz_loop.py model: one +// long-lived server, drop-all between tests). Consumed by the sender +// fuzz port, which auto-creates tables on first write and relies on +// a clean slate per test. +func (s *qwpFuzzServer) dropAllTables(t *testing.T) { + t.Helper() + res, err := s.execSQL("SHOW TABLES") + if err != nil { + t.Fatalf("SHOW TABLES: %v", err) + } + for _, row := range res.Dataset { + if len(row) == 0 { + continue + } + name, ok := row[0].(string) + if !ok { + continue + } + if _, err := s.execSQL("DROP TABLE IF EXISTS '" + name + "'"); err != nil { + t.Logf("warning: drop table %q: %v", name, err) + } + } +} + +// awaitRows polls until `table` has at least `want` rows or the deadline +// passes. Replaces the Java tests' in-process engine.awaitTable / WAL +// drain, which a network client cannot do. The last execSQL error (if +// any) is surfaced in the timeout message so "server unreachable the +// whole window" is distinguishable from "WAL never caught up". 
+func (s *qwpFuzzServer) awaitRows(t *testing.T, table string, want int, timeout time.Duration) { + t.Helper() + deadline := time.Now().Add(timeout) + q := fmt.Sprintf("SELECT count() FROM '%s'", table) + var lastN int64 + var lastErr error + for { + res, err := s.execSQL(q) + if err != nil { + lastErr = err + } else if len(res.Dataset) == 1 && len(res.Dataset[0]) == 1 { + if n, ok := toInt64(res.Dataset[0][0]); ok { + lastN = n + if n >= int64(want) { + return + } + } + } + if time.Now().After(deadline) { + t.Fatalf("timeout: table %q reached %d / %d rows within %s (last execSQL err: %v)", + table, lastN, want, timeout, lastErr) + } + time.Sleep(100 * time.Millisecond) + } +} + +// toInt64 coerces a JSON-decoded numeric (float64 / json.Number / string) +// to int64. +func toInt64(v interface{}) (int64, bool) { + switch x := v.(type) { + case float64: + return int64(x), true + case json.Number: + n, err := x.Int64() + return n, err == nil + case string: + n, err := strconv.ParseInt(x, 10, 64) + return n, err == nil + default: + return 0, false + } +} diff --git a/qwp_fuzz_seed_test.go b/qwp_fuzz_seed_test.go new file mode 100644 index 00000000..139d38fe --- /dev/null +++ b/qwp_fuzz_seed_test.go @@ -0,0 +1,58 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +// Shared seeded-RNG helper for the QWP fuzz tests. Kept in its own +// build-tag-free file so both the server-bound fuzz tests (which carry +// //go:build !windows for graceful server teardown) and the pure, +// server-free decoder fuzz (which must run on every platform under the +// normal `go test ./...`) can use it. + +import ( + "math/rand" + "os" + "strconv" + "testing" + "time" +) + +// newFuzzRand builds a reproducible RNG. QWP_FUZZ_SEED pins the seed for +// replaying a failure; otherwise it is clock-derived and logged so a +// failing run is always reproducible. 
+func newFuzzRand(t *testing.T) *rand.Rand { + t.Helper() + var seed int64 + if s := os.Getenv("QWP_FUZZ_SEED"); s != "" { + v, err := strconv.ParseInt(s, 10, 64) + if err != nil { + t.Fatalf("QWP_FUZZ_SEED=%q: %v", s, err) + } + seed = v + } else { + seed = time.Now().UnixNano() + } + t.Logf("QWP_FUZZ_SEED=%d (set this env var to reproduce)", seed) + return rand.New(rand.NewSource(seed)) +} diff --git a/qwp_gorilla_decoder.go b/qwp_gorilla_decoder.go new file mode 100644 index 00000000..890f919a --- /dev/null +++ b/qwp_gorilla_decoder.go @@ -0,0 +1,264 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import "encoding/binary" + +// qwpBitReader reads bits LSB-first from a byte slice using a 64-bit +// accumulator. It is the inverse of qwpBitWriter in qwp_gorilla.go and +// is used by qwpGorillaDecoder to consume the delta-of-delta bitstream +// emitted by the encoder. 
+// +// Refills go through a single 8-byte little-endian load whenever the +// source has 8 bytes available, falling back to a byte-by-byte tail for +// the last <8 bytes of the buffer. The Gorilla DoD path issues several +// single-bit reads followed by a wide signed payload per row; once the +// accumulator is loaded, all reads up to its 64-bit capacity hit the +// fast path (a single shift+mask) without touching the source slice. +// +// Error model: every read returns *qwpDecodeError (via +// newQwpDecodeError) when the underlying byte slice is exhausted before +// the requested bits are available. The decoder caller bubbles these up +// as a decode failure on the enclosing RESULT_BATCH frame. +type qwpBitReader struct { + data []byte + bitBuffer uint64 // accumulator; bits are LSB-aligned, count is bitsAvail + bitsAvail int // bits currently held in bitBuffer; in [0, 64] + pos int // index of the next byte to load from data + bitsRead int64 // total bits consumed since reset +} + +// reset rebinds the reader to a new byte slice and zeroes all residual +// state. Safe to call before every column decode so leftovers from a +// prior column never bleed in. +func (r *qwpBitReader) reset(data []byte) { + r.data = data + r.bitBuffer = 0 + r.bitsAvail = 0 + r.pos = 0 + r.bitsRead = 0 +} + +// bytesConsumed returns ceil(bitsRead / 8) — the byte count of the +// bitstream region read so far, rounded up to the next byte boundary. +// Matches the encoder's byte-aligned output (qwpBitWriter.finish +// always pads trailing bits with zeros to a full byte). The 8-byte +// fast-path refill may speculatively load bytes beyond the bits the +// caller actually consumes; bytesConsumed reflects bits read, not +// bytes loaded, so it remains a faithful cursor for the outer reader. +func (r *qwpBitReader) bytesConsumed() int { return int((r.bitsRead + 7) >> 3) } + +// readBit reads a single bit, LSB-first within each source byte. 
+// Specialised so the hot Gorilla prefix-decoding path stays inlinable +// when the accumulator is already populated — the common case after +// the first refill. +func (r *qwpBitReader) readBit() (uint64, error) { + if r.bitsAvail >= 1 { + bit := r.bitBuffer & 1 + r.bitBuffer >>= 1 + r.bitsAvail-- + r.bitsRead++ + return bit, nil + } + return r.readBits(1) +} + +// readBits reads the low n bits of the stream and returns them +// LSB-aligned in a uint64. n must be in [0, 64]. n == 0 returns 0 +// without consuming any bits, matching the Java QwpBitReader contract +// — callers in the decoder occasionally pass a width derived from a +// runtime computation and rely on the zero case being a no-op. +// +// Mask construction is branchless via `^uint64(0) >> (64 - n)`: for n +// in [1, 64] the shift count is in [0, 63] and the result is the +// expected n-bit mask, with no n == 64 special case (`uint64(1) << 64` +// is 0 in Go, which would make the obvious `(1 << n) - 1` form wrong). +// The accumulator drain uses the same idea via two chained shifts so +// the inner shift count is always in [0, 63] and Go does not have to +// emit a runtime guard for shift-by-width. +func (r *qwpBitReader) readBits(n int) (uint64, error) { + if n == 0 { + return 0, nil + } + if n < 0 || n > 64 { + return 0, newQwpDecodeError("bit count out of range") + } + if r.bitsAvail >= n { + mask := ^uint64(0) >> (64 - n) + result := r.bitBuffer & mask + r.bitBuffer = (r.bitBuffer >> 1) >> (n - 1) + r.bitsAvail -= n + r.bitsRead += int64(n) + return result, nil + } + return r.readBitsSlow(n) +} + +// readBitsSlow is the cold path for reads that span a refill. Each +// iteration drains whatever's already in the accumulator, then refills +// — preferring an 8-byte LE load when 8 source bytes are available, +// falling back to a 1-byte load for the tail. 
Multi-iteration logic is +// only exercised when n exceeds the bits already buffered (which, after +// the first refill, can be at most one extra iteration since a single +// 64-bit accumulator load satisfies any n <= 64). +// +// Mask construction and accumulator drain use the same branchless +// idioms as readBits — `take` is in [1, 64] inside the loop body, so +// `^uint64(0) >> (64 - take)` and `(buf >> 1) >> (take - 1)` both have +// shift counts in [0, 63] and need no runtime guard. +func (r *qwpBitReader) readBitsSlow(n int) (uint64, error) { + var result uint64 + shift := 0 + remaining := n + for remaining > 0 { + if r.bitsAvail == 0 { + if r.pos+8 <= len(r.data) { + r.bitBuffer = binary.LittleEndian.Uint64(r.data[r.pos:]) + r.pos += 8 + r.bitsAvail = 64 + } else if r.pos < len(r.data) { + r.bitBuffer = uint64(r.data[r.pos]) + r.pos++ + r.bitsAvail = 8 + } else { + return 0, newQwpDecodeError("bit read past end of buffer") + } + } + take := remaining + if take > r.bitsAvail { + take = r.bitsAvail + } + mask := ^uint64(0) >> (64 - take) + result |= (r.bitBuffer & mask) << shift + r.bitBuffer = (r.bitBuffer >> 1) >> (take - 1) + r.bitsAvail -= take + shift += take + remaining -= take + } + r.bitsRead += int64(n) + return result, nil +} + +// readSigned reads n bits as a two's complement signed integer, +// sign-extending bit n-1 into the rest of the result. +func (r *qwpBitReader) readSigned(n int) (int64, error) { + u, err := r.readBits(n) + if err != nil { + return 0, err + } + if n < 64 && u&(uint64(1)<<(n-1)) != 0 { + u |= ^uint64(0) << n + } + return int64(u), nil +} + +// qwpGorillaDecoder reverses qwpGorillaEncoder: it consumes a delta-of- +// delta bitstream (without the two leading raw timestamps — the caller +// reads those out of band and passes them to reset) and yields one +// int64 timestamp per decodeNext call. +// +// Mirror of the Java QwpGorillaDecoder. 
Buckets and prefix patterns: +// +// "0" → DoD = 0 (1 bit) +// "10" + s7 → DoD in [-64, 63] (9 bits) +// "110" + s9 → DoD in [-256, 255] (12 bits) +// "1110"+ s12 → DoD in [-2048, 2047] (16 bits) +// "1111"+ s32 → any other DoD (36 bits) +// +// Prefix bits are read LSB-first, so the encoder's 0b01 for "10" is +// observed here as readBit=1 then readBit=0 — the leading 1 falls past +// the "b==0 → DoD=0" check, and the trailing 0 selects the 7-bit +// signed payload for the "10" bucket. +type qwpGorillaDecoder struct { + br qwpBitReader + prevTs int64 + prevDelta int64 +} + +// reset seeds the decoder with the two leading timestamps (read by the +// caller from the uncompressed prefix of the column's wire bytes) and +// the bitstream that follows them. After reset, the caller invokes +// decodeNext exactly nonNull-2 times; the first two timestamps are +// already known and returned outside this decoder. +func (d *qwpGorillaDecoder) reset(firstTs, secondTs int64, bitstream []byte) { + d.prevTs = secondTs + d.prevDelta = secondTs - firstTs + d.br.reset(bitstream) +} + +// decodeNext decodes one timestamp and advances the decoder's rolling +// state (prevTs, prevDelta). Errors bubble up as *qwpDecodeError from +// qwpBitReader when the bitstream is truncated. +func (d *qwpGorillaDecoder) decodeNext() (int64, error) { + dod, err := d.decodeDoD() + if err != nil { + return 0, err + } + delta := d.prevDelta + dod + ts := d.prevTs + delta + d.prevDelta = delta + d.prevTs = ts + return ts, nil +} + +// bytesConsumed proxies the underlying bit reader's byte accounting. +// Used by the RESULT_BATCH column parser to advance the outer byte +// reader past the bitstream region once decoding finishes. +func (d *qwpGorillaDecoder) bytesConsumed() int { return d.br.bytesConsumed() } + +// decodeDoD walks the bucket prefix tree. 
Each successive readBit +// refines the bucket; once a 0 bit or the all-ones path terminates the +// prefix, the remaining signed payload is read and returned. +func (d *qwpGorillaDecoder) decodeDoD() (int64, error) { + b, err := d.br.readBit() + if err != nil { + return 0, err + } + if b == 0 { + return 0, nil + } + b, err = d.br.readBit() + if err != nil { + return 0, err + } + if b == 0 { + return d.br.readSigned(7) + } + b, err = d.br.readBit() + if err != nil { + return 0, err + } + if b == 0 { + return d.br.readSigned(9) + } + b, err = d.br.readBit() + if err != nil { + return 0, err + } + if b == 0 { + return d.br.readSigned(12) + } + return d.br.readSigned(32) +} diff --git a/qwp_gorilla_decoder_test.go b/qwp_gorilla_decoder_test.go new file mode 100644 index 00000000..50d4b958 --- /dev/null +++ b/qwp_gorilla_decoder_test.go @@ -0,0 +1,614 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ******************************************************************************/ + +package questdb + +import ( + "encoding/binary" + "errors" + "math/rand" + "testing" +) + +// --- qwpBitReader --- + +func TestQwpBitReaderLSBFirstRoundTrip(t *testing.T) { + // Inverse of TestQwpBitWriterLSBFirst: write bits 1,0,1,1 — the + // writer packs them LSB-first into byte 0x0D. Reading them back in + // the same order must return the same sequence. + var wb qwpWireBuffer + var bw qwpBitWriter + bw.reset(&wb) + bw.writeBit(1) + bw.writeBit(0) + bw.writeBit(1) + bw.writeBit(1) + bw.finish() + + var br qwpBitReader + br.reset(wb.bytes()) + for i, want := range []uint64{1, 0, 1, 1} { + got, err := br.readBit() + if err != nil { + t.Fatalf("readBit[%d]: %v", i, err) + } + if got != want { + t.Fatalf("readBit[%d] = %d, want %d", i, got, want) + } + } +} + +func TestQwpBitReaderSpanningBytes(t *testing.T) { + // Mirror of TestQwpBitWriterSpanningBytes: writer emits 0xABC in + // 12 bits across bytes [0xBC, 0x0A]. Reading 12 bits must return + // the same value. + var wb qwpWireBuffer + var bw qwpBitWriter + bw.reset(&wb) + bw.writeBits(0xABC, 12) + bw.finish() + + var br qwpBitReader + br.reset(wb.bytes()) + got, err := br.readBits(12) + if err != nil { + t.Fatalf("readBits: %v", err) + } + if got != 0xABC { + t.Fatalf("readBits(12) = %#X, want 0xABC", got) + } +} + +func TestQwpBitReaderSignedSignExtension(t *testing.T) { + // 7-bit field with bit 6 set must read back as -1. Encode it LSB- + // first as 0x7F (seven ones). + var wb qwpWireBuffer + var bw qwpBitWriter + bw.reset(&wb) + bw.writeSigned(-1, 7) + bw.finish() + var br qwpBitReader + br.reset(wb.bytes()) + got, err := br.readSigned(7) + if err != nil { + t.Fatalf("readSigned(7): %v", err) + } + if got != -1 { + t.Fatalf("readSigned(7) = %d, want -1", got) + } + + // 12-bit field with only bit 11 set = 0x800 — most-negative value + // -2048. 
+ wb.reset() + bw.reset(&wb) + bw.writeSigned(-2048, 12) + bw.finish() + br.reset(wb.bytes()) + got, err = br.readSigned(12) + if err != nil { + t.Fatalf("readSigned(12): %v", err) + } + if got != -2048 { + t.Fatalf("readSigned(12) = %d, want -2048", got) + } +} + +func TestQwpBitReaderTruncated(t *testing.T) { + // One byte supplied, read 16 bits → error. + var br qwpBitReader + br.reset([]byte{0xFF}) + _, err := br.readBits(16) + if err == nil { + t.Fatalf("expected error reading past end") + } + var de *qwpDecodeError + if !errors.As(err, &de) { + t.Fatalf("expected *qwpDecodeError, got %T", err) + } +} + +func TestQwpBitReaderOutOfRangeBitCount(t *testing.T) { + // Guard against n<0 and n>64 — caller bugs would otherwise return + // garbage (mask computation relies on 1 <= n <= 64). n=0 is a + // no-op success path (matches Java contract); see + // TestQwpBitReaderReadBitsZeroIsNoop. + var br qwpBitReader + br.reset([]byte{0xFF}) + for _, n := range []int{-1, 65, 100} { + _, err := br.readBits(n) + if err == nil { + t.Fatalf("readBits(%d) should error", n) + } + } +} + +func TestQwpBitReaderReadBitsZeroIsNoop(t *testing.T) { + // Mirror of Java's testReadBitsZeroBitsReturnsZeroWithoutAdvancing: + // a zero-width read must yield 0 and leave the bit position + // unchanged so the next read still sees byte 0 intact. + var br qwpBitReader + br.reset([]byte{0xFF}) + got, err := br.readBits(0) + if err != nil { + t.Fatalf("readBits(0): %v", err) + } + if got != 0 { + t.Fatalf("readBits(0) = %d, want 0", got) + } + if br.bitsRead != 0 { + t.Fatalf("bitsRead after readBits(0) = %d, want 0", br.bitsRead) + } + if bit, err := br.readBit(); err != nil || bit != 1 { + t.Fatalf("readBit after readBits(0) = (%d, %v), want (1, nil)", bit, err) + } +} + +func TestQwpBitReaderReadBits64FullWord(t *testing.T) { + // 64-bit read must use the branchless mask path (^uint64(0) for + // n=64) and reproduce the input verbatim. Mirror of Java's + // testReadBits64ReadsFullWord. 
+ value := uint64(0x0123456789ABCDEF) + src := make([]byte, 8) + binary.LittleEndian.PutUint64(src, value) + + var br qwpBitReader + br.reset(src) + got, err := br.readBits(64) + if err != nil { + t.Fatalf("readBits(64): %v", err) + } + if got != value { + t.Fatalf("readBits(64) = %#x, want %#x", got, value) + } + if br.bitsRead != 64 { + t.Fatalf("bitsRead = %d, want 64", br.bitsRead) + } +} + +func TestQwpBitReaderReadBits64TwiceDoesNotLeakStaleBuffer(t *testing.T) { + // Regression: a full-width readBits(64) must clear the + // accumulator so the next read sees a clean slate. Java's + // `bitBuffer >>>= 64` is a no-op (shift mod 64 == 0), which the + // chained-shift form `bitBuffer >>> 1 >>> 63` avoids. Go's `>>` + // does not wrap the shift count (a uint64 shifted by 64 is + // already 0), so this test guards the port against + // reintroducing the Java behaviour, e.g. by masking the count. + // Two disjoint halves (8 bytes + // of 0xFF then 8 of 0x00) catch the regression: the second 64-bit + // read must be exactly 0, not the OR of stale all-ones with the + // fresh zeros. + src := make([]byte, 16) + for i := 0; i < 8; i++ { + src[i] = 0xFF + } + + var br qwpBitReader + br.reset(src) + + first, err := br.readBits(64) + if err != nil { + t.Fatalf("first readBits(64): %v", err) + } + if first != ^uint64(0) { + t.Fatalf("first readBits(64) = %#x, want %#x", first, ^uint64(0)) + } + if br.bitsRead != 64 { + t.Fatalf("bitsRead after first read = %d, want 64", br.bitsRead) + } + + second, err := br.readBits(64) + if err != nil { + t.Fatalf("second readBits(64): %v", err) + } + if second != 0 { + t.Fatalf("second readBits(64) = %#x, want 0 (stale-buffer regression)", second) + } + if br.bitsRead != 128 { + t.Fatalf("bitsRead after second read = %d, want 128", br.bitsRead) + } +} + +func TestQwpBitReaderArbitraryWidths(t *testing.T) { + // Sequence of mixed widths within and across byte boundaries. + // Source: 0xFF, 0x55, 0xAA, 0x00 (32 bits). + // LSB-first decoding of byte 0xFF: 5 bits = 0b11111 = 0x1F, + // then remaining 3 bits = 0b111 = 0x7. Byte 0x55 read whole + // = 0x55.
Last 16 bits combine 0xAA (low) and 0x00 (high) = + // 0x00AA. Mirrors Java's testReadBitsArbitraryWidths. + src := []byte{0xFF, 0x55, 0xAA, 0x00} + + var br qwpBitReader + br.reset(src) + + got, err := br.readBits(5) + if err != nil || got != 0x1F { + t.Fatalf("readBits(5) = (%#x, %v), want (0x1F, nil)", got, err) + } + got, err = br.readBits(3) + if err != nil || got != 0x7 { + t.Fatalf("readBits(3) = (%#x, %v), want (0x7, nil)", got, err) + } + if br.bitsRead != 8 { + t.Fatalf("bitsRead after byte 0 = %d, want 8", br.bitsRead) + } + got, err = br.readBits(8) + if err != nil || got != 0x55 { + t.Fatalf("readBits(8) = (%#x, %v), want (0x55, nil)", got, err) + } + got, err = br.readBits(16) + if err != nil || got != 0x00AA { + t.Fatalf("readBits(16) = (%#x, %v), want (0x00AA, nil)", got, err) + } + if br.bitsRead != 32 { + t.Fatalf("bitsRead final = %d, want 32", br.bitsRead) + } +} + +func TestQwpBitReaderSpansSlowPathRefills(t *testing.T) { + // 24-bit read must traverse the refill loop in readBitsSlow more + // than once when the accumulator is empty and the source has + // fewer than 8 bytes (forces the byte-by-byte refill branch). + // LSB-first across [0x01, 0x02, 0x03, 0x00] = 0x030201. + src := []byte{0x01, 0x02, 0x03, 0x00} + + var br qwpBitReader + br.reset(src) + got, err := br.readBits(24) + if err != nil { + t.Fatalf("readBits(24): %v", err) + } + if got != 0x030201 { + t.Fatalf("readBits(24) = %#x, want 0x030201", got) + } + if br.bitsRead != 24 { + t.Fatalf("bitsRead = %d, want 24", br.bitsRead) + } +} + +func TestQwpBitReaderMultiRefillAcrossLargeBuffer(t *testing.T) { + // Walk a 16-byte buffer with a sequence of widths summing to + // 128 bits. Each width forces an accumulator refill at a + // different boundary point, and the trailing readBit must + // surface the past-end error. Mirror of Java's + // testReadBitsAcrossLargeRefill. 
+ src := make([]byte, 16) + for i := range src { + src[i] = byte(i) + } + + var br qwpBitReader + br.reset(src) + widths := []int{1, 7, 13, 19, 23, 33, 32} + totalBits := int64(0) + for _, w := range widths { + if _, err := br.readBits(w); err != nil { + t.Fatalf("readBits(%d): %v", w, err) + } + totalBits += int64(w) + if br.bitsRead != totalBits { + t.Fatalf("bitsRead after readBits(%d) = %d, want %d", w, br.bitsRead, totalBits) + } + } + if _, err := br.readBit(); err == nil { + t.Fatalf("readBit after exhausting 128 bits should error") + } +} + +func TestQwpBitReaderSignedDoesNotExtendWhenMsbClear(t *testing.T) { + // 5-bit field with MSB clear: encode +5 (0b00101), read back as + // +5 — sign-extension must NOT fire for MSB=0. Mirrors Java's + // testReadSignedDoesNotExtendWhenMsbClear. + var br qwpBitReader + br.reset([]byte{0b00000101}) + got, err := br.readSigned(5) + if err != nil { + t.Fatalf("readSigned(5): %v", err) + } + if got != 5 { + t.Fatalf("readSigned(5) = %d, want 5", got) + } +} + +func TestQwpBitReaderSigned64BitsBehavesLikeReadBits(t *testing.T) { + // readSigned(64) special-cases the sign-extend so the value + // already occupies the full int64 unchanged. Mirror of Java's + // testReadSigned64BitsBehavesLikeReadBits. + want := int64(-0x0011223344556678) // i.e. 0xFFEEDDCCBBAA9988 + src := make([]byte, 8) + binary.LittleEndian.PutUint64(src, uint64(want)) + + var br qwpBitReader + br.reset(src) + got, err := br.readSigned(64) + if err != nil { + t.Fatalf("readSigned(64): %v", err) + } + if got != want { + t.Fatalf("readSigned(64) = %d, want %d", got, want) + } +} + +func TestQwpBitReaderResetClearsAllState(t *testing.T) { + // After a partial read on buffer 1, reset(buffer2) must reseed + // the position to 0 and force the first read to come from + // buffer2 — not from leftover bits in the accumulator. Mirror of + // Java's testResetClearsAllState. 
+ var br qwpBitReader + br.reset([]byte{0xAB, 0xCD}) + if _, err := br.readBits(10); err != nil { + t.Fatalf("readBits(10): %v", err) + } + if br.bitsRead != 10 { + t.Fatalf("bitsRead after first run = %d, want 10", br.bitsRead) + } + + br.reset([]byte{0x12, 0x34}) + if br.bitsRead != 0 { + t.Fatalf("bitsRead after reset = %d, want 0", br.bitsRead) + } + if br.bitsAvail != 0 || br.bitBuffer != 0 || br.pos != 0 { + t.Fatalf("residual state after reset: bitsAvail=%d bitBuffer=%#x pos=%d", + br.bitsAvail, br.bitBuffer, br.pos) + } + got, err := br.readBits(8) + if err != nil { + t.Fatalf("readBits(8) after reset: %v", err) + } + if got != 0x12 { + t.Fatalf("readBits(8) after reset = %#x, want 0x12", got) + } + if br.bitsRead != 8 { + t.Fatalf("bitsRead = %d, want 8", br.bitsRead) + } +} + +// --- qwpGorillaDecoder --- + +func TestQwpGorillaDecoderBitPositionAfterDecode(t *testing.T) { + // For a constant-delta series (every DoD = 0), each non-prefix + // value contributes exactly 1 bit to the stream. The pre-computed + // encoder size must match the decoder's bytesConsumed(). + ts := []int64{100, 200, 300, 400, 500, 600, 700, 800, 900, 1000} + src := intsToBytes(ts) + preSize := qwpGorillaEncodedSize(src, len(ts)) + + var wb qwpWireBuffer + var enc qwpGorillaEncoder + n := enc.encodeTimestamps(&wb, src, len(ts)) + if n != preSize { + t.Fatalf("encoder size %d != pre-computed %d", n, preSize) + } + + var dec qwpGorillaDecoder + dec.reset(ts[0], ts[1], wb.bytes()[16:]) + for i := 2; i < len(ts); i++ { + if _, err := dec.decodeNext(); err != nil { + t.Fatalf("decodeNext[%d]: %v", i, err) + } + } + // Total stream length after the 16-byte prefix must equal the + // decoder's accounting. 
+ wantTrailer := len(wb.bytes()) - 16 + if got := dec.bytesConsumed(); got != wantTrailer { + t.Fatalf("bytesConsumed = %d, want %d", got, wantTrailer) + } +} + +func TestQwpGorillaDecoderTruncatedBitstream(t *testing.T) { + // Encode a series that needs a wide DoD bucket so the bitstream is + // long enough to lose bytes from. Then chop the final byte and + // decode — at some point the reader must error. + ts := []int64{ + 0, + 1_000_000, + 3_000_000, + 3_000_001, + 3_000_002, + 3_000_003, + } + src := intsToBytes(ts) + var wb qwpWireBuffer + var enc qwpGorillaEncoder + enc.encodeTimestamps(&wb, src, len(ts)) + truncated := wb.bytes()[:len(wb.bytes())-1] + if len(truncated) < 16 { + t.Fatalf("truncated smaller than prefix: %d bytes", len(truncated)) + } + + var dec qwpGorillaDecoder + dec.reset(ts[0], ts[1], truncated[16:]) + var err error + for i := 2; i < len(ts); i++ { + _, err = dec.decodeNext() + if err != nil { + break + } + } + if err == nil { + t.Fatalf("expected error from truncated bitstream") + } + var de *qwpDecodeError + if !errors.As(err, &de) { + t.Fatalf("expected *qwpDecodeError, got %T", err) + } +} + +func TestQwpGorillaDecoderRoundTripAllBuckets(t *testing.T) { + // Drive one roundtrip per DoD bucket to confirm the decoder handles + // every prefix branch. Distinct from the encoder-side boundary + // tests — those only ensure the encoder emits correct bits. This + // test specifically exercises the production decoder's prefix tree. 
+ cases := []struct { + name string + dod int64 + }{ + {"bucket0", 0}, + {"bucket1_pos", 5}, + {"bucket1_neg", -10}, + {"bucket2_pos", 100}, + {"bucket2_neg", -200}, + {"bucket3_pos", 500}, + {"bucket3_neg", -1500}, + {"bucket4_pos", 100_000}, + {"bucket4_neg", -500_000}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + ts := []int64{0, 10_000, 10_000 + 10_000 + c.dod} + src := intsToBytes(ts) + var wb qwpWireBuffer + var enc qwpGorillaEncoder + enc.encodeTimestamps(&wb, src, len(ts)) + + var dec qwpGorillaDecoder + dec.reset(ts[0], ts[1], wb.bytes()[16:]) + got, err := dec.decodeNext() + if err != nil { + t.Fatalf("decodeNext: %v", err) + } + if got != ts[2] { + t.Fatalf("decoded %d, want %d", got, ts[2]) + } + }) + } +} + +func TestQwpGorillaDecoderRoundTripRandom(t *testing.T) { + // Analogue of the encoder-side random round-trip: drives several + // buckets in one bitstream and confirms decoder state threading + // (prevTs, prevDelta) is correct across all of them. + r := rand.New(rand.NewSource(0xBADFACE)) + ts := make([]int64, 256) + cur := int64(0) + delta := int64(1000) + for i := range ts { + delta += int64(r.Intn(2001) - 1000) + cur += delta + ts[i] = cur + } + src := intsToBytes(ts) + var wb qwpWireBuffer + var enc qwpGorillaEncoder + enc.encodeTimestamps(&wb, src, len(ts)) + + var dec qwpGorillaDecoder + dec.reset(ts[0], ts[1], wb.bytes()[16:]) + for i := 2; i < len(ts); i++ { + got, err := dec.decodeNext() + if err != nil { + t.Fatalf("decodeNext[%d]: %v", i, err) + } + if got != ts[i] { + t.Fatalf("ts[%d] = %d, want %d", i, got, ts[i]) + } + } +} + +func TestQwpGorillaDecoderDecodePastEndOfEmptyBitstream(t *testing.T) { + // Reset to a zero-length bitstream and verify decodeNext surfaces + // the bit reader's "past end of buffer" error on the very first + // call. Asking for a value when there are no bytes at all is the + // unambiguous past-end case (a trailing-zero pattern would + // resemble a valid 1-bit "DoD == 0" prefix). 
Mirror of Java's + // testDecodePastEndOfEmptyBitstreamThrows. + var dec qwpGorillaDecoder + dec.reset(0, 100, nil) + _, err := dec.decodeNext() + if err == nil { + t.Fatalf("decodeNext on empty bitstream must error") + } + var de *qwpDecodeError + if !errors.As(err, &de) { + t.Fatalf("expected *qwpDecodeError, got %T: %v", err, err) + } +} + +func TestQwpGorillaDecoderDecodePastEndOfLargeBucketBitstream(t *testing.T) { + // Encode a sequence whose DoDs land in the 36-bit fallback bucket + // (each emitted value consumes a known multi-byte chunk). After + // decoding the encoded values, keep asking for more until the + // past-end check fires. The cap is generous (64 spurious calls) + // — the trailing bit pattern of the last byte determines exactly + // when the reader runs out of payload, so we loop until it does. + // Mirror of Java's testDecodePastEndOfLargeBucketBitstreamThrows. + ts := []int64{1_000_000, 2_000_000, 3_500_000, 7_000_000} + src := intsToBytes(ts) + var wb qwpWireBuffer + var enc qwpGorillaEncoder + enc.encodeTimestamps(&wb, src, len(ts)) + + var dec qwpGorillaDecoder + dec.reset(ts[0], ts[1], wb.bytes()[16:]) + for i := 2; i < len(ts); i++ { + got, err := dec.decodeNext() + if err != nil { + t.Fatalf("decodeNext[%d]: %v", i, err) + } + if got != ts[i] { + t.Fatalf("decodeNext[%d] = %d, want %d", i, got, ts[i]) + } + } + + var seenErr error + for i := 0; i < 64; i++ { + if _, err := dec.decodeNext(); err != nil { + seenErr = err + break + } + } + if seenErr == nil { + t.Fatalf("decodeNext past end must eventually error") + } + var de *qwpDecodeError + if !errors.As(seenErr, &de) { + t.Fatalf("expected *qwpDecodeError past end, got %T: %v", seenErr, seenErr) + } +} + +func TestQwpGorillaDecoderResetClearsResidualState(t *testing.T) { + // After one decode run, a fresh reset must zero the bit buffer, + // bitsAvail, and pos — residual bits from the first stream would + // otherwise prepend garbage to the second decode. 
+ ts1 := []int64{100, 200, 300, 400} + ts2 := []int64{1_000_000, 1_000_005, 1_000_015, 1_000_030} + var wb1, wb2 qwpWireBuffer + var enc qwpGorillaEncoder + enc.encodeTimestamps(&wb1, intsToBytes(ts1), len(ts1)) + enc.encodeTimestamps(&wb2, intsToBytes(ts2), len(ts2)) + + var dec qwpGorillaDecoder + // Run 1: decode one value so the bit buffer is non-empty at exit. + dec.reset(ts1[0], ts1[1], wb1.bytes()[16:]) + if _, err := dec.decodeNext(); err != nil { + t.Fatalf("run1 decodeNext[2]: %v", err) + } + // Run 2: reset to the new bitstream and verify the full sequence. + dec.reset(ts2[0], ts2[1], wb2.bytes()[16:]) + for i := 2; i < len(ts2); i++ { + got, err := dec.decodeNext() + if err != nil { + t.Fatalf("run2 decodeNext[%d]: %v", i, err) + } + if got != ts2[i] { + t.Fatalf("ts2[%d] = %d, want %d", i, got, ts2[i]) + } + } +} diff --git a/qwp_gorilla_test.go b/qwp_gorilla_test.go index a73f8a70..356dbb1f 100644 --- a/qwp_gorilla_test.go +++ b/qwp_gorilla_test.go @@ -360,8 +360,10 @@ func assertRoundTrip(t *testing.T, ts []int64) { } } -// decodeGorilla mirrors QwpGorillaDecoder + QwpBitReader from the Java -// reference. Used only in tests to validate the encoder's output. +// decodeGorilla delegates to the production qwpGorillaDecoder so the +// existing encoder tests double as decoder round-trip coverage. Errors +// from the production decoder are turned into t.Fatalf here because +// the encoder-side tests do not set up hostile inputs.
func decodeGorilla(t *testing.T, data []byte, count int) []int64 { t.Helper() if count == 0 { @@ -384,87 +386,14 @@ func decodeGorilla(t *testing.T, data []byte, count int) []int64 { if count == 2 { return out } - br := &testBitReader{data: data[16:]} - prevTs := ts1 - prevDelta := ts1 - ts0 + var dec qwpGorillaDecoder + dec.reset(ts0, ts1, data[16:]) for i := 2; i < count; i++ { - dod := decodeDoD(t, br) - delta := prevDelta + dod - ts := prevTs + delta + ts, err := dec.decodeNext() + if err != nil { + t.Fatalf("decodeNext at i=%d: %v", i, err) + } out = append(out, ts) - prevDelta = delta - prevTs = ts } return out } - -func decodeDoD(t *testing.T, br *testBitReader) int64 { - t.Helper() - if br.readBit(t) == 0 { - return 0 - } - if br.readBit(t) == 0 { - return br.readSigned(t, 7) - } - if br.readBit(t) == 0 { - return br.readSigned(t, 9) - } - if br.readBit(t) == 0 { - return br.readSigned(t, 12) - } - return br.readSigned(t, 32) -} - -// testBitReader is an LSB-first bit reader matching QwpBitReader. 
-type testBitReader struct { - data []byte - bitBuffer uint64 - bitsAvail int - pos int -} - -func (r *testBitReader) readBit(t *testing.T) uint64 { - t.Helper() - return r.readBits(t, 1) -} - -func (r *testBitReader) readBits(t *testing.T, n int) uint64 { - t.Helper() - var result uint64 - shift := 0 - for n > 0 { - if r.bitsAvail == 0 { - if r.pos >= len(r.data) { - t.Fatalf("bit read overflow") - } - r.bitBuffer = uint64(r.data[r.pos]) - r.pos++ - r.bitsAvail = 8 - } - take := n - if take > r.bitsAvail { - take = r.bitsAvail - } - var mask uint64 - if take == 64 { - mask = ^uint64(0) - } else { - mask = (uint64(1) << take) - 1 - } - result |= (r.bitBuffer & mask) << shift - r.bitBuffer >>= take - r.bitsAvail -= take - shift += take - n -= take - } - return result -} - -func (r *testBitReader) readSigned(t *testing.T, n int) int64 { - t.Helper() - u := r.readBits(t, n) - if n < 64 && u&(uint64(1)<<(n-1)) != 0 { - u |= ^uint64(0) << n - } - return int64(u) -} diff --git a/qwp_host_tracker.go b/qwp_host_tracker.go new file mode 100644 index 00000000..fc8ed30c --- /dev/null +++ b/qwp_host_tracker.go @@ -0,0 +1,460 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "strings" + "sync" +) + +// qwpHostState classifies a host's last-observed connect outcome. +// Lower state-priority values win in PickNext's lexicographic +// comparison (see failover.md §2). +type qwpHostState byte + +const ( + // qwpHostHealthy: last connect to this host succeeded. Priority 1. + qwpHostHealthy qwpHostState = iota + // qwpHostUnknown: never tried this round, or just reset by + // BeginRound(forgetClassifications=true). Priority 2. + qwpHostUnknown + // qwpHostTransientReject: server returned 421 + + // X-QuestDB-Role: PRIMARY_CATCHUP. Likely to recover; priority 3. + qwpHostTransientReject + // qwpHostTransportError: TCP/TLS/handshake error during connect, + // or mid-stream send/recv failure recorded via + // RecordMidStreamFailure. Priority 4. + qwpHostTransportError + // qwpHostTopologyReject: server returned 421 + X-QuestDB-Role + // other than PRIMARY_CATCHUP (or `target=` mismatch on the role + // table). Will not become writable without a topology change. + // Priority 5 (worst). + qwpHostTopologyReject +) + +// priority returns the spec-defined priority of a state. +// Lower is better. Unrecognized states return a sentinel that loses +// every comparison. +func (s qwpHostState) priority() int { + switch s { + case qwpHostHealthy: + return 1 + case qwpHostUnknown: + return 2 + case qwpHostTransientReject: + return 3 + case qwpHostTransportError: + return 4 + case qwpHostTopologyReject: + return 5 + } + return 99 +} + +// String returns the spec-doc name of the state for diagnostics.
+func (s qwpHostState) String() string { + switch s { + case qwpHostHealthy: + return "Healthy" + case qwpHostUnknown: + return "Unknown" + case qwpHostTransientReject: + return "TransientReject" + case qwpHostTransportError: + return "TransportError" + case qwpHostTopologyReject: + return "TopologyReject" + } + return "Invalid" +} + +// qwpZoneTier classifies a host's zone relative to the client's +// configured `zone=` value. Assignment happens via RecordZone, fed +// from either SERVER_INFO.zone_id (post-upgrade) or X-QuestDB-Zone +// (upgrade reject). +type qwpZoneTier byte + +const ( + // qwpZoneSame: server zone equals client zone (case-insensitive), + // OR client zone is unset, OR target=primary (writers must follow + // the master regardless of geography). Priority 1. + qwpZoneSame qwpZoneTier = iota + // qwpZoneUnknown: server did not advertise a zone (no CAP_ZONE, + // no X-QuestDB-Zone header, or the client did not consume + // SERVER_INFO). Priority 2. + qwpZoneUnknown + // qwpZoneOther: server advertised a zone that differs from the + // client's `zone=`. Priority 3 (worst). Only reachable when the + // client has an explicit zone and target != primary. + qwpZoneOther +) + +// priority returns the spec-defined zone tier priority. Lower is +// better; ordering is `Same` < `Unknown` < `Other`. +func (z qwpZoneTier) priority() int { + switch z { + case qwpZoneSame: + return 1 + case qwpZoneUnknown: + return 2 + case qwpZoneOther: + return 3 + } + return 99 +} + +// String returns the spec-doc name of the tier for diagnostics. +func (z qwpZoneTier) String() string { + switch z { + case qwpZoneSame: + return "Same" + case qwpZoneUnknown: + return "Unknown" + case qwpZoneOther: + return "Other" + } + return "Invalid" +} + +// qwpHostEntry is the per-host tracker slot. The `attempted` bit is +// reset at every BeginRound; state and zoneTier persist across rounds +// unless explicitly cleared (BeginRound(forgetClassifications=true)). 
+type qwpHostEntry struct { + state qwpHostState + zoneTier qwpZoneTier + attempted bool +} + +// qwpHostTracker implements the failover.md §2 host-health model: +// each configured `addr=` entry carries a `(state, zone_tier)` +// classification and a per-round `attempted` bit. PickNext returns +// the lexicographically-best unattempted entry. +// +// The tracker is shared across loops (foreground SF I/O thread, +// orphan drainers, etc.); per-caller demotion state (e.g. the +// `previousIdx` slot used to drive RecordMidStreamFailure on the +// next iteration) lives on the *caller*, not on the tracker. See +// failover.md §2.3 "Per-caller previousIdx, not shared". +// +// All methods are safe for concurrent use; a single internal mutex +// serializes every operation. The public API is not required to be +// re-entrant. +type qwpHostTracker struct { + mu sync.Mutex + + // hosts is the per-endpoint slot table. len(hosts) matches the + // configured addr= list and never changes for the tracker's + // lifetime. + hosts []qwpHostEntry + + // clientZone is the trimmed, lowercased value of the + // connect-string `zone=` key. Empty when the user did not + // configure a zone (including whitespace-only values, which + // collapse to "" after the constructor's TrimSpace). + clientZone string + + // target collapses zone tiers to Same when set to + // qwpTargetPrimary (writers must follow the master regardless + // of geography). Other target values leave zone-tier assignment + // to RecordZone. + target QwpTargetFilter +} + +// newQwpHostTracker constructs a tracker for `numHosts` configured +// endpoints. The initial state of every host is `Unknown` (never +// observed); the initial zone tier depends on the client config: +// +// - Same when target=primary or the client zone is unset. No zone +// observation is needed in these cases — the tier collapses for +// all hosts. +// - Unknown otherwise. 
RecordZone fills in Same/Other once the +// transport observes a server zone for the host. +// +// clientZone is case-insensitive and whitespace-insensitive (stored +// trimmed + lowercased); pass "" when the user did not configure +// one. A whitespace-only value collapses to "" here so the +// zone-blind shortcut applies — symmetric with RecordZone, which +// trims server-side zone observations. numHosts must be > 0; the +// caller is responsible for validation (sanitizeQwpConf rejects an +// empty endpoint list before reaching this point). +func newQwpHostTracker(numHosts int, clientZone string, target QwpTargetFilter) *qwpHostTracker { + t := &qwpHostTracker{ + hosts: make([]qwpHostEntry, numHosts), + clientZone: strings.ToLower(strings.TrimSpace(clientZone)), + target: target, + } + initialZone := qwpZoneUnknown + if t.zoneCollapsedToSame() { + initialZone = qwpZoneSame + } + for i := range t.hosts { + t.hosts[i] = qwpHostEntry{ + state: qwpHostUnknown, + zoneTier: initialZone, + } + } + return t +} + +// zoneCollapsedToSame reports whether every host's zone tier +// collapses to Same regardless of observed server zone. Holds when +// target=primary (writers follow the master) or the client did not +// configure a zone (zone-blind). Does not require the lock; reads +// only immutable fields set at construction. +func (t *qwpHostTracker) zoneCollapsedToSame() bool { + return t.target == qwpTargetPrimary || t.clientZone == "" +} + +// Len returns the number of hosts the tracker manages. Exposed +// mainly so callers can size their own per-caller previousIdx slots +// to match the addr= list. +func (t *qwpHostTracker) Len() int { + return len(t.hosts) +} + +// PickNext returns the index of the highest-priority unattempted +// host, or -1 if the round is exhausted. Selection is +// lexicographic on (state_priority, zone_priority); ties go to the +// lower index (i.e. the order in which the user supplied addr=). 
+// +// Calling PickNext twice without an intervening BeginRound is +// permitted on a non-exhausted tracker — the result is deterministic +// and idempotent because PickNext does not mutate `attempted`. The +// caller is responsible for invoking the appropriate Record* method +// before the next selection so the same host isn't returned again. +func (t *qwpHostTracker) PickNext() int { + t.mu.Lock() + defer t.mu.Unlock() + best := -1 + bestStatePri := 0 + bestZonePri := 0 + for i := range t.hosts { + h := &t.hosts[i] + if h.attempted { + continue + } + sp := h.state.priority() + zp := h.zoneTier.priority() + if best == -1 || sp < bestStatePri || (sp == bestStatePri && zp < bestZonePri) { + best = i + bestStatePri = sp + bestZonePri = zp + } + } + return best +} + +// IsRoundExhausted reports whether every host has been attempted in +// the current round. The reconnect loop calls this between +// PickNext == -1 and BeginRound to confirm the exhaustion path — +// useful for diagnostics; correctness only requires the PickNext +// return value. +func (t *qwpHostTracker) IsRoundExhausted() bool { + t.mu.Lock() + defer t.mu.Unlock() + for i := range t.hosts { + if !t.hosts[i].attempted { + return false + } + } + return true +} + +// RecordSuccess marks host idx as Healthy and consumes its round +// slot. Previously-Healthy hosts (at other indices) are NOT +// implicitly demoted — the sticky-Healthy effect emerges at +// BeginRound(forgetClassifications=true). Out-of-range idx is a +// silent no-op so callers can pass a stored previousIdx without a +// defensive bounds check. +func (t *qwpHostTracker) RecordSuccess(idx int) { + t.mu.Lock() + defer t.mu.Unlock() + if idx < 0 || idx >= len(t.hosts) { + return + } + t.hosts[idx].state = qwpHostHealthy + t.hosts[idx].attempted = true +} + +// RecordRoleReject classifies a 421 + role response. 
When +// transient is true (role == PRIMARY_CATCHUP), the host enters +// TransientReject and gets another chance on the next +// BeginRound(forgetClassifications=true); when false (any other +// non-empty role), it enters TopologyReject and stays at the +// lowest priority until the operator confirms cluster health. +// Both outcomes consume the round slot. +func (t *qwpHostTracker) RecordRoleReject(idx int, transient bool) { + t.mu.Lock() + defer t.mu.Unlock() + if idx < 0 || idx >= len(t.hosts) { + return + } + if transient { + t.hosts[idx].state = qwpHostTransientReject + } else { + t.hosts[idx].state = qwpHostTopologyReject + } + t.hosts[idx].attempted = true +} + +// RecordTransportError marks a host as TransportError after a +// TCP/TLS/handshake failure during connect. Consumes the round +// slot. Mid-stream send/recv failures (after a successful upgrade) +// go through RecordMidStreamFailure instead. +func (t *qwpHostTracker) RecordTransportError(idx int) { + t.mu.Lock() + defer t.mu.Unlock() + if idx < 0 || idx >= len(t.hosts) { + return + } + t.hosts[idx].state = qwpHostTransportError + t.hosts[idx].attempted = true +} + +// RecordMidStreamFailure demotes a Healthy host to TransportError +// after the receive or send pump throws past a successful upgrade. +// Does NOT touch `attempted` — the caller passes its private +// previousIdx slot and we want the next PickNext to consider the +// newly-demoted host as one of the candidates in the same round. +// Non-Healthy entries are left alone; if a drainer already +// observed a TopologyReject on this index, foreground's mid-stream +// failure should not undo that classification. +// +// The reconnect-loop ordering invariant (failover.md §2.3) is: +// call RecordMidStreamFailure BEFORE the next PickNext / BeginRound. +// Reversing the order makes sticky-Healthy preserve the just-failed +// host as priority pick, which then receives the first reconnect +// attempt and fails again. 
+func (t *qwpHostTracker) RecordMidStreamFailure(idx int) { + t.mu.Lock() + defer t.mu.Unlock() + if idx < 0 || idx >= len(t.hosts) { + return + } + if t.hosts[idx].state == qwpHostHealthy { + t.hosts[idx].state = qwpHostTransportError + } +} + +// RecordZone updates a host's zone tier from an observed server +// zone identifier. Inputs follow the spec: +// +// - zoneId == "" (or whitespace-only): no-op; the existing tier +// is preserved. This covers servers that did not emit a zone +// header (servers without CAP_ZONE, or a 421 reject without +// X-QuestDB-Zone). The tracker's initial tier remains in effect. +// - zoneId == client zone (case-insensitive): tier becomes Same. +// - target=primary or client zone unset: tier becomes Same +// regardless of the zoneId value (the spec collapses zone tiers +// in these modes; writers must follow the master). +// - otherwise: tier becomes Other. +// +// Does NOT touch state or attempted — zone observation is +// orthogonal to state classification and may happen on the same +// connect attempt that also records success / role-reject / +// transport-error. +func (t *qwpHostTracker) RecordZone(idx int, zoneId string) { + trimmed := strings.TrimSpace(zoneId) + if trimmed == "" { + return + } + t.mu.Lock() + defer t.mu.Unlock() + if idx < 0 || idx >= len(t.hosts) { + return + } + if t.zoneCollapsedToSame() { + t.hosts[idx].zoneTier = qwpZoneSame + return + } + if strings.EqualFold(trimmed, t.clientZone) { + t.hosts[idx].zoneTier = qwpZoneSame + } else { + t.hosts[idx].zoneTier = qwpZoneOther + } +} + +// BeginRound clears the per-round attempted flags. When +// forgetClassifications is true, additionally: +// +// - Resets every non-Healthy state to Unknown so stale +// TransientReject / TopologyReject / TransportError entries get +// another chance. +// - Preserves the LAST Healthy entry whose zone tier is Same as +// the sticky-Healthy pin. 
Any earlier same-zone Healthy entry, +// and any cross-zone (Other) Healthy entry, is reset to Unknown +// — a sticky pin in another zone would otherwise lock the +// client out of probing local hosts after they recover. +// - Zone tiers are NOT cleared — once observed, they persist for +// the host's lifetime in this client until re-observed. +// +// forgetClassifications=true is the between-outages reset; false +// is the within-outage reset (same round bits cleared, +// classifications preserved). +func (t *qwpHostTracker) BeginRound(forgetClassifications bool) { + t.mu.Lock() + defer t.mu.Unlock() + for i := range t.hosts { + t.hosts[i].attempted = false + } + if !forgetClassifications { + return + } + // Find the LAST Healthy entry with Same zone tier — preserve that + // one and only that one. A later same-zone Healthy supersedes any + // earlier one; cross-zone (Other) Healthy entries are not + // preserved at all. + stickyIdx := -1 + for i := range t.hosts { + if t.hosts[i].state == qwpHostHealthy && t.hosts[i].zoneTier == qwpZoneSame { + stickyIdx = i + } + } + for i := range t.hosts { + if i == stickyIdx { + continue + } + // Reset every non-Unknown state to Unknown. This covers: + // - All Healthy entries that aren't the sticky (earlier + // same-zone Healthy, or cross-zone Healthy). + // - All TransientReject / TopologyReject / TransportError + // entries (give them another chance next round). + if t.hosts[i].state != qwpHostUnknown { + t.hosts[i].state = qwpHostUnknown + } + } +} + +// snapshot returns a copy of the host-entry slice. Test-only; +// callers must not mutate the returned slice (it shares no memory +// with the tracker, but the contract is "observation, not +// influence"). 
+func (t *qwpHostTracker) snapshot() []qwpHostEntry { + t.mu.Lock() + defer t.mu.Unlock() + out := make([]qwpHostEntry, len(t.hosts)) + copy(out, t.hosts) + return out +} diff --git a/qwp_host_tracker_test.go b/qwp_host_tracker_test.go new file mode 100644 index 00000000..e543fdaf --- /dev/null +++ b/qwp_host_tracker_test.go @@ -0,0 +1,583 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "sync" + "sync/atomic" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// --- Construction & initial state --- + +// TestQwpHostTrackerInitialStateZoneUnset confirms that, when the +// client did not configure a zone, every host starts Unknown/Same — +// the spec's zone-blind shortcut. PickNext then becomes a pure +// state-priority race. 
+func TestQwpHostTrackerInitialStateZoneUnset(t *testing.T) { + tr := newQwpHostTracker(3, "", qwpTargetAny) + for i, h := range tr.snapshot() { + assert.Equal(t, qwpHostUnknown, h.state, "host %d initial state", i) + assert.Equal(t, qwpZoneSame, h.zoneTier, "host %d zoneTier (client zone unset → Same)", i) + assert.False(t, h.attempted, "host %d attempted", i) + } +} + +// TestQwpHostTrackerInitialStateTargetPrimary verifies that +// target=primary collapses every host's initial tier to Same, even +// when the client has an explicit zone. Writers must follow the +// master regardless of geography (failover.md §2). +func TestQwpHostTrackerInitialStateTargetPrimary(t *testing.T) { + tr := newQwpHostTracker(2, "eu-west-1a", qwpTargetPrimary) + for i, h := range tr.snapshot() { + assert.Equal(t, qwpZoneSame, h.zoneTier, "host %d zoneTier (target=primary collapse)", i) + } +} + +// TestQwpHostTrackerInitialStateZoneAware verifies that when the +// client has an explicit zone and target!=primary, the initial tier +// is Unknown until RecordZone fills it in. +func TestQwpHostTrackerInitialStateZoneAware(t *testing.T) { + tr := newQwpHostTracker(2, "eu-west-1a", qwpTargetAny) + for i, h := range tr.snapshot() { + assert.Equal(t, qwpZoneUnknown, h.zoneTier, "host %d zoneTier", i) + } +} + +// TestQwpHostTrackerInitialStateZoneWhitespaceOnly confirms that a +// whitespace-only client zone collapses to the zone-unset shortcut. +// Without the constructor's TrimSpace the tracker would treat every +// observed server zone as Other (since EqualFold(" ", any) is +// false), breaking zone-locality for users who accidentally pass +// `zone= `. 
+func TestQwpHostTrackerInitialStateZoneWhitespaceOnly(t *testing.T) { + tr := newQwpHostTracker(2, " \t ", qwpTargetAny) + for i, h := range tr.snapshot() { + assert.Equal(t, qwpZoneSame, h.zoneTier, "host %d zoneTier (whitespace zone → unset → Same)", i) + } + // Subsequent RecordZone observations must also collapse to Same, + // not Other — proves the trim sticks beyond the initial tier. + tr.RecordZone(0, "us-east-1a") + tr.RecordZone(1, "eu-west-1a") + for i, h := range tr.snapshot() { + assert.Equal(t, qwpZoneSame, h.zoneTier, "host %d zoneTier after RecordZone", i) + } +} + +// TestQwpHostTrackerLen reports the configured host count. +func TestQwpHostTrackerLen(t *testing.T) { + assert.Equal(t, 3, newQwpHostTracker(3, "", qwpTargetAny).Len()) + assert.Equal(t, 0, newQwpHostTracker(0, "", qwpTargetAny).Len()) +} + +// --- PickNext basic walk --- + +// TestQwpHostTrackerPickNextWalksInOrder verifies that on a fresh +// round with all Unknown hosts, PickNext returns 0, then 1, etc. +// after each is recorded. Tie-breaks go to the lower index. +func TestQwpHostTrackerPickNextWalksInOrder(t *testing.T) { + tr := newQwpHostTracker(3, "", qwpTargetAny) + assert.Equal(t, 0, tr.PickNext()) + tr.RecordTransportError(0) + assert.Equal(t, 1, tr.PickNext()) + tr.RecordTransportError(1) + assert.Equal(t, 2, tr.PickNext()) + tr.RecordTransportError(2) + assert.Equal(t, -1, tr.PickNext(), "round must exhaust after every host attempted") + assert.True(t, tr.IsRoundExhausted()) +} + +// TestQwpHostTrackerPickNextEmpty edge case: a tracker with zero +// hosts must immediately report -1 / exhausted without panicking. +func TestQwpHostTrackerPickNextEmpty(t *testing.T) { + tr := newQwpHostTracker(0, "", qwpTargetAny) + assert.Equal(t, -1, tr.PickNext()) + assert.True(t, tr.IsRoundExhausted()) +} + +// TestQwpHostTrackerPickNextSkipsAttempted: once an entry is +// attempted (regardless of outcome), PickNext must skip it within +// the same round. 
 +func TestQwpHostTrackerPickNextSkipsAttempted(t *testing.T) { + tr := newQwpHostTracker(3, "", qwpTargetAny) + tr.RecordSuccess(1) // priority 1 (Healthy) but already attempted + // Both 0 and 2 remain Unknown (priority 2), unattempted. The + // lower index wins the tie. + assert.Equal(t, 0, tr.PickNext()) +} + +// --- State priority ordering --- + +// TestQwpHostTrackerStatePriorityOrdering walks the full state +// lattice: with five hosts in distinct states, PickNext must visit +// them in Healthy → Unknown → TransientReject → TransportError → +// TopologyReject order. +func TestQwpHostTrackerStatePriorityOrdering(t *testing.T) { + tr := newQwpHostTracker(5, "", qwpTargetAny) + // Force-install distinct states across the five hosts. The + // public API only mutates state via Record* (which also sets + // attempted), so we drive a setup round through Record* for the + // test scaffolding instead of poking the internal slice directly: + // record a "fake" round to install the states, then call + // BeginRound(false) to clear attempted while preserving + // classification (no forget). + tr.RecordTransportError(0) // host 0 → TransportError + tr.RecordRoleReject(1, true) // host 1 → TransientReject + tr.RecordRoleReject(2, false) // host 2 → TopologyReject + tr.RecordSuccess(3) // host 3 → Healthy + // host 4 stays Unknown. + tr.BeginRound(false) + + // Best state is Healthy (3), then Unknown (4), then + // TransientReject (1), then TransportError (0), then + // TopologyReject (2). + expectOrder := []int{3, 4, 1, 0, 2} + for step, want := range expectOrder { + got := tr.PickNext() + require.Equalf(t, want, got, "step %d: expected host %d", step, want) + tr.RecordTransportError(got) // consume the round slot + } + assert.Equal(t, -1, tr.PickNext()) +} + +// --- Zone priority ordering --- + +// TestQwpHostTrackerZonePriorityOrdering: with all states equal, +// zone tier breaks the tie. Same < Unknown < Other.
+func TestQwpHostTrackerZonePriorityOrdering(t *testing.T) { + tr := newQwpHostTracker(3, "eu-west-1a", qwpTargetAny) + // All start in Unknown state with Unknown zone tier (since + // client has an explicit zone). Install zones: + // host 0 → Other ("us-east-1a") + // host 1 → Same ("eu-west-1a") + // host 2 → (left as Unknown) + tr.RecordZone(0, "us-east-1a") + tr.RecordZone(1, "eu-west-1a") + // host 2 stays zone=Unknown. + + // All states are Unknown → lexicographic comparison falls to + // zone priority. Order should be: 1 (Same), 2 (Unknown), 0 (Other). + expectOrder := []int{1, 2, 0} + for step, want := range expectOrder { + got := tr.PickNext() + require.Equalf(t, want, got, "step %d: expected host %d", step, want) + tr.RecordTransportError(got) + } +} + +// TestQwpHostTrackerLexicographicStateOverridesZone: state outranks +// zone. An Other-zone Healthy beats a Same-zone Unknown. +func TestQwpHostTrackerLexicographicStateOverridesZone(t *testing.T) { + tr := newQwpHostTracker(2, "eu-west-1a", qwpTargetAny) + tr.RecordZone(0, "us-east-1a") // host 0 → state=Unknown, zone=Other (priority (2, 3)) + tr.RecordZone(1, "eu-west-1a") // host 1 → state=Unknown, zone=Same (priority (2, 1)) + // Promote host 0 to Healthy so its priority becomes (1, 3). + tr.RecordSuccess(0) + tr.BeginRound(false) + + // Host 0 (1, 3) beats host 1 (2, 1) because state outranks zone. + assert.Equal(t, 0, tr.PickNext()) +} + +// TestQwpHostTrackerTieBreakByListOrder: equal (state, zone) ties go +// to the lower index — matching the user-supplied addr= order. +func TestQwpHostTrackerTieBreakByListOrder(t *testing.T) { + tr := newQwpHostTracker(4, "", qwpTargetAny) + // All Unknown / Same after construction with zone unset. 
+ assert.Equal(t, 0, tr.PickNext()) + tr.RecordTransportError(0) + assert.Equal(t, 1, tr.PickNext()) + tr.RecordTransportError(1) + assert.Equal(t, 2, tr.PickNext()) +} + +// --- RecordZone semantics --- + +// TestQwpHostTrackerRecordZoneEmptyIsNoOp: an empty/whitespace +// zoneId must NOT touch the tier (spec §2.1). +func TestQwpHostTrackerRecordZoneEmptyIsNoOp(t *testing.T) { + tr := newQwpHostTracker(1, "eu-west-1a", qwpTargetAny) + tr.RecordZone(0, "eu-west-1a") // tier → Same + tr.RecordZone(0, "") // no-op + tr.RecordZone(0, " ") // no-op (whitespace) + assert.Equal(t, qwpZoneSame, tr.snapshot()[0].zoneTier) +} + +// TestQwpHostTrackerRecordZoneCaseInsensitive: comparison against +// client zone is case-insensitive (failover.md §1.1, §5). +func TestQwpHostTrackerRecordZoneCaseInsensitive(t *testing.T) { + tr := newQwpHostTracker(1, "EU-West-1A", qwpTargetAny) + tr.RecordZone(0, "eu-west-1a") + assert.Equal(t, qwpZoneSame, tr.snapshot()[0].zoneTier) + + tr2 := newQwpHostTracker(1, "eu-west-1a", qwpTargetAny) + tr2.RecordZone(0, "EU-WEST-1A") + assert.Equal(t, qwpZoneSame, tr2.snapshot()[0].zoneTier) +} + +// TestQwpHostTrackerRecordZoneTargetPrimaryAlwaysSame: under +// target=primary, even a clearly-different zoneId must yield Same +// (zone tier collapses). +func TestQwpHostTrackerRecordZoneTargetPrimaryAlwaysSame(t *testing.T) { + tr := newQwpHostTracker(1, "eu-west-1a", qwpTargetPrimary) + tr.RecordZone(0, "us-east-1a") + assert.Equal(t, qwpZoneSame, tr.snapshot()[0].zoneTier) +} + +// TestQwpHostTrackerRecordZoneClientUnsetAlwaysSame: when the +// client did not configure a zone, every observed zoneId yields +// Same. The spec's rationale: a zone-blind client has no +// preference, so every host is equally local. 
+func TestQwpHostTrackerRecordZoneClientUnsetAlwaysSame(t *testing.T) { + tr := newQwpHostTracker(1, "", qwpTargetAny) + tr.RecordZone(0, "us-east-1a") + assert.Equal(t, qwpZoneSame, tr.snapshot()[0].zoneTier) +} + +// TestQwpHostTrackerRecordZoneDoesNotTouchStateOrAttempted: zone +// observation is orthogonal to state / round bookkeeping. +func TestQwpHostTrackerRecordZoneDoesNotTouchStateOrAttempted(t *testing.T) { + tr := newQwpHostTracker(1, "eu-west-1a", qwpTargetAny) + tr.RecordZone(0, "eu-west-1a") + h := tr.snapshot()[0] + assert.Equal(t, qwpHostUnknown, h.state, "state must remain Unknown") + assert.False(t, h.attempted, "attempted must remain false") +} + +// TestQwpHostTrackerRecordZoneOutOfRangeNoOp: out-of-range idx +// must not panic — the caller may legitimately pass a stale +// previousIdx slot. +func TestQwpHostTrackerRecordZoneOutOfRangeNoOp(t *testing.T) { + tr := newQwpHostTracker(2, "eu-west-1a", qwpTargetAny) + assert.NotPanics(t, func() { + tr.RecordZone(-1, "x") + tr.RecordZone(42, "x") + }) +} + +// --- Mid-stream demote semantics --- + +// TestQwpHostTrackerMidStreamDemotesHealthyOnly: per failover.md +// §2.1, mid-stream failure demotes Healthy → TransportError but +// must not touch other states (a drainer's earlier TopologyReject +// observation must survive a foreground mid-stream blip). 
 +func TestQwpHostTrackerMidStreamDemotesHealthyOnly(t *testing.T) { + tr := newQwpHostTracker(4, "", qwpTargetAny) + tr.RecordSuccess(0) // host 0 → Healthy + tr.RecordRoleReject(1, true) // host 1 → TransientReject + tr.RecordRoleReject(2, false) // host 2 → TopologyReject + tr.RecordTransportError(3) // host 3 → TransportError (already worst-but-1) + + tr.RecordMidStreamFailure(0) + tr.RecordMidStreamFailure(1) + tr.RecordMidStreamFailure(2) + tr.RecordMidStreamFailure(3) + + snap := tr.snapshot() + assert.Equal(t, qwpHostTransportError, snap[0].state, "Healthy must demote to TransportError") + assert.Equal(t, qwpHostTransientReject, snap[1].state, "TransientReject must be untouched") + assert.Equal(t, qwpHostTopologyReject, snap[2].state, "TopologyReject must be untouched") + assert.Equal(t, qwpHostTransportError, snap[3].state, "already-TransportError must be untouched") +} + +// TestQwpHostTrackerMidStreamDoesNotTouchAttempted: mid-stream +// demotion leaves the attempted bit clear so the just-demoted host +// stays a PickNext candidate in the same round. +func TestQwpHostTrackerMidStreamDoesNotTouchAttempted(t *testing.T) { + tr := newQwpHostTracker(2, "", qwpTargetAny) + tr.RecordSuccess(0) + tr.BeginRound(false) // attempted cleared but state preserved + assert.False(t, tr.snapshot()[0].attempted) + tr.RecordMidStreamFailure(0) + assert.False(t, tr.snapshot()[0].attempted, + "RecordMidStreamFailure must NOT set attempted") +} + +// TestQwpHostTrackerMidStreamOutOfRangeNoOp covers the same defensive +// bounds check as RecordZone for callers passing a stale previousIdx.
+func TestQwpHostTrackerMidStreamOutOfRangeNoOp(t *testing.T) { + tr := newQwpHostTracker(2, "", qwpTargetAny) + assert.NotPanics(t, func() { + tr.RecordMidStreamFailure(-1) + tr.RecordMidStreamFailure(99) + }) +} + +// --- BeginRound semantics --- + +// TestQwpHostTrackerBeginRoundClearsAttemptedOnly: with +// forgetClassifications=false, every Record* outcome is preserved +// across the round boundary; only the attempted bits reset. +func TestQwpHostTrackerBeginRoundClearsAttemptedOnly(t *testing.T) { + tr := newQwpHostTracker(3, "", qwpTargetAny) + tr.RecordRoleReject(0, false) // host 0 → TopologyReject + tr.RecordRoleReject(1, true) // host 1 → TransientReject + tr.RecordSuccess(2) // host 2 → Healthy + tr.BeginRound(false) + + snap := tr.snapshot() + assert.Equal(t, qwpHostTopologyReject, snap[0].state) + assert.Equal(t, qwpHostTransientReject, snap[1].state) + assert.Equal(t, qwpHostHealthy, snap[2].state) + for i, h := range snap { + assert.False(t, h.attempted, "host %d attempted must be cleared", i) + } +} + +// TestQwpHostTrackerBeginRoundForgetResetsNonHealthy: with +// forgetClassifications=true, TransientReject / TopologyReject / +// TransportError all reset to Unknown for a fresh shot. +func TestQwpHostTrackerBeginRoundForgetResetsNonHealthy(t *testing.T) { + tr := newQwpHostTracker(3, "", qwpTargetAny) + tr.RecordRoleReject(0, false) + tr.RecordRoleReject(1, true) + tr.RecordTransportError(2) + tr.BeginRound(true) + for i, h := range tr.snapshot() { + assert.Equal(t, qwpHostUnknown, h.state, "host %d", i) + } +} + +// TestQwpHostTrackerStickyHealthyLastSameZone: with +// forgetClassifications=true, the LAST same-zone Healthy entry is +// preserved; earlier same-zone Healthy entries are reset. +func TestQwpHostTrackerStickyHealthyLastSameZone(t *testing.T) { + tr := newQwpHostTracker(3, "", qwpTargetAny) + // All three start Same (zone unset → collapsed). 
+ tr.RecordSuccess(0) + tr.BeginRound(true) // sticky host 0 preserved + + // Now mark host 1 as Healthy too. Both 0 and 1 are Same+Healthy. + tr.RecordSuccess(1) + tr.BeginRound(true) + snap := tr.snapshot() + assert.Equal(t, qwpHostUnknown, snap[0].state, "older same-zone Healthy must reset") + assert.Equal(t, qwpHostHealthy, snap[1].state, "last same-zone Healthy must be preserved") + assert.Equal(t, qwpHostUnknown, snap[2].state) +} + +// TestQwpHostTrackerStickyHealthyCrossZoneReset: a Healthy entry in +// the Other zone must NOT be preserved across BeginRound(true) — a +// sticky pin in another zone would otherwise lock the client out +// of probing local hosts after they recover. +func TestQwpHostTrackerStickyHealthyCrossZoneReset(t *testing.T) { + tr := newQwpHostTracker(2, "eu-west-1a", qwpTargetAny) + tr.RecordZone(0, "us-east-1a") // host 0 → Other + tr.RecordZone(1, "eu-west-1a") // host 1 → Same + tr.RecordSuccess(0) // host 0 → Healthy + Other + tr.BeginRound(true) + + snap := tr.snapshot() + assert.Equal(t, qwpHostUnknown, snap[0].state, + "cross-zone Healthy must reset, not be preserved as sticky") + assert.Equal(t, qwpZoneOther, snap[0].zoneTier, + "zone tier must persist across BeginRound") + assert.Equal(t, qwpZoneSame, snap[1].zoneTier, + "zone tier must persist across BeginRound (host 1)") +} + +// TestQwpHostTrackerStickyHealthyPicksSameOverOther: when both a +// same-zone Healthy and an other-zone Healthy exist, the same-zone +// one wins the sticky — even if the cross-zone one was recorded +// LATER. (Cross-zone Healthy never wins.) 
+func TestQwpHostTrackerStickyHealthyPicksSameOverOther(t *testing.T) { + tr := newQwpHostTracker(2, "eu-west-1a", qwpTargetAny) + tr.RecordZone(0, "eu-west-1a") + tr.RecordZone(1, "us-east-1a") + tr.RecordSuccess(0) // earlier + tr.RecordSuccess(1) // later, but cross-zone — must NOT win sticky + tr.BeginRound(true) + + snap := tr.snapshot() + assert.Equal(t, qwpHostHealthy, snap[0].state, "same-zone Healthy must be preserved") + assert.Equal(t, qwpHostUnknown, snap[1].state, "cross-zone Healthy must reset") +} + +// TestQwpHostTrackerStickyHealthyTargetPrimaryCollapsesToLast: under +// target=primary, every zone tier is Same so the rule degenerates +// to "preserve the last Healthy entry". +func TestQwpHostTrackerStickyHealthyTargetPrimaryCollapsesToLast(t *testing.T) { + tr := newQwpHostTracker(3, "eu-west-1a", qwpTargetPrimary) + tr.RecordZone(0, "eu-west-1a") + tr.RecordZone(1, "us-east-1a") // collapses to Same + tr.RecordSuccess(0) + tr.RecordSuccess(1) // later, also Same after collapse — wins sticky + tr.BeginRound(true) + + snap := tr.snapshot() + assert.Equal(t, qwpHostUnknown, snap[0].state) + assert.Equal(t, qwpHostHealthy, snap[1].state, + "target=primary: last Healthy wins regardless of original zone") +} + +// TestQwpHostTrackerBeginRoundPreservesZoneTier: zone tier must +// survive BeginRound (both variants). Re-observing a different +// zone is the only way to change it. +func TestQwpHostTrackerBeginRoundPreservesZoneTier(t *testing.T) { + tr := newQwpHostTracker(2, "eu-west-1a", qwpTargetAny) + tr.RecordZone(0, "eu-west-1a") + tr.RecordZone(1, "us-east-1a") + tr.BeginRound(false) + tr.BeginRound(true) + snap := tr.snapshot() + assert.Equal(t, qwpZoneSame, snap[0].zoneTier) + assert.Equal(t, qwpZoneOther, snap[1].zoneTier) +} + +// TestQwpHostTrackerStickyHealthyHonoursSelectionPriority: after a +// BeginRound(true) preserves a sticky-Healthy host, PickNext picks +// it first (priority 1). 
 +func TestQwpHostTrackerStickyHealthyHonoursSelectionPriority(t *testing.T) { + tr := newQwpHostTracker(3, "", qwpTargetAny) + tr.RecordTransportError(0) + tr.RecordTransportError(1) + tr.RecordSuccess(2) + tr.BeginRound(true) // host 2 preserved as sticky-Healthy + + assert.Equal(t, 2, tr.PickNext(), + "sticky-Healthy host must be returned first on the next round") +} + +// --- Out-of-range tolerance --- + +// TestQwpHostTrackerOutOfRangeNoOp covers the bounds-check +// contracts for the Record* operations a caller might invoke with +// a stale or default previousIdx (e.g. -1 / Len()+1). +func TestQwpHostTrackerOutOfRangeNoOp(t *testing.T) { + tr := newQwpHostTracker(2, "", qwpTargetAny) + assert.NotPanics(t, func() { + tr.RecordSuccess(-1) + tr.RecordSuccess(99) + tr.RecordRoleReject(-1, true) + tr.RecordRoleReject(99, false) + tr.RecordTransportError(-1) + tr.RecordTransportError(99) + tr.RecordMidStreamFailure(-1) + tr.RecordMidStreamFailure(99) + tr.RecordZone(-1, "x") + tr.RecordZone(99, "x") + }) + // State must be untouched on the in-range hosts. + for i, h := range tr.snapshot() { + assert.Equal(t, qwpHostUnknown, h.state, "host %d", i) + } +} + +// --- Concurrency --- + +// TestQwpHostTrackerConcurrentAccess hammers every operation from +// multiple goroutines and verifies that (a) no race triggers under +// -race, (b) every worker iteration completes, and (c) every host +// ends with a valid state / zone tier priority. The round-exhausted +// predicate is exercised concurrently but not cross-checked.
+func TestQwpHostTrackerConcurrentAccess(t *testing.T) { + const ( + numHosts = 8 + numWorkers = 16 + opsPerLoop = 500 + ) + tr := newQwpHostTracker(numHosts, "eu-west-1a", qwpTargetAny) + var counter atomic.Int64 + var wg sync.WaitGroup + wg.Add(numWorkers) + for w := 0; w < numWorkers; w++ { + go func(seed int) { + defer wg.Done() + for i := 0; i < opsPerLoop; i++ { + idx := (seed + i) % numHosts + switch (seed + i) % 7 { + case 0: + tr.RecordSuccess(idx) + case 1: + tr.RecordRoleReject(idx, true) + case 2: + tr.RecordRoleReject(idx, false) + case 3: + tr.RecordTransportError(idx) + case 4: + tr.RecordMidStreamFailure(idx) + case 5: + tr.RecordZone(idx, "eu-west-1a") + case 6: + if (seed+i)%2 == 0 { + tr.BeginRound(false) + } else { + tr.BeginRound(true) + } + } + _ = tr.PickNext() + _ = tr.IsRoundExhausted() + counter.Add(1) + } + }(w) + } + wg.Wait() + assert.Equal(t, int64(numWorkers*opsPerLoop), counter.Load()) + + // Post-hoc consistency: every entry must hold a valid state + + // zone tier value. The exact final classification is + // non-deterministic. + for i, h := range tr.snapshot() { + assert.GreaterOrEqual(t, h.state.priority(), 1, "host %d state=%v", i, h.state) + assert.LessOrEqual(t, h.state.priority(), 5, "host %d state=%v", i, h.state) + assert.GreaterOrEqual(t, h.zoneTier.priority(), 1, "host %d zone=%v", i, h.zoneTier) + assert.LessOrEqual(t, h.zoneTier.priority(), 3, "host %d zone=%v", i, h.zoneTier) + } +} + +// --- IsRoundExhausted --- + +// TestQwpHostTrackerIsRoundExhausted exercises the predicate in +// each meaningful phase. 
+func TestQwpHostTrackerIsRoundExhausted(t *testing.T) { + tr := newQwpHostTracker(2, "", qwpTargetAny) + assert.False(t, tr.IsRoundExhausted(), "fresh tracker is not exhausted") + + tr.RecordTransportError(0) + assert.False(t, tr.IsRoundExhausted(), "one of two attempted") + + tr.RecordTransportError(1) + assert.True(t, tr.IsRoundExhausted(), "both attempted") + + tr.BeginRound(false) + assert.False(t, tr.IsRoundExhausted(), "BeginRound clears attempted") +} + +// TestQwpHostTrackerStringers covers the diagnostic stringers so a +// future change that adds a state / tier doesn't silently produce +// "Invalid" in error messages. +func TestQwpHostTrackerStringers(t *testing.T) { + assert.Equal(t, "Healthy", qwpHostHealthy.String()) + assert.Equal(t, "Unknown", qwpHostUnknown.String()) + assert.Equal(t, "TransientReject", qwpHostTransientReject.String()) + assert.Equal(t, "TransportError", qwpHostTransportError.String()) + assert.Equal(t, "TopologyReject", qwpHostTopologyReject.String()) + assert.Equal(t, "Same", qwpZoneSame.String()) + assert.Equal(t, "Unknown", qwpZoneUnknown.String()) + assert.Equal(t, "Other", qwpZoneOther.String()) +} diff --git a/qwp_ingress_oracle_fuzz_test.go b/qwp_ingress_oracle_fuzz_test.go new file mode 100644 index 00000000..d03900a3 --- /dev/null +++ b/qwp_ingress_oracle_fuzz_test.go @@ -0,0 +1,1741 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//go:build !windows + +package questdb + +// Go port of QuestDB's QwpIngressOracleFuzzTest (the multi-sender, +// no-bounce scenario). Every row the test intends to publish is +// materialised up front as an oracleRow (covering the full QWP-only +// type system: bool / all int widths / float / double / char / string / +// symbol / uuid / long256 / nanosecond timestamp / decimal 64-128-256 / +// 1D-2D-3D double arrays) and added to an oracleTable keyed by +// (ts, id). Concurrent producer goroutines each own a contiguous slice +// of rows and publish them through the Go QWP sender into a DEDUP +// UPSERT KEYS(ts, id) table. After ingestion, every cell of every row +// is asserted against the oracle via a `SELECT * ORDER BY ts, id` +// streamed back over the QWP query client. Because the oracle is +// pre-generated and (ts, id) is globally unique, any wire-level replay +// collapses cleanly under DEDUP and cannot drift the contract. +// +// Faithful-port divergences from the Java source (cf. the egress / +// bind / bounds ports' headers): +// +// - No server bounce / no sf_dir / no async-connect. The Java suite's +// bounce-torture, restart-replay, and async-connect scenarios need +// a controllable start/stop server (RestartableQwpServer); the Go +// fixture is a shared long-lived server (and only fixture-launched +// mode could bounce it). 
This slice ports the pure correctness +// property — concurrent multi-sender ingest + DEDUP + full-type +// round-trip — and runs against the shared server (live or +// fixture-launched). The deferred scenarios are tracked separately. +// - Verification is via QWP `SELECT * ORDER BY ts, id` over the query +// client, not an in-process RecordCursor; absent/skipped cells are +// asserted NULL via QwpColumnBatch.IsNull (mirrors +// QwpTable.assertCursor). +// - Decimal values are kept non-negative; Java's per-decimal sign +// flip (two's-complement limb negation) is deferred — it is +// orthogonal to the ingest/dedup/round-trip property and is the +// part most likely to need its own debugging pass. Scales and +// auto-precision-extra columns are still fully exercised. +// - Per-producer auto_flush_rows variation is simplified to explicit +// Flush() at batch boundaries (correctness-equivalent without +// bounces). Row counts are bounded smaller than the Java suite to +// keep CI time in check while still crossing batch boundaries and +// stressing DEDUP under concurrency. +// - Reproducible via QWP_FUZZ_SEED (shared newFuzzRand). + +import ( + "context" + "fmt" + "math/big" + "math/rand" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + "sync/atomic" + "testing" + "time" +) + +const ( + oracleTableName = "qwp_oracle_fuzz" + oracleColumnSkip = 8 // ~12% of rows skip a base column + oracleNewColumn = 16 // ~6% of rows inject an extra column + oracleNonASCII = 4 // ~25% of string/symbol values get a non-ASCII suffix + oracleBaseTsMicros = int64(1_700_000_000_000_000) +) + +// oracleCreateSQL is the DEDUP target-table DDL shared by the ingress +// oracle tests (mirrors QwpIngressOracleFuzzTest.createTargetTable). 
+const oracleCreateSQL = "CREATE TABLE " + oracleTableName + " (" + + "id LONG, b BOOLEAN, b8 BYTE, s16 SHORT, c CHAR, i INT, l LONG, " + + "f FLOAT, d DOUBLE, s STRING, sym SYMBOL, u UUID, l256 LONG256, " + + "tn TIMESTAMP_NS, da DOUBLE[], da2 DOUBLE[][], da3 DOUBLE[][][], " + + "dec64 DECIMAL(12,3), dec128 DECIMAL(25,4), dec256 DECIMAL(50,6), " + + "ts TIMESTAMP) TIMESTAMP(ts) PARTITION BY DAY WAL " + + "DEDUP UPSERT KEYS(ts, id)" + +// oracleNonASCIISuffixes spans the UTF-8 byte-length spectrum (2/3/4 +// byte) so the wire path exercises multi-byte encoding. +var oracleNonASCIISuffixes = []string{ + "é", "ñ", "ж", "Я", "日", "中", "한", "🎉", +} + +// --- typed oracle cell ------------------------------------------------- + +type oracleKind int + +const ( + ocAbsent oracleKind = iota // column not written this row -> expect NULL + ocBool + ocByte + ocShort + ocChar + ocInt + ocLong + ocFloat + ocDouble + ocString + ocSymbol + ocUUID + ocLong256 + ocTsNano + ocDec64 + ocDec128 + ocDec256 + ocArr // 1D/2D/3D double array (flattened row-major + shape) +) + +type oracleCell struct { + kind oracleKind + // scalars + i64 int64 // byte/short/int/long/tsnano/dec64-unscaled + f64 float64 // float (as float64 of the float32) / double + b bool + ch rune + str string + // uuid + uhi, ulo int64 + // long256: words[0] = least-significant + words [4]int64 + // decimal + dec *big.Int // unscaled (dec128/dec256); dec64 uses i64 + scale int + // array + arr []float64 // flattened row-major + shape []int +} + +type oracleRow struct { + id int64 + tsMicros int64 + cells map[string]oracleCell +} + +func newOracleRow(id, tsMicros int64) *oracleRow { + return &oracleRow{id: id, tsMicros: tsMicros, cells: make(map[string]oracleCell, 24)} +} + +func (r *oracleRow) set(name string, c oracleCell) { r.cells[name] = c } + +// oracleTable is the pre-generated expectation: rows in (ts, id) order +// (== generation order, since ts/id are globally unique and monotonic +// with the global index) 
plus the set of every column name ever +// written (so a SELECT * column the oracle never set can be asserted +// as wholly absent). +type oracleTable struct { + rows []*oracleRow + colNames map[string]struct{} +} + +func newOracleTable() *oracleTable { + return &oracleTable{colNames: make(map[string]struct{}, 64)} +} + +func (t *oracleTable) addRow(r *oracleRow) { + t.rows = append(t.rows, r) + for n := range r.cells { + t.colNames[n] = struct{}{} + } +} + +// --- random value generation (faithful port of generateRow) ---------- + +func oracleShouldFuzz(r *rand.Rand, factor int) bool { + return factor > 0 && r.Intn(factor) == 0 +} + +func oracleMaybeNegateF(r *rand.Rand, v float64) float64 { + if r.Intn(2) == 0 { + return -v + } + return v +} + +func oracleMaybeNegateI(r *rand.Rand, v int64) int64 { + if r.Intn(2) == 0 { + return -v + } + return v +} + +func oracleMaybeNonASCII(r *rand.Rand) string { + if oracleShouldFuzz(r, oracleNonASCII) { + return oracleNonASCIISuffixes[r.Intn(len(oracleNonASCIISuffixes))] + } + return "" +} + +func oracleArr1d(id int64, sign float64) ([]float64, []int) { + return []float64{float64(id) * sign, float64(id) * 2 * sign, float64(id) * 3 * sign}, []int{3} +} + +func oracleArr2d(id int64, sign float64) ([]float64, []int) { + return []float64{ + float64(id) * sign, float64(id) * 2 * sign, + float64(id) * 3 * sign, float64(id) * 4 * sign, + }, []int{2, 2} +} + +func oracleArr3d(id int64, sign float64) ([]float64, []int) { + out := make([]float64, 0, 12) + for _, m := range []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12} { + out = append(out, float64(id)*m*sign) + } + return out, []int{2, 2, 3} +} + +func oracleSign(r *rand.Rand) float64 { + if r.Intn(2) == 0 { + return -1.0 + } + return 1.0 +} + +// u128 builds a non-negative big.Int from hi:lo (unsigned 64-bit limbs). 
+func u128(hi, lo uint64) *big.Int { + h := new(big.Int).SetUint64(hi) + h.Lsh(h, 64) + return h.Or(h, new(big.Int).SetUint64(lo)) +} + +// u256 builds a non-negative big.Int from hh:hl:lh:ll (unsigned limbs). +func u256(hh, hl, lh, ll uint64) *big.Int { + v := new(big.Int).SetUint64(hh) + for _, limb := range []uint64{hl, lh, ll} { + v.Lsh(v, 64) + v.Or(v, new(big.Int).SetUint64(limb)) + } + return v +} + +func oracleGenerateRow(r *rand.Rand, id, tsMicros int64) *oracleRow { + row := newOracleRow(id, tsMicros) + + // BOOLEAN/BYTE/SHORT/CHAR are mandatory: no NULL representation, so + // an absent cell would be indistinguishable from a stored zero. + row.set("b", oracleCell{kind: ocBool, b: (id & 1) == 0}) + bv := byte((id & 0x7F)) + if r.Intn(2) == 0 { + bv -= 0x40 + } + row.set("b8", oracleCell{kind: ocByte, i64: int64(int8(bv))}) + row.set("s16", oracleCell{kind: ocShort, i64: oracleMaybeNegateI(r, (id*31)&0x7FFF)}) + row.set("c", oracleCell{kind: ocChar, ch: rune('A' + (id & 0x1F))}) + + if !oracleShouldFuzz(r, oracleColumnSkip) { + row.set("i", oracleCell{kind: ocInt, i64: oracleMaybeNegateI(r, (id*65537)&0x7FFFFFFF)}) + } + if !oracleShouldFuzz(r, oracleColumnSkip) { + row.set("l", oracleCell{kind: ocLong, i64: oracleMaybeNegateI(r, id*1_000_003)}) + } + if !oracleShouldFuzz(r, oracleColumnSkip) { + row.set("f", oracleCell{kind: ocFloat, f64: float64(float32(oracleMaybeNegateF(r, float64(id)*0.125)))}) + } + if !oracleShouldFuzz(r, oracleColumnSkip) { + row.set("d", oracleCell{kind: ocDouble, f64: oracleMaybeNegateF(r, float64(id)*1.5)}) + } + if !oracleShouldFuzz(r, oracleColumnSkip) { + row.set("s", oracleCell{kind: ocString, str: "s_" + strconv.FormatInt(id, 10) + oracleMaybeNonASCII(r)}) + } + if !oracleShouldFuzz(r, oracleColumnSkip) { + row.set("sym", oracleCell{kind: ocSymbol, str: "sym_" + strconv.FormatInt(id&0xF, 10) + oracleMaybeNonASCII(r)}) + } + if !oracleShouldFuzz(r, oracleColumnSkip) { + row.set("u", oracleCell{kind: ocUUID, + uhi: 
id*0x00000000CAFEBABE + 17, + ulo: id*0x00000000DEADBEEF - 13}) + } + if !oracleShouldFuzz(r, oracleColumnSkip) { + row.set("l256", oracleCell{kind: ocLong256, words: [4]int64{ + id*0x11111111 + 1, + id*0x22222222 + 2, + id*0x33333333 + 3, + id*0x44444444 + 4, + }}) + } + if !oracleShouldFuzz(r, oracleColumnSkip) { + row.set("tn", oracleCell{kind: ocTsNano, i64: tsMicros*1_000 + (id & 0x3FF)}) + } + if !oracleShouldFuzz(r, oracleColumnSkip) { + a, sh := oracleArr1d(id, oracleSign(r)) + row.set("da", oracleCell{kind: ocArr, arr: a, shape: sh}) + } + if !oracleShouldFuzz(r, oracleColumnSkip) { + a, sh := oracleArr2d(id, oracleSign(r)) + row.set("da2", oracleCell{kind: ocArr, arr: a, shape: sh}) + } + if !oracleShouldFuzz(r, oracleColumnSkip) { + a, sh := oracleArr3d(id, oracleSign(r)) + row.set("da3", oracleCell{kind: ocArr, arr: a, shape: sh}) + } + // Decimals: non-negative magnitudes inside each declared precision + // (see createTargetTable). dec64 DECIMAL(12,3), dec128 DECIMAL(25,4), + // dec256 DECIMAL(50,6). 
+ if !oracleShouldFuzz(r, oracleColumnSkip) { + row.set("dec64", oracleCell{kind: ocDec64, i64: id*10_000_007 + 13, scale: 3}) + } + if !oracleShouldFuzz(r, oracleColumnSkip) { + row.set("dec128", oracleCell{kind: ocDec128, + dec: u128(uint64(id*40+7), uint64(id*0x00000000DEADBEEF+17)), + scale: 4}) + } + if !oracleShouldFuzz(r, oracleColumnSkip) { + row.set("dec256", oracleCell{kind: ocDec256, + dec: u256(0, + uint64(id*0x123456+31), + uint64(id*0x00000000CAFEBABE+17), + uint64(id*0x00000000DEADBEEF+13)), + scale: 6}) + } + if oracleShouldFuzz(r, oracleNewColumn) { + oracleInjectExtra(r, row, id) + } + return row +} + +func oracleInjectExtra(r *rand.Rand, row *oracleRow, id int64) { + switch r.Intn(19) { + case 0: + row.set("ex_l_0", oracleCell{kind: ocLong, i64: oracleMaybeNegateI(r, id*7)}) + case 1: + row.set("ex_l_1", oracleCell{kind: ocLong, i64: oracleMaybeNegateI(r, id+100)}) + case 2: + row.set("ex_l_2", oracleCell{kind: ocLong, i64: oracleMaybeNegateI(r, id)}) + case 3: + row.set("ex_d_0", oracleCell{kind: ocDouble, f64: oracleMaybeNegateF(r, float64(id)*0.25)}) + case 4: + row.set("ex_d_1", oracleCell{kind: ocDouble, f64: oracleMaybeNegateF(r, float64(id))}) + case 5: + row.set("ex_d_2", oracleCell{kind: ocDouble, f64: oracleMaybeNegateF(r, float64(id)*13.7)}) + case 6: + row.set("ex_s_0", oracleCell{kind: ocString, str: "ex0_" + strconv.FormatInt(id, 10) + oracleMaybeNonASCII(r)}) + case 7: + row.set("ex_s_1", oracleCell{kind: ocString, str: "ex1_" + strconv.FormatInt(id, 10) + oracleMaybeNonASCII(r)}) + case 8: + row.set("ex_sym_0", oracleCell{kind: ocSymbol, str: "exsym0_" + strconv.FormatInt(id&0x7, 10) + oracleMaybeNonASCII(r)}) + case 9: + row.set("ex_sym_1", oracleCell{kind: ocSymbol, str: "exsym1_" + strconv.FormatInt(id&0x3, 10) + oracleMaybeNonASCII(r)}) + case 10: + sign := oracleSign(r) + row.set("ex_da_0", oracleCell{kind: ocArr, + arr: []float64{float64(id) * sign, float64(id+1) * sign, float64(id+2) * sign}, + shape: []int{3}}) + case 11: 
+ scale := r.Intn(16) + row.set("ex_dec64_s"+strconv.Itoa(scale), oracleCell{kind: ocDec64, i64: id*7 + 11, scale: scale}) + case 12: + scale := r.Intn(19) + row.set("ex_dec128_s"+strconv.Itoa(scale), oracleCell{kind: ocDec128, + dec: u128(uint64(id*11+3), uint64(id*0x00000000DEADBEEF+17)), + scale: scale}) + case 13: + scale := r.Intn(31) + row.set("ex_dec256_s"+strconv.Itoa(scale), oracleCell{kind: ocDec256, + dec: u256(uint64(id*0x00000000ABCDEF01+7), + uint64(id*0x123456+31), + uint64(id*0x00000000CAFEBABE+17), + uint64(id*0x00000000DEADBEEF+13)), + scale: scale}) + case 14: + row.set("ex_i_0", oracleCell{kind: ocInt, i64: oracleMaybeNegateI(r, (id*65537)&0x7FFFFFFF)}) + case 15: + row.set("ex_f_0", oracleCell{kind: ocFloat, f64: float64(float32(oracleMaybeNegateF(r, float64(id)*0.0625)))}) + case 16: + row.set("ex_u_0", oracleCell{kind: ocUUID, uhi: id*0x00000000ABCD1234 + 5, ulo: id*0x000000005678FEDC + 11}) + case 17: + row.set("ex_l256_0", oracleCell{kind: ocLong256, words: [4]int64{ + id*0x0F0F0F0F + 1, id*0x1E1E1E1E + 2, id*0x2D2D2D2D + 3, id*0x3C3C3C3C + 4, + }}) + case 18: + row.set("ex_tn_0", oracleCell{kind: ocTsNano, i64: row.tsMicros*1_000 + (id & 0x1FF)}) + } +} + +// --- publish a row through the QWP sender ---------------------------- + +func oraclePublish(t *testing.T, qs QwpSender, ctx context.Context, row *oracleRow) { + t.Helper() + qs.Table(oracleTableName) + // Symbols must precede non-symbol columns (ILP/QWP ordering); map + // iteration order is random, so emit symbols in a first pass. 
+ for name, c := range row.cells { + if c.kind == ocSymbol { + qs.Symbol(name, c.str) + } + } + qs.Int64Column("id", row.id) + for name, c := range row.cells { + switch c.kind { + case ocSymbol: + // already emitted in the symbol pass above + case ocBool: + qs.BoolColumn(name, c.b) + case ocByte: + qs.ByteColumn(name, int8(c.i64)) + case ocShort: + qs.ShortColumn(name, int16(c.i64)) + case ocChar: + qs.CharColumn(name, c.ch) + case ocInt: + qs.Int32Column(name, int32(c.i64)) + case ocLong: + qs.Int64Column(name, c.i64) + case ocFloat: + qs.Float32Column(name, float32(c.f64)) + case ocDouble: + qs.Float64Column(name, c.f64) + case ocString: + qs.StringColumn(name, c.str) + case ocUUID: + qs.UuidColumn(name, uint64(c.uhi), uint64(c.ulo)) + case ocLong256: + v := u256(uint64(c.words[3]), uint64(c.words[2]), uint64(c.words[1]), uint64(c.words[0])) + qs.Long256Column(name, v) + case ocTsNano: + qs.TimestampNanosColumn(name, time.Unix(0, c.i64).UTC()) + case ocDec64: + qs.Decimal64Column(name, NewDecimalFromInt64(c.i64, uint32(c.scale))) + case ocDec128: + d, err := NewDecimal(c.dec, uint32(c.scale)) + if err != nil { + t.Fatalf("NewDecimal(dec128 %s): %v", name, err) + } + qs.Decimal128Column(name, d) + case ocDec256: + d, err := NewDecimal(c.dec, uint32(c.scale)) + if err != nil { + t.Fatalf("NewDecimal(dec256 %s): %v", name, err) + } + qs.Decimal256Column(name, d) + case ocArr: + switch len(c.shape) { + case 1: + qs.Float64Array1DColumn(name, c.arr) + case 2: + qs.Float64Array2DColumn(name, oracleUnflatten2d(c.arr, c.shape)) + case 3: + qs.Float64Array3DColumn(name, oracleUnflatten3d(c.arr, c.shape)) + } + } + } + if err := qs.At(ctx, time.UnixMicro(row.tsMicros).UTC()); err != nil { + t.Fatalf("sender.At(id=%d): %v", row.id, err) + } +} + +func oracleUnflatten2d(flat []float64, shape []int) [][]float64 { + out := make([][]float64, shape[0]) + for i := 0; i < shape[0]; i++ { + out[i] = flat[i*shape[1] : (i+1)*shape[1]] + } + return out +} + +func 
oracleUnflatten3d(flat []float64, shape []int) [][][]float64 { + out := make([][][]float64, shape[0]) + k := 0 + for i := 0; i < shape[0]; i++ { + out[i] = make([][]float64, shape[1]) + for j := 0; j < shape[1]; j++ { + out[i][j] = flat[k : k+shape[2]] + k += shape[2] + } + } + return out +} + +// --- verification: SELECT * ORDER BY ts, id vs the oracle ------------ + +func oracleAssert(t *testing.T, c *QwpQueryClient, table *oracleTable) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second) + defer cancel() + q := c.Query(ctx, "SELECT * FROM "+oracleTableName+" ORDER BY ts, id") + defer q.Close() + + rowIdx := 0 + for batch, err := range q.Batches() { + if err != nil { + t.Fatalf("oracle query: %v", err) + } + // Map column name -> batch column index for this batch. + colIdx := make(map[string]int, batch.ColumnCount()) + for i := 0; i < batch.ColumnCount(); i++ { + colIdx[batch.ColumnName(i)] = i + } + for br := 0; br < batch.RowCount(); br++ { + if rowIdx >= len(table.rows) { + t.Fatalf("more rows returned (%d) than the oracle holds (%d)", + rowIdx+1, len(table.rows)) + } + want := table.rows[rowIdx] + rowIdx++ + + idCi, ok := colIdx["id"] + if !ok { + t.Fatalf("row %d: SELECT * missing mandatory column \"id\"", rowIdx-1) + } + if got := batch.Int64(idCi, br); got != want.id { + t.Fatalf("row %d id: want %d got %d", rowIdx-1, want.id, got) + } + tsCi, ok := colIdx["ts"] + if !ok { + t.Fatalf("id=%d: SELECT * missing mandatory column \"ts\"", want.id) + } + if got := batch.Int64(tsCi, br); got != want.tsMicros { + t.Fatalf("id=%d ts: want %d got %d", want.id, want.tsMicros, got) + } + for name := range table.colNames { + ci, present := colIdx[name] + cell, set := want.cells[name] + if !present { + // Column never created at all: the oracle must + // also never have written it. 
+ if set { + t.Fatalf("id=%d: column %q set in oracle but absent from schema", + want.id, name) + } + continue + } + if !set || cell.kind == ocAbsent { + if !batch.IsNull(ci, br) { + t.Fatalf("id=%d col %q: expected NULL (unset), got non-null", want.id, name) + } + continue + } + if batch.IsNull(ci, br) { + t.Fatalf("id=%d col %q: expected value, got NULL", want.id, name) + } + oracleAssertCell(t, batch, ci, br, name, want.id, cell) + } + } + } + if rowIdx != len(table.rows) { + t.Fatalf("row count: oracle holds %d, query returned %d", len(table.rows), rowIdx) + } +} + +func oracleAssertCell(t *testing.T, b *QwpColumnBatch, ci, br int, name string, id int64, c oracleCell) { + t.Helper() + switch c.kind { + case ocBool: + if got := b.Bool(ci, br); got != c.b { + t.Fatalf("id=%d %s: want %v got %v", id, name, c.b, got) + } + case ocByte: + if got := int64(b.Int8(ci, br)); got != c.i64 { + t.Fatalf("id=%d %s(byte): want %d got %d", id, name, c.i64, got) + } + case ocShort: + if got := int64(b.Int16(ci, br)); got != c.i64 { + t.Fatalf("id=%d %s(short): want %d got %d", id, name, c.i64, got) + } + case ocChar: + if got := b.Char(ci, br); got != c.ch { + t.Fatalf("id=%d %s(char): want %q got %q", id, name, c.ch, got) + } + case ocInt: + if got := int64(b.Int32(ci, br)); got != c.i64 { + t.Fatalf("id=%d %s(int): want %d got %d", id, name, c.i64, got) + } + case ocLong: + if got := b.Int64(ci, br); got != c.i64 { + t.Fatalf("id=%d %s(long): want %d got %d", id, name, c.i64, got) + } + case ocFloat: + if got := float64(b.Float32(ci, br)); got != c.f64 { + t.Fatalf("id=%d %s(float): want %v got %v", id, name, c.f64, got) + } + case ocDouble: + if got := b.Float64(ci, br); got != c.f64 { + t.Fatalf("id=%d %s(double): want %v got %v", id, name, c.f64, got) + } + case ocString, ocSymbol: + if got := b.String(ci, br); got != c.str { + t.Fatalf("id=%d %s(str): want %q got %q", id, name, c.str, got) + } + case ocUUID: + if gh, gl := b.UuidHi(ci, br), b.UuidLo(ci, br); gh != c.uhi 
|| gl != c.ulo { + t.Fatalf("id=%d %s(uuid): want hi=%d lo=%d got hi=%d lo=%d", + id, name, c.uhi, c.ulo, gh, gl) + } + case ocLong256: + for w := 0; w < 4; w++ { + if got := b.Long256Word(ci, br, w); got != c.words[w] { + t.Fatalf("id=%d %s(long256) word%d: want %d got %d", + id, name, w, c.words[w], got) + } + } + case ocTsNano: + if got := b.Int64(ci, br); got != c.i64 { + t.Fatalf("id=%d %s(tsnano): want %d got %d", id, name, c.i64, got) + } + case ocDec64: + if got := b.Int64(ci, br); got != c.i64 { + t.Fatalf("id=%d %s(dec64): want unscaled %d got %d", id, name, c.i64, got) + } + if got := b.DecimalScale(ci); got != c.scale { + t.Fatalf("id=%d %s(dec64) scale: want %d got %d", id, name, c.scale, got) + } + case ocDec128: + got := u128(uint64(b.Decimal128Hi(ci, br)), uint64(b.Decimal128Lo(ci, br))) + if got.Cmp(c.dec) != 0 { + t.Fatalf("id=%d %s(dec128): want %s got %s", id, name, c.dec, got) + } + if gs := b.DecimalScale(ci); gs != c.scale { + t.Fatalf("id=%d %s(dec128) scale: want %d got %d", id, name, c.scale, gs) + } + case ocDec256: + got := u256( + uint64(b.Long256Word(ci, br, 3)), + uint64(b.Long256Word(ci, br, 2)), + uint64(b.Long256Word(ci, br, 1)), + uint64(b.Long256Word(ci, br, 0)), + ) + if got.Cmp(c.dec) != 0 { + t.Fatalf("id=%d %s(dec256): want %s got %s", id, name, c.dec, got) + } + if gs := b.DecimalScale(ci); gs != c.scale { + t.Fatalf("id=%d %s(dec256) scale: want %d got %d", id, name, c.scale, gs) + } + case ocArr: + nd := b.ArrayNDims(ci, br) + if nd != len(c.shape) { + t.Fatalf("id=%d %s(arr) ndims: want %d got %d", id, name, len(c.shape), nd) + } + for d := 0; d < nd; d++ { + if got := b.ArrayDim(ci, br, d); got != c.shape[d] { + t.Fatalf("id=%d %s(arr) dim%d: want %d got %d", id, name, d, c.shape[d], got) + } + } + got := b.Float64Array(ci, br) + if len(got) != len(c.arr) { + t.Fatalf("id=%d %s(arr) len: want %d got %d", id, name, len(c.arr), len(got)) + } + for k := range c.arr { + if got[k] != c.arr[k] { + t.Fatalf("id=%d %s(arr)[%d]: 
want %v got %v", id, name, k, c.arr[k], got[k]) + } + } + } +} + +// --- the test --------------------------------------------------------- + +func oracleNewSender(t *testing.T, srv *qwpFuzzServer) (QwpSender, func()) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + ls, err := LineSenderFromConf(ctx, srv.connConf()) + if err != nil { + t.Fatalf("LineSenderFromConf(%q): %v", srv.connConf(), err) + } + qs, ok := ls.(QwpSender) + if !ok { + t.Fatalf("ws sender is not a QwpSender (%T)", ls) + } + closer := func() { + cctx, ccancel := context.WithTimeout(context.Background(), 30*time.Second) + defer ccancel() + _ = qs.Close(cctx) + } + return qs, closer +} + +// TestQwpFuzzIngressOracleMultiSender pre-generates a typed oracle, +// publishes it from several concurrent producer goroutines (each +// owning a contiguous, globally-unique (ts,id) slice) into a DEDUP +// table, then asserts every cell of every row via a streamed +// SELECT * ORDER BY ts, id. Catches per-type wire-encoding bugs, +// cross-batch misalignment, dedup/dup loss, and concurrency races. +func TestQwpFuzzIngressOracleMultiSender(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + + producerCount := 2 + r.Intn(3) // 2..4 + rowsPerProducer := 250 + r.Intn(350) // 250..599 (bounded for CI) + batchSizes := make([]int, producerCount) + for p := range batchSizes { + batchSizes[p] = 10 + r.Intn(60) // 10..69 + } + totalRows := producerCount * rowsPerProducer + t.Logf("ingress oracle: producers=%d rowsPerProducer=%d total=%d", + producerCount, rowsPerProducer, totalRows) + + // Fresh table each run; DEDUP UPSERT KEYS(ts,id) collapses any + // wire-level replay cleanly onto the pre-generated oracle. 
+ srv.mustExec(t, "DROP TABLE IF EXISTS '"+oracleTableName+"'") + defer srv.mustExec(t, "DROP TABLE IF EXISTS '"+oracleTableName+"'") + srv.mustExec(t, oracleCreateSQL) + + // Pre-generate: each producer owns a contiguous slice; ids and + // timestamps are globally unique and interleaved so ts,id order + // has a single deterministic interpretation. + oracle := newOracleTable() + perProducer := make([][]*oracleRow, producerCount) + var globalIdx int64 + for p := 0; p < producerCount; p++ { + genR := rand.New(rand.NewSource(r.Int63())) + perProducer[p] = make([]*oracleRow, rowsPerProducer) + for i := 0; i < rowsPerProducer; i++ { + id := globalIdx + ts := oracleBaseTsMicros + globalIdx + row := oracleGenerateRow(genR, id, ts) + perProducer[p][i] = row + oracle.addRow(row) + globalIdx++ + } + } + + var wg sync.WaitGroup + errs := make([]error, producerCount) + for p := 0; p < producerCount; p++ { + wg.Add(1) + go func(p int) { + defer wg.Done() + defer func() { + if rec := recover(); rec != nil { + errs[p] = fmt.Errorf("producer %d panicked: %v", p, rec) + } + }() + qs, closeSender := oracleNewSender(t, srv) + defer closeSender() + ctx := context.Background() + rows := perProducer[p] + bs := batchSizes[p] + for i := 0; i < len(rows); i++ { + oraclePublish(t, qs, ctx, rows[i]) + if (i+1)%bs == 0 { + if err := qs.Flush(ctx); err != nil { + errs[p] = fmt.Errorf("producer %d flush@%d: %w", p, i, err) + return + } + } + } + if err := qs.Flush(ctx); err != nil { + errs[p] = fmt.Errorf("producer %d final flush: %w", p, err) + } + }(p) + } + wg.Wait() + for p, e := range errs { + if e != nil { + t.Fatalf("producer %d: %v", p, e) + } + } + + // Wait for the WAL apply job to materialise every (ts,id). 
+ srv.awaitRows(t, oracleTableName, totalRows, 120*time.Second) + + c := newBindFuzzClient(t, srv) // reused query-client helper + oracleAssert(t, c, oracle) +} + +// --- bounce-torture scenario ----------------------------------------- + +// oraclePickSfMaxBytes mirrors Java pickSfMaxBytes: small segments force +// frequent rotation (stresses purge bookkeeping), large segments resemble +// the production default. The chosen value also scales the post-close +// slot-purge bound. +func oraclePickSfMaxBytes(r *rand.Rand) int64 { + pool := []int64{256 * 1024, 1024 * 1024, 4 * 1024 * 1024} + return pool[r.Intn(len(pool))] +} + +// oracleSfDirSize sums every file under dir. The Go SF slot lives at +// //...; Java asserts /default. Summing the +// whole tree is faithful to the intent (slot purged after clean close) +// and robust to the exact nesting. Walk errors are returned so callers +// fail fast — silently returning 0 would let "sz > capBytes" pass +// vacuously when the directory was unreadable. +func oracleSfDirSize(dir string) (int64, error) { + var total int64 + err := filepath.Walk(dir, func(_ string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if info.IsDir() { + return nil + } + total += info.Size() + return nil + }) + return total, err +} + +// oracleSenderFromConf builds a QwpSender from a hand-assembled connect +// string (sf_dir / reconnect / auto_flush tuning the shared +// oracleNewSender does not expose). The closer's ctx outlasts +// close_flush_timeout_millis=120000 so a clean drain across an +// in-flight bounce can complete. 
+func oracleSenderFromConf(t *testing.T, conf string) (QwpSender, func()) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + ls, err := LineSenderFromConf(ctx, conf) + if err != nil { + t.Fatalf("LineSenderFromConf(%q): %v", conf, err) + } + qs, ok := ls.(QwpSender) + if !ok { + t.Fatalf("ws sender is not a QwpSender (%T)", ls) + } + closer := func() { + cctx, ccancel := context.WithTimeout(context.Background(), 150*time.Second) + defer ccancel() + _ = qs.Close(cctx) + } + return qs, closer +} + +// TestQwpFuzzIngressOracleMultiSenderBounce is the bounce-torture port of +// QwpIngressOracleFuzzTest.testOracleMultiSenderTortureUnderServerBounces: +// concurrent sf_dir-backed producers publish the pre-generated typed +// oracle while a bouncer SIGTERMs and restarts the server several times +// on the same port/dataDir. The Go SF send loop owns reconnect + replay +// from the last ACKed FSN, and DEDUP UPSERT KEYS(ts,id) collapses any +// wire-level replay, so the final table must match the oracle exactly +// with zero loss across every server outage. +// +// Faithful-port divergences (cf. the file header and the egress/bounds +// ports' headers): +// +// - Requires a fixture-LAUNCHED server (JDK+jar). In QDB_FUZZ_ADDR mode +// the fixture does not own the process and cannot bounce it, so the +// test skips — the non-bounce TestQwpFuzzIngressOracleMultiSender +// still covers the correctness property against any server. +// - The server down-interval is the fixture bounce()'s SIGTERM + fixed +// ~500ms gap + JVM reboot, not Java's 40-100ms in-process stop/start +// (a network-launched JVM cannot restart that fast). The property +// under test — reconnect + gap-free replay across a real outage on a +// stable port — is unchanged; a randomized post-bounce settle keeps +// producers spanning multiple up/down windows. 
+// - Row counts bounded smaller than the Java suite for CI time while +// still crossing batch boundaries and outliving multiple bounces. +// Decimals are non-negative (see the file header). +// - Reproducible via QWP_FUZZ_SEED (shared newFuzzRand). +func TestQwpFuzzIngressOracleMultiSenderBounce(t *testing.T) { + srv := fuzzServer(t) + if !srv.owns { + t.Skip("bounce-torture needs a fixture-launched server; " + + "QDB_FUZZ_ADDR mode cannot restart the process") + } + r := newFuzzRand(t) + + producerCount := 2 + r.Intn(3) // 2..4 + rowsPerProducer := 300 + r.Intn(400) // 300..699 (CI-bounded) + bounces := 2 + r.Intn(3) // 2..4 + sfMaxBytes := oraclePickSfMaxBytes(r) + batchSizes := make([]int, producerCount) + autoFlush := make([]int, producerCount) + for p := 0; p < producerCount; p++ { + batchSizes[p] = 10 + r.Intn(80) // 10..89 + autoFlush[p] = 50 + r.Intn(200) // 50..249 + } + bRnd := rand.New(rand.NewSource(r.Int63())) + totalRows := producerCount * rowsPerProducer + t.Logf("ingress oracle bounce: producers=%d rows/producer=%d total=%d bounces=%d sf_max_bytes=%d", + producerCount, rowsPerProducer, totalRows, bounces, sfMaxBytes) + + srv.mustExec(t, "DROP TABLE IF EXISTS '"+oracleTableName+"'") + defer srv.mustExec(t, "DROP TABLE IF EXISTS '"+oracleTableName+"'") + srv.mustExec(t, oracleCreateSQL) + + // Pre-generate: each producer owns a contiguous slice; ids and + // timestamps are globally unique so ts,id order is deterministic and + // every wire-level replay collapses cleanly under DEDUP. 
+ oracle := newOracleTable() + perProducer := make([][]*oracleRow, producerCount) + var globalIdx int64 + for p := 0; p < producerCount; p++ { + genR := rand.New(rand.NewSource(r.Int63())) + perProducer[p] = make([]*oracleRow, rowsPerProducer) + for i := 0; i < rowsPerProducer; i++ { + id := globalIdx + ts := oracleBaseTsMicros + globalIdx + row := oracleGenerateRow(genR, id, ts) + perProducer[p][i] = row + oracle.addRow(row) + globalIdx++ + } + } + + sfRoot := t.TempDir() + sfDirs := make([]string, producerCount) + for p := 0; p < producerCount; p++ { + sfDirs[p] = filepath.Join(sfRoot, fmt.Sprintf("p%d", p)) + if err := os.MkdirAll(sfDirs[p], 0o755); err != nil { + t.Fatalf("mkdir sf_dir: %v", err) + } + } + + var wg sync.WaitGroup + errs := make([]error, producerCount) + for p := 0; p < producerCount; p++ { + wg.Add(1) + go func(p int) { + defer wg.Done() + defer func() { + if rec := recover(); rec != nil { + errs[p] = fmt.Errorf("producer %d panicked: %v", p, rec) + } + }() + conf := fmt.Sprintf( + "ws::addr=%s;sf_dir=%s;initial_connect_retry=async;"+ + "reconnect_max_duration_millis=120000;"+ + "close_flush_timeout_millis=120000;"+ + "sf_max_bytes=%d;auto_flush_rows=%d;", + srv.wsAddr(), sfDirs[p], sfMaxBytes, autoFlush[p]) + qs, closeSender := oracleSenderFromConf(t, conf) + defer closeSender() + ctx := context.Background() + rows := perProducer[p] + bs := batchSizes[p] + written := 0 + for written < len(rows) { + end := min(written+bs, len(rows)) + for i := written; i < end; i++ { + oraclePublish(t, qs, ctx, rows[i]) + } + if err := qs.Flush(ctx); err != nil { + errs[p] = fmt.Errorf("producer %d flush@%d: %w", p, written, err) + return + } + written = end + time.Sleep(time.Millisecond) // mirror Java Os.sleep(1) + } + }(p) + } + + bouncerDone := make(chan struct{}) + var bounceErr error + go func() { + defer close(bouncerDone) + time.Sleep(150 * time.Millisecond) // let producers warm up + for i := 0; i < bounces; i++ { + t.Logf("oracle bounce %d/%d", i+1, 
bounces) + if err := srv.bounce(); err != nil { + bounceErr = fmt.Errorf("bounce %d/%d: %w", i+1, bounces, err) + return + } + time.Sleep(time.Duration(150+bRnd.Intn(250)) * time.Millisecond) + } + }() + + // Match the Java ordering: join the bouncer, then the producers. + // Always drain producers before any t.Fatalf so no goroutine + // touches t after the test function returns. + <-bouncerDone + wg.Wait() + if bounceErr != nil { + t.Fatalf("%v", bounceErr) + } + for p, e := range errs { + if e != nil { + t.Fatalf("producer %d: %v", p, e) + } + } + + srv.awaitRows(t, oracleTableName, totalRows, 120*time.Second) + + c := newBindFuzzClient(t, srv) + oracleAssert(t, c, oracle) + + // Clean close ACKed every frame; the SF cursor unlinks rotated + // segments. A small residue (lock, ack-watermark, active header) is + // normal — Java's slotCapFor is sf_max_bytes + 256 KiB. + capBytes := sfMaxBytes + 256*1024 + for p, dir := range sfDirs { + sz, err := oracleSfDirSize(dir) + if err != nil { + t.Fatalf("producer %d sf_dir %q: walk failed: %v", p, dir, err) + } + if sz > capBytes { + t.Fatalf("producer %d sf_dir %q not purged after clean close: %d bytes (cap %d)", + p, dir, sz, capBytes) + } + } +} + +// --- poison-rows / per-frame-drop scenario --------------------------- + +// TestQwpFuzzIngressOraclePoisonErrorHandler ports +// QwpIngressOracleFuzzTest.testOraclePoisonRowsTriggerErrorHandler. It +// pins the per-batch error contract: +// +// 1. the async error handler fires for every poisoned chunk; +// 2. rows from clean chunks land exactly per the oracle; +// 3. no row from a poisoned chunk leaks — the WHOLE frame is dropped, +// including the well-formed rows next to the bad one (SF drops per +// frame, not per row). +// +// A poisoned chunk carries one row whose dec256 unscaled value is 2^192 +// (~6.3e57, 58 digits) — well past DECIMAL(50,6)'s 10^50 cap. 
The +// server returns CategoryWriteError, whose spec-default policy is +// DROP_AND_CONTINUE (qwp_sf_classify.go), so the producer keeps going +// and the rejection surfaces only via the async handler. No server +// bounce on purpose — the failure mode must be unambiguously the +// per-frame rejection, not a transport blip. +// +// Faithful-port divergences (cf. the file header and the bounce port): +// +// - The sender is built with NewLineSender(...) options rather than a +// connect string: Go has no conf+option combiner and WithErrorHandler +// is option-only. The options are 1:1 with the Java connect string +// (sf_dir, initial_connect_retry=true→sync, close_flush_timeout, +// error_inbox_capacity) plus the error handler. +// - reconnect_max_duration_millis is omitted (no outage in this +// scenario; the default budget is irrelevant). +// - Clean rows verified via the QWP query client (oracleAssert); +// poisoned-id absence via the fixture /exec count (mirrors Java's +// assertSql). Counts are CI-bounded; chunk size stays small enough +// to map to a single frame so the per-frame drop is deterministic. +// - errCalls >= poisoned-chunk count (inequality, like Java: tolerates +// the rare chunk that splits across more than one frame); upper +// bound 3x catches "handler fires N times per chunk" regressions +// the Java port doesn't guard. +// - Goes beyond the Java port: also captures one delivered +// *SenderError and asserts Category == CategoryWriteError and +// AppliedPolicy == PolicyDropAndContinue, so a misclassification +// (wrong status byte → wrong category) or a policy-resolution +// regression cannot pass silently behind the call-count alone. +// - Reproducible via QWP_FUZZ_SEED (shared newFuzzRand). 
+func TestQwpFuzzIngressOraclePoisonErrorHandler(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + + producerCount := 2 + r.Intn(2) // 2..3 + chunksPerProducer := 30 + r.Intn(30) // 30..59 + chunkSize := 5 + r.Intn(6) // 5..10 rows (maps to one frame) + const poisonChunkInN = 4 // ~25% of chunks poisoned + sfMaxBytes := oraclePickSfMaxBytes(r) // shared with the bounce port + + // Constructible client-side? 2^192 is 58 digits — inside Decimal256's + // 76-digit envelope, so NewDecimal accepts it and the rejection is + // purely server-side (the whole point of the poison). + if _, e := NewDecimal(u256(1, 0, 0, 0), 6); e != nil { + t.Fatalf("poison value 2^192 not constructible client-side: %v", e) + } + + oracle := newOracleTable() + perProducerChunks := make([][][]*oracleRow, producerCount) + var poisonedIDs []string + totalPoisonedChunks := 0 + var globalIdx int64 + for p := 0; p < producerCount; p++ { + genR := rand.New(rand.NewSource(r.Int63())) + poisonR := rand.New(rand.NewSource(r.Int63())) + perProducerChunks[p] = make([][]*oracleRow, chunksPerProducer) + for c := 0; c < chunksPerProducer; c++ { + poisoned := poisonR.Intn(poisonChunkInN) == 0 + if poisoned { + totalPoisonedChunks++ + } + chunk := make([]*oracleRow, chunkSize) + for rr := 0; rr < chunkSize; rr++ { + id := globalIdx + ts := oracleBaseTsMicros + globalIdx + row := oracleGenerateRow(genR, id, ts) + if poisoned { + // Force dec256 past the column cap. setSignedDecimal + // is unconditional in Java; overwrite whatever + // generateRow produced (skipped or not). 
+ row.set("dec256", oracleCell{kind: ocDec256, dec: u256(1, 0, 0, 0), scale: 6}) + poisonedIDs = append(poisonedIDs, strconv.FormatInt(id, 10)) + } else { + oracle.addRow(row) + } + chunk[rr] = row + globalIdx++ + } + perProducerChunks[p][c] = chunk + } + } + cleanRows := len(oracle.rows) + t.Logf("ingress oracle poison: producers=%d chunks/producer=%d chunkSize=%d "+ + "poisonedChunks=%d cleanRows=%d sf_max_bytes=%d", + producerCount, chunksPerProducer, chunkSize, totalPoisonedChunks, cleanRows, sfMaxBytes) + + srv.mustExec(t, "DROP TABLE IF EXISTS '"+oracleTableName+"'") + defer srv.mustExec(t, "DROP TABLE IF EXISTS '"+oracleTableName+"'") + srv.mustExec(t, oracleCreateSQL) + + sfRoot := t.TempDir() + sfDirs := make([]string, producerCount) + for p := 0; p < producerCount; p++ { + sfDirs[p] = filepath.Join(sfRoot, fmt.Sprintf("p%d", p)) + if err := os.MkdirAll(sfDirs[p], 0o755); err != nil { + t.Fatalf("mkdir sf_dir: %v", err) + } + } + + var errCalls atomic.Int64 // shared across every producer's handler + // Capture one delivered *SenderError so we can assert Category and + // AppliedPolicy. Without this the call-count alone would let a + // regression that misclassifies the dec256 poison (or resolves the + // policy to HALT) sneak past as long as *some* error fires. 
+ var ( + firstErrMu sync.Mutex + firstErr *SenderError + ) + var wg sync.WaitGroup + errs := make([]error, producerCount) + for p := 0; p < producerCount; p++ { + wg.Add(1) + go func(p int) { + defer wg.Done() + defer func() { + if rec := recover(); rec != nil { + errs[p] = fmt.Errorf("producer %d panicked: %v", p, rec) + } + }() + ctx := context.Background() + ls, err := NewLineSender(ctx, + WithQwp(), + WithAddress(srv.wsAddr()), + WithSfDir(sfDirs[p]), + WithSfMaxBytes(sfMaxBytes), + WithInitialConnectRetry(true), // initial_connect_retry=true (sync) + WithCloseFlushTimeout(120*time.Second), + WithErrorInboxCapacity(4096), + WithErrorHandler(func(e *SenderError) { + errCalls.Add(1) + if e == nil { + return + } + firstErrMu.Lock() + if firstErr == nil { + firstErr = e + } + firstErrMu.Unlock() + }), + ) + if err != nil { + errs[p] = fmt.Errorf("producer %d NewLineSender: %w", p, err) + return + } + qs, ok := ls.(QwpSender) + if !ok { + errs[p] = fmt.Errorf("producer %d: ws sender is not a QwpSender (%T)", p, ls) + _ = ls.Close(ctx) + return + } + defer func() { + cctx, ccancel := context.WithTimeout(context.Background(), 150*time.Second) + defer ccancel() + _ = qs.Close(cctx) + }() + for c := 0; c < len(perProducerChunks[p]); c++ { + for _, row := range perProducerChunks[p][c] { + oraclePublish(t, qs, ctx, row) + } + // Explicit flush per chunk -> chunk == frame, so the + // per-frame drop is deterministic. DROP_AND_CONTINUE means + // Flush does NOT error on a poisoned chunk (no HALT latch). + if err := qs.Flush(ctx); err != nil { + errs[p] = fmt.Errorf("producer %d flush chunk %d: %w", p, c, err) + return + } + } + }(p) + } + wg.Wait() + for p, e := range errs { + if e != nil { + t.Fatalf("producer %d: %v", p, e) + } + } + + // Poisoned frames are dropped, so the table converges to exactly the + // clean-row count (globally-unique ts,id + DEDUP -> no dup inflation). 
+ srv.awaitRows(t, oracleTableName, cleanRows, 120*time.Second) + + // (a) Clean rows: every clean-chunk row lands once; oracle drives a + // typed cell-by-cell check (and asserts the row count is exact). + c := newBindFuzzClient(t, srv) + oracleAssert(t, c, oracle) + + // (b) Poisoned rows: not a single id from any poisoned chunk leaked + // -- this pins the per-frame drop (good rows in a bad frame are gone + // too). + if len(poisonedIDs) > 0 { + res, err := srv.execSQL("SELECT count() FROM '" + oracleTableName + + "' WHERE id IN (" + strings.Join(poisonedIDs, ",") + ")") + if err != nil { + t.Fatalf("poisoned-id count query: %v", err) + } + if len(res.Dataset) != 1 || len(res.Dataset[0]) != 1 { + t.Fatalf("poisoned-id count: unexpected shape %v", res.Dataset) + } + if n, ok := toInt64(res.Dataset[0][0]); !ok || n != 0 { + t.Fatalf("poisoned rows leaked: %d ids from poisoned chunks present "+ + "(expected 0) -- per-frame drop violated", n) + } + } + + // (c) Async notifications: at least one per poisoned chunk reached a + // handler. Lower-bound inequality tolerates a chunk split across + // >1 frame; upper bound (3x) catches a regression that fires the + // handler many times per rejection (e.g. one per row in the frame + // instead of one per frame). + got := errCalls.Load() + if got < int64(totalPoisonedChunks) { + t.Fatalf("error handler fired %d times, expected >= %d (poisoned chunks)", + got, totalPoisonedChunks) + } + if upper := int64(3 * totalPoisonedChunks); totalPoisonedChunks > 0 && got > upper { + t.Fatalf("error handler fired %d times, expected <= %d (3x poisoned chunks)", + got, upper) + } + // Inspect at least one delivered payload: misclassifying the + // dec256 overflow into a non-WriteError category, or resolving + // its policy to anything other than DROP_AND_CONTINUE, must + // fail the test even though the call count alone would still + // match. 
(A HALT resolution would also surface as a Flush error + // above, but we assert the policy here explicitly so the + // contract is self-documenting.) + if totalPoisonedChunks > 0 { + firstErrMu.Lock() + se := firstErr + firstErrMu.Unlock() + if se == nil { + t.Fatalf("error handler fired %d times but no *SenderError captured", got) + } + if se.Category != CategoryWriteError { + t.Fatalf("error handler: wrong category: got %s (status=0x%02X), "+ + "expected WRITE_ERROR; msg=%q", + se.Category, byte(se.ServerStatusByte), se.ServerMessage) + } + if se.AppliedPolicy != PolicyDropAndContinue { + t.Fatalf("error handler: wrong policy: got %s, expected DROP_AND_CONTINUE", + se.AppliedPolicy) + } + } + t.Logf("poison: poisonedChunks=%d handlerCalls=%d", totalPoisonedChunks, got) + + // Clean close ACKed/handled every frame; the SF cursor unlinks + // rotated segments. Java's slotCapFor: sf_max_bytes + 256 KiB. + capBytes := sfMaxBytes + 256*1024 + for p, dir := range sfDirs { + sz, err := oracleSfDirSize(dir) + if err != nil { + t.Fatalf("producer %d sf_dir %q: walk failed: %v", p, dir, err) + } + if sz > capBytes { + t.Fatalf("producer %d sf_dir %q not purged after clean close: %d bytes (cap %d)", + p, dir, sz, capBytes) + } + } +} + +// --- restart-replay scenario ----------------------------------------- + +// TestQwpFuzzIngressOracleSenderRestartReplay ports +// QwpIngressOracleFuzzTest.testOracleSenderRestartReplaysAcrossBounces. +// Each producer opens-and-closes a fresh sender repeatedly with +// close_flush_timeout_millis=0 so unacked frames stay on disk in the +// per-producer sf_dir. The next sender on the same slot adopts those +// frames and replays them (SF on-disk format is shared with the Java +// client, so the slot-recovery contract is the same). A bouncer +// interleaves a couple of server restarts. Each producer finishes with +// one drain-pass sender on default close_flush_timeout to ensure all +// residual frames have ACKed before the oracle check. 
//
// Final state must match the oracle exactly — the property under test
// is "no row loss across sender close/reopen + server bounce, only
// dedup-collapsed wire-level replays."
//
// Faithful-port divergences (cf. file header + bounce / poison ports):
//
//   - Uses fixture bounce() for the bouncer; same SIGTERM + ~500ms +
//     JVM-reboot interval as the bounce-torture port. Needs
//     fixture-launched mode (skips !owns).
//   - The drain pass sets close_flush_timeout_millis=120000 explicitly
//     (Java uses the default; equivalent intent — give the final pass
//     time to ACK every residual frame).
//   - Counts are CI-bounded; decimals non-negative.
//   - Reproducible via QWP_FUZZ_SEED.
//
// Flow of the function body, in order:
//
//  1. Draw every fuzz parameter and per-producer seed from r before
//     any goroutine starts.
//  2. Recreate the oracle table and pre-generate all rows, registering
//     each one with the in-memory oracle.
//  3. Start one goroutine per producer; each repeatedly opens a sender
//     on its own sf_dir, publishes a slice of its rows, optionally
//     flushes, and closes with close_flush_timeout_millis=0.
//  4. Concurrently run a bouncer goroutine that calls srv.bounce()
//     `bounces` times.
//  5. After both finish, wait for the table to reach totalRows, run
//     the oracle comparison, and bound each sf_dir's size.
func TestQwpFuzzIngressOracleSenderRestartReplay(t *testing.T) {
	srv := fuzzServer(t)
	if !srv.owns {
		t.Skip("restart-replay needs a fixture-launched server " +
			"(QDB_FUZZ_ADDR mode cannot bounce the process)")
	}
	r := newFuzzRand(t)

	// All draws from r happen here, on the test goroutine, in a fixed
	// order; producers and the bouncer only use RNGs derived from the
	// seeds captured below.
	producerCount := 2 + r.Intn(2)       // 2..3
	rowsPerProducer := 300 + r.Intn(400) // 300..699 (CI-bounded)
	bounces := 1 + r.Intn(2)             // 1..2
	sfMaxBytes := oraclePickSfMaxBytes(r)
	lifetimeSeeds := make([]int64, producerCount)
	for p := 0; p < producerCount; p++ {
		lifetimeSeeds[p] = r.Int63()
	}
	// Dedicated RNG for the bouncer's inter-bounce sleep jitter.
	bRnd := rand.New(rand.NewSource(r.Int63()))
	totalRows := producerCount * rowsPerProducer
	t.Logf("ingress oracle restart-replay: producers=%d rows/producer=%d total=%d bounces=%d sf_max_bytes=%d",
		producerCount, rowsPerProducer, totalRows, bounces, sfMaxBytes)

	// Start from an empty table; the deferred drop runs when this
	// function returns.
	srv.mustExec(t, "DROP TABLE IF EXISTS '"+oracleTableName+"'")
	defer srv.mustExec(t, "DROP TABLE IF EXISTS '"+oracleTableName+"'")
	srv.mustExec(t, oracleCreateSQL)

	// Pre-generate every row. globalIdx runs across all producers, so
	// id and ts are unique over the whole test, not just per producer.
	oracle := newOracleTable()
	perProducer := make([][]*oracleRow, producerCount)
	var globalIdx int64
	for p := 0; p < producerCount; p++ {
		genR := rand.New(rand.NewSource(r.Int63()))
		perProducer[p] = make([]*oracleRow, rowsPerProducer)
		for i := 0; i < rowsPerProducer; i++ {
			id := globalIdx
			ts := oracleBaseTsMicros + globalIdx
			row := oracleGenerateRow(genR, id, ts)
			perProducer[p][i] = row
			oracle.addRow(row)
			globalIdx++
		}
	}

	// One sf_dir per producer under a test-scoped temp root.
	sfRoot := t.TempDir()
	sfDirs := make([]string, producerCount)
	for p := 0; p < producerCount; p++ {
		sfDirs[p] = filepath.Join(sfRoot, fmt.Sprintf("p%d", p))
		if err := os.MkdirAll(sfDirs[p], 0o755); err != nil {
			t.Fatalf("mkdir sf_dir: %v", err)
		}
	}

	// openSender builds a sender from a conf string (15s bound on the
	// constructor call) and requires it to implement QwpSender; a
	// sender of any other type is closed and reported as an error.
	openSender := func(p int, conf string) (QwpSender, error) {
		ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
		defer cancel()
		ls, err := LineSenderFromConf(ctx, conf)
		if err != nil {
			return nil, fmt.Errorf("producer %d open: %w", p, err)
		}
		qs, ok := ls.(QwpSender)
		if !ok {
			_ = ls.Close(ctx)
			return nil, fmt.Errorf("producer %d: not a QwpSender (%T)", p, ls)
		}
		return qs, nil
	}
	// closeSender closes with a 150s context bound and deliberately
	// discards the Close error.
	closeSender := func(qs QwpSender) {
		cctx, ccancel := context.WithTimeout(context.Background(), 150*time.Second)
		defer ccancel()
		_ = qs.Close(cctx)
	}

	var wg sync.WaitGroup
	// errs[p] is written only by producer p's goroutine and read only
	// after wg.Wait(), so no extra locking is used.
	errs := make([]error, producerCount)
	for p := 0; p < producerCount; p++ {
		wg.Add(1)
		go func(p int) {
			defer wg.Done()
			// Convert a producer panic into a recorded error so the
			// test goroutine reports it after wg.Wait().
			defer func() {
				if rec := recover(); rec != nil {
					errs[p] = fmt.Errorf("producer %d panicked: %v", p, rec)
				}
			}()
			lifeR := rand.New(rand.NewSource(lifetimeSeeds[p]))
			ctx := context.Background()
			rows := perProducer[p]
			loopConf := fmt.Sprintf(
				"ws::addr=%s;sf_dir=%s;initial_connect_retry=async;"+
					"reconnect_max_duration_millis=120000;"+
					"sf_max_bytes=%d;close_flush_timeout_millis=0;",
				srv.wsAddr(), sfDirs[p], sfMaxBytes)
			written := 0
			for written < len(rows) {
				chunk := 30 + lifeR.Intn(200) // 30..229 rows per sender
				end := min(written+chunk, len(rows))
				qs, err := openSender(p, loopConf)
				if err != nil {
					errs[p] = err
					return
				}
				// NOTE(review): oraclePublish receives t and runs on this
				// producer goroutine; if it calls t.Fatalf that happens
				// off the test goroutine — confirm against its definition.
				for i := written; i < end; i++ {
					oraclePublish(t, qs, ctx, rows[i])
				}
				// Roughly half of the sender lifetimes flush explicitly
				// before closing; the rest go straight to Close.
				if lifeR.Intn(2) == 0 {
					if err := qs.Flush(ctx); err != nil {
						errs[p] = fmt.Errorf("producer %d flush: %w", p, err)
						closeSender(qs)
						return
					}
				}
				// Close with timeout=0 -> abandon any unacked frames to
				// disk for the next sender on the same slot to adopt
				// and replay.
				closeSender(qs)
				written = end
			}
			// Final drain pass: open one more sender with a generous
			// close_flush_timeout so residual frames replay + ACK
			// before the oracle check.
			drainConf := fmt.Sprintf(
				"ws::addr=%s;sf_dir=%s;initial_connect_retry=async;"+
					"reconnect_max_duration_millis=120000;"+
					"sf_max_bytes=%d;close_flush_timeout_millis=120000;",
				srv.wsAddr(), sfDirs[p], sfMaxBytes)
			qs, err := openSender(p, drainConf)
			if err != nil {
				errs[p] = err
				return
			}
			if err := qs.Flush(ctx); err != nil {
				errs[p] = fmt.Errorf("producer %d drain flush: %w", p, err)
				closeSender(qs)
				return
			}
			closeSender(qs)
		}(p)
	}

	// The bouncer runs alongside the producers. bounceErr is written
	// only by this goroutine and read only after bouncerDone is closed.
	bouncerDone := make(chan struct{})
	var bounceErr error
	go func() {
		defer close(bouncerDone)
		// Initial delay before the first bounce.
		time.Sleep(200 * time.Millisecond)
		for i := 0; i < bounces; i++ {
			t.Logf("restart-replay bounce %d/%d", i+1, bounces)
			if err := srv.bounce(); err != nil {
				bounceErr = fmt.Errorf("bounce %d/%d: %w", i+1, bounces, err)
				return
			}
			// 300..699ms of jitter between bounces.
			time.Sleep(time.Duration(300+bRnd.Intn(400)) * time.Millisecond)
		}
	}()

	// Wait for the bouncer first, then for every producer.
	<-bouncerDone
	wg.Wait()
	if bounceErr != nil {
		t.Fatalf("%v", bounceErr)
	}
	for p, e := range errs {
		if e != nil {
			t.Fatalf("producer %d: %v", p, e)
		}
	}

	srv.awaitRows(t, oracleTableName, totalRows, 180*time.Second)

	c := newBindFuzzClient(t, srv)
	oracleAssert(t, c, oracle)

	// Each sf_dir must be no larger than sf_max_bytes plus 256 KiB.
	capBytes := sfMaxBytes + 256*1024
	for p, dir := range sfDirs {
		sz, err := oracleSfDirSize(dir)
		if err != nil {
			t.Fatalf("producer %d sf_dir %q: walk failed: %v", p, dir, err)
		}
		if sz > capBytes {
			t.Fatalf("producer %d sf_dir %q not purged after clean close: %d bytes (cap %d)",
				p, dir, sz, capBytes)
		}
	}
}

// --- async-connect-queues-before-server-starts scenario --------------

//
// TestQwpFuzzIngressOracleAsyncConnectQueues ports
// QwpIngressOracleFuzzTest.testOracleAsyncConnectQueuesBeforeServerStarts.
// The offline-first contract of initial_connect_retry=async: the
// sender constructor must return promptly even when nothing is
// listening, the producer thread keeps writing immediately, frames
// accumulate in sf_dir while the I/O thread retries connect in the
// background. Once the server is brought up, the queued frames drain.
// Final cell-by-cell oracle check confirms no loss across the
// offline -> online transition.
//
// Shape: pause the fixture so its port is closed; producers open
// async, publish everything, then assert
// QwpSender.TotalReconnectAttempts() >= 2 — the I/O thread bumps
// that counter via the OnAttempt callback inside connectWithBackoff
// before each dial, so >=2 proves the first dial completed
// (ECONNREFUSED) and the backoff loop kicked off a second. Only
// after that per-producer assertion does the producer signal
// "enqueued". A starter goroutine waits for the signal and calls
// start(); senders' close blocks on close_flush_timeout to drain.
// A regression where ASYNC silently degraded to "no retry until
// close" would publish + signal fine but fail the counter
// assertion before any port reopens.
//
// Faithful-port divergences (cf. file header + bounce / restart-replay
// / poison ports):
//
//   - Needs the new fixture pause()/start() pair (skips !owns). The
//     test always leaves the server up via t.Cleanup(start) regardless
//     of outcome — start() is idempotent.
//   - Constructor latency assertion: the check below enforces <500ms
//     for async mode. NOTE(review): this bullet previously read
//     "<2s for async mode (same as Java)" — confirm which bound is
//     intended and align the code or this comment accordingly.
//   - Counts are CI-bounded; decimals non-negative; reproducible via
//     QWP_FUZZ_SEED.
func TestQwpFuzzIngressOracleAsyncConnectQueues(t *testing.T) {
	srv := fuzzServer(t)
	if !srv.owns {
		t.Skip("async-connect needs a fixture-launched server " +
			"(QDB_FUZZ_ADDR mode cannot pause/resume the process)")
	}
	// Always restore the server to a running state — start() is
	// idempotent so this is safe regardless of test outcome.
	// The table drop is only attempted once the server is back up.
	t.Cleanup(func() {
		if err := srv.start(); err != nil {
			t.Logf("cleanup: failed to restart server: %v", err)
			return
		}
		if _, err := srv.execSQL("DROP TABLE IF EXISTS '" + oracleTableName + "'"); err != nil {
			t.Logf("cleanup: drop table: %v", err)
		}
	})

	r := newFuzzRand(t)
	producerCount := 2 + r.Intn(2)       // 2..3
	rowsPerProducer := 250 + r.Intn(400) // 250..649 (CI-bounded)
	sfMaxBytes := oraclePickSfMaxBytes(r)
	totalRows := producerCount * rowsPerProducer
	t.Logf("ingress oracle async-connect: producers=%d rows/producer=%d total=%d sf_max_bytes=%d",
		producerCount, rowsPerProducer, totalRows, sfMaxBytes)

	// The table is created while the server is still reachable.
	srv.mustExec(t, "DROP TABLE IF EXISTS '"+oracleTableName+"'")
	srv.mustExec(t, oracleCreateSQL)

	// Pre-generate the oracle BEFORE pausing the server.
	// globalIdx runs across all producers, so id and ts are unique
	// over the whole test.
	oracle := newOracleTable()
	perProducer := make([][]*oracleRow, producerCount)
	var globalIdx int64
	for p := 0; p < producerCount; p++ {
		genR := rand.New(rand.NewSource(r.Int63()))
		perProducer[p] = make([]*oracleRow, rowsPerProducer)
		for i := 0; i < rowsPerProducer; i++ {
			id := globalIdx
			ts := oracleBaseTsMicros + globalIdx
			row := oracleGenerateRow(genR, id, ts)
			perProducer[p][i] = row
			oracle.addRow(row)
			globalIdx++
		}
	}

	// One sf_dir per producer under a test-scoped temp root.
	sfRoot := t.TempDir()
	sfDirs := make([]string, producerCount)
	for p := 0; p < producerCount; p++ {
		sfDirs[p] = filepath.Join(sfRoot, fmt.Sprintf("p%d", p))
		if err := os.MkdirAll(sfDirs[p], 0o755); err != nil {
			t.Fatalf("mkdir sf_dir: %v", err)
		}
	}

	// Bring the server down. From here until the starter goroutine
	// calls srv.start(), the wsAddr port is closed.
	srv.pause()

	var wg sync.WaitGroup
	// errs[p] is written only by producer p's goroutine and read only
	// after wg.Wait().
	errs := make([]error, producerCount)
	// Buffered to producerCount so a producer's single send never
	// blocks, even if the starter has already given up.
	// NOTE(review): a producer that panics (recover branch below) does
	// not send on this channel; the starter then waits out its 60s
	// timer and returns without calling srv.start().
	allEnqueued := make(chan struct{}, producerCount)

	for p := 0; p < producerCount; p++ {
		wg.Add(1)
		go func(p int) {
			defer wg.Done()
			defer func() {
				if rec := recover(); rec != nil {
					errs[p] = fmt.Errorf("producer %d panicked: %v", p, rec)
				}
			}()

			conf := fmt.Sprintf(
				"ws::addr=%s;sf_dir=%s;initial_connect_retry=async;"+
					"reconnect_max_duration_millis=120000;"+
					"reconnect_initial_backoff_millis=20;"+
					"reconnect_max_backoff_millis=200;"+
					"sf_max_bytes=%d;"+
					"close_flush_timeout_millis=120000;",
				srv.wsAddr(), sfDirs[p], sfMaxBytes)

			// Time the constructor: async mode must return promptly
			// even when no server listens — the whole point.
			openCtx, openCancel := context.WithTimeout(context.Background(), 15*time.Second)
			t0 := time.Now()
			ls, err := LineSenderFromConf(openCtx, conf)
			openCancel()
			ctorElapsed := time.Since(t0)
			// Every early-exit path below still signals allEnqueued so
			// the starter is not left waiting on this producer.
			if err != nil {
				errs[p] = fmt.Errorf("producer %d open: %w", p, err)
				allEnqueued <- struct{}{}
				return
			}
			if ctorElapsed > 500*time.Millisecond {
				errs[p] = fmt.Errorf("producer %d: async ctor took %s (must be <500ms; offline path should not block on network)", p, ctorElapsed)
				_ = ls.Close(context.Background())
				allEnqueued <- struct{}{}
				return
			}
			qs, ok := ls.(QwpSender)
			if !ok {
				errs[p] = fmt.Errorf("producer %d: not a QwpSender (%T)", p, ls)
				_ = ls.Close(context.Background())
				allEnqueued <- struct{}{}
				return
			}

			// Publish with the server still down, flushing every
			// chunkSize rows.
			pubCtx := context.Background()
			const chunkSize = 50
			rows := perProducer[p]
			for i := 0; i < len(rows); i++ {
				oraclePublish(t, qs, pubCtx, rows[i])
				if (i+1)%chunkSize == 0 {
					if err := qs.Flush(pubCtx); err != nil {
						errs[p] = fmt.Errorf("producer %d flush@%d: %w", p, i, err)
						allEnqueued <- struct{}{}
						cctx, ccancel := context.WithTimeout(context.Background(), 150*time.Second)
						_ = qs.Close(cctx)
						ccancel()
						return
					}
				}
			}
			// A failed final flush is recorded but does not return
			// early: the producer still signals and closes below.
			if err := qs.Flush(pubCtx); err != nil {
				errs[p] = fmt.Errorf("producer %d final flush: %w", p, err)
			}
			// Prove the ASYNC contract before signaling. The I/O
			// thread bumps TotalReconnectAttempts via OnAttempt
			// inside connectWithBackoff *before* each dial, so
			// >=2 is the unambiguous "first dial completed
			// (ECONNREFUSED on the paused port) and the backoff
			// loop entered the second iteration" signal — a value
			// of 1 only proves a dial was initiated.
			// reconnect_initial_backoff_millis=20 means the second
			// attempt fires within ~40ms; we give 10s of slack for
			// slow CI before declaring the offline-retry loop dead.
			// Without this assertion a regression where ASYNC
			// silently degraded to "no retry until close" would
			// still pass: producers fill sf_dir locally, signal,
			// and Close() drives a belated drain once start() runs.
			if errs[p] == nil {
				const minAttempts = int64(2)
				deadline := time.Now().Add(10 * time.Second)
				// Poll every 5ms until the counter reaches minAttempts
				// or the 10s deadline passes.
				for {
					if qs.TotalReconnectAttempts() >= minAttempts {
						break
					}
					if time.Now().After(deadline) {
						errs[p] = fmt.Errorf(
							"producer %d: ASYNC contract violation — "+
								"TotalReconnectAttempts=%d after 10s with port closed "+
								"(want >=%d). Background offline-retry loop did not "+
								"execute at least one full ECONNREFUSED cycle; ASYNC "+
								"appears to have degraded to 'no retry until close'",
							p, qs.TotalReconnectAttempts(), minAttempts)
						break
					}
					time.Sleep(5 * time.Millisecond)
				}
			}
			// Signal "everything enqueued to sf_dir" BEFORE the
			// close-block. The wire is still in the offline-retry
			// loop — it only comes up once the starter brings the
			// server up and Close() drives the drain.
			allEnqueued <- struct{}{}
			cctx, ccancel := context.WithTimeout(context.Background(), 150*time.Second)
			_ = qs.Close(cctx)
			ccancel()
		}(p)
	}

	// The starter brings the server back once every producer has
	// signaled. starterErr is written only by this goroutine and read
	// only after starterDone is closed.
	starterDone := make(chan struct{})
	var starterErr error
	go func() {
		defer close(starterDone)
		// A single 60s timer bounds the whole wait, not each signal.
		enqWait := time.After(60 * time.Second)
		seen := 0
		for seen < producerCount {
			select {
			case <-allEnqueued:
				seen++
			case <-enqWait:
				starterErr = fmt.Errorf("only %d/%d producers enqueued within 60s", seen, producerCount)
				return
			}
		}
		// Each producer has already asserted >=2 background
		// connect attempts hit ECONNREFUSED before signaling, so
		// the ASYNC contract is proven before we get here. Bring
		// the server up so Close() can drain the queued frames.
		if err := srv.start(); err != nil {
			starterErr = fmt.Errorf("starter: %w", err)
		}
	}()

	// Wait for the starter first, then for every producer's Close.
	<-starterDone
	wg.Wait()
	if starterErr != nil {
		t.Fatalf("%v", starterErr)
	}
	for p, e := range errs {
		if e != nil {
			t.Fatalf("producer %d: %v", p, e)
		}
	}

	srv.awaitRows(t, oracleTableName, totalRows, 180*time.Second)

	c := newBindFuzzClient(t, srv)
	oracleAssert(t, c, oracle)

	// Each sf_dir must be no larger than sf_max_bytes plus 256 KiB.
	capBytes := sfMaxBytes + 256*1024
	for p, dir := range sfDirs {
		sz, err := oracleSfDirSize(dir)
		if err != nil {
			t.Fatalf("producer %d sf_dir %q: walk failed: %v", p, dir, err)
		}
		if sz > capBytes {
			t.Fatalf("producer %d sf_dir %q not purged after clean close: %d bytes (cap %d)",
				p, dir, sz, capBytes)
		}
	}
}
diff --git a/qwp_ingress_server_restart_fuzz_test.go b/qwp_ingress_server_restart_fuzz_test.go
new file mode 100644
index 00000000..a3e67be5
--- /dev/null
+++ b/qwp_ingress_server_restart_fuzz_test.go
@@ -0,0 +1,704 @@
+/*+*****************************************************************************
+ * ___ _ ____ ____
+ * / _ \ _ _ ___ ___| |_| _ \| __ )
+ * | | | | | | |/ _ \/ __| __| | | | _ \
+ * | |_| | |_| | __/\__ \ |_| |_| | |_) |
+ * \__\_\\__,_|\___||___/\__|____/|____/
+ *
+ * Copyright (c) 2014-2019 Appsicle
+ *
Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//go:build !windows + +package questdb + +// Go port of QuestDB's QwpIngressServerRestartFuzzTest. The contract +// being asserted (same as Java): +// +// Every row that the user thread successfully handed off to +// sender.Flush() (durable on disk inside sf_dir) must end up in the +// table after the server comes back, regardless of how many times +// the server bounces or whether the sender held its connection +// across the bounce. +// +// Server-side dedup is required: when an SF sender reconnects (or is +// replaced by a fresh sender pointed at the same sf_dir) it is free to +// resend any frame whose ACK was lost in the bounce. The target table +// is created with DEDUP UPSERT KEYS(ts, id) so replays collapse onto +// the original row. +// +// Versus the four QwpIngressOracle tests this file deliberately uses a +// simpler oracle (count + count_distinct(id)): the property under test +// here is "no row lost across server restarts / no row over-counted by +// replay", not per-cell type fidelity. The richer typed-cell oracle is +// already covered by qwp_ingress_oracle_fuzz_test.go. +// +// Each test that bounces the server skips when !srv.owns (the +// QDB_FUZZ_ADDR mode talks to a server we don't control and can't +// SIGTERM); the smoke-no-restart test runs in both modes. 
+ +import ( + "context" + "fmt" + "os" + "path/filepath" + "strings" + "sync" + "sync/atomic" + "testing" + "time" +) + +const restartFuzzTableName = "qwp_restart_fuzz" + +const restartFuzzCreateSQL = "CREATE TABLE " + restartFuzzTableName + " (" + + "id LONG, val DOUBLE, ts TIMESTAMP) " + + "TIMESTAMP(ts) PARTITION BY DAY WAL " + + "DEDUP UPSERT KEYS(ts, id)" + +// restartFuzzSetup drops and re-creates the target table at the start +// of each test and registers a final cleanup drop. Mirrors Java's +// createTargetTable + assertMemoryLeak block setup. +func restartFuzzSetup(t *testing.T, srv *qwpFuzzServer) { + t.Helper() + srv.mustExec(t, "DROP TABLE IF EXISTS '"+restartFuzzTableName+"'") + t.Cleanup(func() { + _, _ = srv.execSQL("DROP TABLE IF EXISTS '" + restartFuzzTableName + "'") + }) + srv.mustExec(t, restartFuzzCreateSQL) +} + +// restartFuzzWriteRows pushes a deterministic (id, val, ts) sequence +// through the QWP sender: id ∈ [idBase, idBase+count), ts spaced 1µs +// apart from tsBaseNanos, val = id * 1.5. The QWP sender's column API +// is fluent through the typed methods so this mirrors Java's writeRows +// faithfully. The caller flushes. +// +// We do NOT call Flush here — Java relies on a final flush after the +// loop, and the QWP sender's auto_flush_rows can fire mid-loop. +func restartFuzzWriteRows(t *testing.T, qs QwpSender, idBase int64, count int, tsBaseNanos int64) { + t.Helper() + ctx := context.Background() + for i := 0; i < count; i++ { + id := idBase + int64(i) + ts := time.Unix(0, tsBaseNanos+int64(i)*1000).UTC() + qs.Table(restartFuzzTableName) + qs.Int64Column("id", id) + qs.Float64Column("val", float64(id)*1.5) + if err := qs.At(ctx, ts); err != nil { + t.Fatalf("write row id=%d: %v", id, err) + } + } +} + +// restartFuzzRunOneSender opens an SF sender at the given sf_dir, +// pushes count rows with the deterministic grid, flushes, and closes. 
+// An sf_dir is owned by exactly one sender at a time — callers MUST +// serialize senders that share a dir (across epochs). Faithful to +// Java's runOneSfSender. +func restartFuzzRunOneSender(t *testing.T, srv *qwpFuzzServer, sfDir string, + idBase int64, count int, tsBaseNanos int64) { + t.Helper() + ctx := context.Background() + conf := fmt.Sprintf("ws::addr=%s;sf_dir=%s;close_flush_timeout_millis=120000;", + srv.wsAddr(), sfDir) + octx, ocancel := context.WithTimeout(context.Background(), 15*time.Second) + ls, err := LineSenderFromConf(octx, conf) + ocancel() + if err != nil { + t.Fatalf("open sender (sf_dir=%s): %v", sfDir, err) + } + qs := ls.(QwpSender) + restartFuzzWriteRows(t, qs, idBase, count, tsBaseNanos) + if err := qs.Flush(ctx); err != nil { + t.Fatalf("flush sender (sf_dir=%s): %v", sfDir, err) + } + cctx, ccancel := context.WithTimeout(context.Background(), 60*time.Second) + if err := qs.Close(cctx); err != nil { + ccancel() + t.Fatalf("close sender (sf_dir=%s): %v", sfDir, err) + } + ccancel() +} + +// restartFuzzAssertRowCount polls the table until count() reaches the +// expected value or the deadline elapses; matches WAL apply being +// asynchronous in QuestDB. Mirrors Java's assertRowCount + the +// engine.awaitTable wait pattern. The last execSQL error (if any) is +// surfaced on timeout so "server unreachable the whole window" is +// distinguishable from "WAL never caught up". 
+func restartFuzzAssertRowCount(t *testing.T, srv *qwpFuzzServer, expected int64, timeout time.Duration) { + t.Helper() + deadline := time.Now().Add(timeout) + q := "SELECT count() FROM " + restartFuzzTableName + var lastN int64 + var lastErr error + for { + res, err := srv.execSQL(q) + if err != nil { + lastErr = err + } else if len(res.Dataset) == 1 && len(res.Dataset[0]) == 1 { + if n, ok := toInt64(res.Dataset[0][0]); ok { + lastN = n + if n == expected { + return + } + if n > expected { + t.Fatalf("row count overshoot: got %d expected %d", n, expected) + } + } + } + if time.Now().After(deadline) { + t.Fatalf("row count did not reach %d within %s (last seen %d, last execSQL err: %v)", + expected, timeout, lastN, lastErr) + } + time.Sleep(100 * time.Millisecond) + } +} + +// restartFuzzAssertDistinctIds verifies count() == count_distinct(id) +// and min/max id define the [0, expected) range exactly. Mirrors the +// SELECT count() c, count_distinct(id) d, min(id) lo, max(id) hi shape +// from Java's testSenderPushesContinuouslyWhileServerBounces. 
+func restartFuzzAssertDistinctIds(t *testing.T, srv *qwpFuzzServer, expected int64) { + t.Helper() + sql := "SELECT count(), count_distinct(id), min(id), max(id) FROM " + restartFuzzTableName + res, err := srv.execSQL(sql) + if err != nil { + t.Fatalf("distinct-id assert: %v", err) + } + if len(res.Dataset) != 1 || len(res.Dataset[0]) != 4 { + t.Fatalf("distinct-id assert: unexpected shape %+v", res.Dataset) + } + c, okC := toInt64(res.Dataset[0][0]) + d, okD := toInt64(res.Dataset[0][1]) + lo, okLo := toInt64(res.Dataset[0][2]) + hi, okHi := toInt64(res.Dataset[0][3]) + if !okC || !okD || !okLo || !okHi { + t.Fatalf("distinct-id assert: non-numeric cell in %+v", res.Dataset[0]) + } + if c != expected || d != expected || lo != 0 || hi != expected-1 { + t.Fatalf("distinct-id mismatch: want c=%d d=%d lo=0 hi=%d, got c=%d d=%d lo=%d hi=%d", + expected, expected, expected-1, c, d, lo, hi) + } +} + +// restartFuzzAssertValInvariant verifies the per-row payload invariant +// val == id * 1.5 holds for every row. Independent of any in-process +// counter: a bug that inflates both rowsProduced and the on-table id +// range in lockstep — so count(), count_distinct(id) and max(id) all +// agree with the producer's counter — would still trip this check if +// it corrupted, mis-associated, or split (id, val) pairs. Both writers +// (restartFuzzWriteRows and the continuous-bounces inline loop) encode +// val = float64(id) * 1.5; IEEE-754 binary64 makes the SQL comparison +// against id*1.5 exact for the row counts these tests reach (well under +// 2^53). 
+func restartFuzzAssertValInvariant(t *testing.T, srv *qwpFuzzServer) { + t.Helper() + sql := "SELECT count() FROM " + restartFuzzTableName + + " WHERE val <> id * 1.5" + res, err := srv.execSQL(sql) + if err != nil { + t.Fatalf("val-invariant assert: %v", err) + } + if len(res.Dataset) != 1 || len(res.Dataset[0]) != 1 { + t.Fatalf("val-invariant assert: unexpected shape %+v", res.Dataset) + } + n, ok := toInt64(res.Dataset[0][0]) + if !ok { + t.Fatalf("val-invariant assert: non-int count cell %+v", res.Dataset[0][0]) + } + if n != 0 { + // Surface a sample of the violators to make the failure actionable. + sample := "SELECT id, val FROM " + restartFuzzTableName + + " WHERE val <> id * 1.5 LIMIT 5" + sres, serr := srv.execSQL(sample) + t.Fatalf("val-invariant violated: %d rows where val != id*1.5 "+ + "(sample=%+v, sample err=%v)", n, sres.Dataset, serr) + } +} + +// restartFuzzAssertSegmentsOnDisk verifies that /default/ +// contains at least one .sfa segment file. Used after a paused-server +// fast close to confirm the on-disk durability invariant the next +// epoch's adoption / replay depends on — catches a close-time +// regression (premature unlink, panic mid-shutdown, a refactor that +// drops segment preservation) eagerly, before the end-of-test row +// count inherits the diagnosis. label is prefixed onto any failure +// message so multi-epoch callers can locate which call site fired. 
+func restartFuzzAssertSegmentsOnDisk(t *testing.T, sfDir, label string) { + t.Helper() + slotDir := filepath.Join(sfDir, qwpSfDefaultSenderId) + entries, err := os.ReadDir(slotDir) + if err != nil { + t.Fatalf("%s: read slot dir %s: %v", label, slotDir, err) + } + var sfaFiles []string + var allNames []string + for _, e := range entries { + allNames = append(allNames, e.Name()) + if strings.HasSuffix(e.Name(), ".sfa") { + sfaFiles = append(sfaFiles, e.Name()) + } + } + if len(sfaFiles) == 0 { + t.Fatalf("%s: expected at least one .sfa segment in %s for next-epoch replay, got entries %v", + label, slotDir, allNames) + } +} + +// --- entry points ------------------------------------------------- + +// TestQwpFuzzIngressServerRestartSmokeNoRestart — port of Java +// testSmokeNoRestart. Wire-path control: N parallel writers, each +// with its own sf_dir, push rows without a server bounce. Verifies +// the happy-path SF send loop in isolation from any restart logic. +// Runs in both fixture-launched AND QDB_FUZZ_ADDR mode (no restart +// involved). 
+func TestQwpFuzzIngressServerRestartSmokeNoRestart(t *testing.T) { + srv := fuzzServer(t) + restartFuzzSetup(t, srv) + + const ( + writers = 2 + rowsPerWriter = 500 + ) + baseTsNanos := int64(1_700_000_000_000_000_000) + + var wg sync.WaitGroup + errs := make([]error, writers) + for w := 0; w < writers; w++ { + w := w + wg.Add(1) + go func() { + defer wg.Done() + defer func() { + if rec := recover(); rec != nil { + errs[w] = fmt.Errorf("writer %d panic: %v", w, rec) + } + }() + sfDir := t.TempDir() + idBase := int64(w) * rowsPerWriter + tsBase := baseTsNanos + int64(w)*rowsPerWriter*1000 + restartFuzzRunOneSender(t, srv, sfDir, idBase, rowsPerWriter, tsBase) + }() + } + wg.Wait() + for w, e := range errs { + if e != nil { + t.Fatalf("writer %d: %v", w, e) + } + } + restartFuzzAssertRowCount(t, srv, int64(writers*rowsPerWriter), 60*time.Second) + restartFuzzAssertValInvariant(t, srv) +} + +// TestQwpFuzzIngressServerRestartNewSenderRecoversFromSfDir — port of +// Java testNewSenderRecoversFromSfDir. Two epochs, same sf_dir slot. +// Epoch 1 uses close_flush_timeout_millis=0 (fast close) and pauses +// the server BEFORE the sender exits, leaving unacked frames on disk +// in /default/. Epoch 2 brings the server back on the same +// port, opens a new sender at the same sf_dir, and the slot adopts + +// replays the leftovers before pushing its own new rows. Dedup on +// (ts, id) collapses any wire-level replays. +// +// Skips in QDB_FUZZ_ADDR mode (can't pause an external server). +func TestQwpFuzzIngressServerRestartNewSenderRecoversFromSfDir(t *testing.T) { + srv := fuzzServer(t) + if !srv.owns { + t.Skip("requires fixture-launched server (cannot pause QDB_FUZZ_ADDR target)") + } + restartFuzzSetup(t, srv) + t.Cleanup(func() { + // Ensure the server is up for subsequent tests no matter how we exit. 
+ _ = srv.start() + }) + + sfDir := t.TempDir() + const rowsPerEpoch = 5_000 + baseTsNanos := int64(1_700_000_000_000_000_000) + ctx := context.Background() + + // --- Epoch 1: write, pause server BEFORE sender close --- + conf1 := fmt.Sprintf("ws::addr=%s;sf_dir=%s;close_flush_timeout_millis=0;", + srv.wsAddr(), sfDir) + octx, ocancel := context.WithTimeout(context.Background(), 15*time.Second) + ls1, err := LineSenderFromConf(octx, conf1) + ocancel() + if err != nil { + t.Fatalf("epoch 1 open: %v", err) + } + qs1 := ls1.(QwpSender) + restartFuzzWriteRows(t, qs1, 0, rowsPerEpoch, baseTsNanos) + publishedFsn, err := qs1.FlushAndGetSequence(ctx) + if err != nil { + t.Fatalf("epoch 1 flush: %v", err) + } + // Kill BEFORE close so genuinely-unacked frames remain on disk. + // srv.kill() (SIGKILL) is used rather than pause() (SIGTERM): a + // graceful JVM shutdown lets the worker pool flush every queued + // ACK before exit, which in practice always full-drains the + // 5000-row batch and skips the disk-durability code path. SIGKILL + // blocks until the process is reaped, so ackedFsn is stable from + // this point: no further ACKs can reach the loop. + srv.kill() + ackedBeforeClose := qs1.AckedFsn() + cctx, ccancel := context.WithTimeout(context.Background(), 10*time.Second) + if err := qs1.Close(cctx); err != nil { + // Fast close is best-effort here — sender is disconnected, + // frames are durable on disk for epoch 2 to replay. + t.Logf("epoch 1 close (expected disconnect): %v", err) + } + ccancel() + // Durability invariant: any frame published but not acked at the + // time Close ran must survive on disk for epoch 2 to adopt. With + // kill() above, the expected outcome is ackedBeforeClose < + // publishedFsn — the disk-assert branch fires and catches a + // close-time unlink regression eagerly. 
The else branch only + // exists as defense against a rare path where in-flight ACKs were + // already in the client's OS receive buffer at kill time and the + // send loop drained them before our snapshot; in that case the + // end-of-test row count still covers the property. + if ackedBeforeClose < publishedFsn { + restartFuzzAssertSegmentsOnDisk(t, sfDir, "epoch 1") + } else { + t.Logf("epoch 1: full drain raced ahead of kill (publishedFsn=%d, ackedBeforeClose=%d) — skipping eager disk check", + publishedFsn, ackedBeforeClose) + } + + // --- Epoch 2: server back on the same port, new sender adopts --- + if err := srv.start(); err != nil { + t.Fatalf("epoch 2 start server: %v", err) + } + restartFuzzRunOneSender(t, srv, sfDir, + rowsPerEpoch, rowsPerEpoch, + baseTsNanos+int64(rowsPerEpoch)*1000) + restartFuzzAssertRowCount(t, srv, 2*rowsPerEpoch, 90*time.Second) + restartFuzzAssertValInvariant(t, srv) +} + +// TestQwpFuzzIngressServerRestartSameSenderSurvives — port of Java +// testSameSenderSurvivesServerRestart. One long-lived sender across +// a single server bounce. Phase 1 writes rowsPerPhase rows + flush, +// then we bounce the server, then Phase 2 writes the next slice on +// the SAME sender. The QWP sender's I/O loop must transparently +// reconnect; the user thread never sees the disconnect. +// +// Skips in QDB_FUZZ_ADDR mode. 
+func TestQwpFuzzIngressServerRestartSameSenderSurvives(t *testing.T) { + srv := fuzzServer(t) + if !srv.owns { + t.Skip("requires fixture-launched server (cannot bounce QDB_FUZZ_ADDR target)") + } + restartFuzzSetup(t, srv) + t.Cleanup(func() { _ = srv.start() }) + + sfDir := t.TempDir() + const rowsPerPhase = 500 + baseTsNanos := int64(1_700_000_000_000_000_000) + ctx := context.Background() + + conf := fmt.Sprintf("ws::addr=%s;sf_dir=%s;"+ + "reconnect_max_duration_millis=120000;"+ + "close_flush_timeout_millis=120000;", + srv.wsAddr(), sfDir) + octx, ocancel := context.WithTimeout(context.Background(), 15*time.Second) + ls, err := LineSenderFromConf(octx, conf) + ocancel() + if err != nil { + t.Fatalf("open sender: %v", err) + } + qs := ls.(QwpSender) + defer func() { + dctx, dcancel := context.WithTimeout(context.Background(), 60*time.Second) + _ = qs.Close(dctx) + dcancel() + }() + + // Phase 1. + restartFuzzWriteRows(t, qs, 0, rowsPerPhase, baseTsNanos) + if err := qs.Flush(ctx); err != nil { + t.Fatalf("phase 1 flush: %v", err) + } + + // Bounce the server. + if err := srv.bounce(); err != nil { + t.Fatalf("bounce: %v", err) + } + + // Phase 2 — same sender, must reconnect transparently. + restartFuzzWriteRows(t, qs, rowsPerPhase, rowsPerPhase, + baseTsNanos+int64(rowsPerPhase)*1000) + if err := qs.Flush(ctx); err != nil { + t.Fatalf("phase 2 flush: %v", err) + } + restartFuzzAssertRowCount(t, srv, 2*rowsPerPhase, 90*time.Second) + restartFuzzAssertValInvariant(t, srv) +} + +// TestQwpFuzzIngressServerRestartMultipleRestartsNewSender — port of +// Java testFuzzMultipleRestartsNewSender. Multi-epoch loop with a new +// sender per epoch and the server killed BEFORE each sender exits; +// every leftover frame stays on disk and is replayed by the next +// epoch's sender via the shared sf_dir slot. Final epoch is a +// drain-only sender on the now-stable server, with the default +// (long) close timeout so any residual replay completes. 
+// +// Skips in QDB_FUZZ_ADDR mode. +func TestQwpFuzzIngressServerRestartMultipleRestartsNewSender(t *testing.T) { + srv := fuzzServer(t) + if !srv.owns { + t.Skip("requires fixture-launched server (cannot kill QDB_FUZZ_ADDR target)") + } + restartFuzzSetup(t, srv) + t.Cleanup(func() { _ = srv.start() }) + + r := newFuzzRand(t) + sfDir := t.TempDir() + epochs := 3 + r.Intn(3) // 3..5 + rowsPerEpoch := 500 + r.Intn(1500) // 500..1999 + baseTsNanos := int64(1_700_000_000_000_000_000) + + var totalRows, idBase int64 + ctx := context.Background() + + for epoch := 0; epoch < epochs; epoch++ { + t.Logf("epoch %d/%d rows=%d idBase=%d", epoch+1, epochs, rowsPerEpoch, idBase) + // Server must be up at the start of each epoch. + if err := srv.start(); err != nil { + t.Fatalf("epoch %d start: %v", epoch, err) + } + conf := fmt.Sprintf("ws::addr=%s;sf_dir=%s;close_flush_timeout_millis=0;", + srv.wsAddr(), sfDir) + octx, ocancel := context.WithTimeout(context.Background(), 15*time.Second) + ls, err := LineSenderFromConf(octx, conf) + ocancel() + if err != nil { + t.Fatalf("epoch %d open: %v", epoch, err) + } + qs := ls.(QwpSender) + restartFuzzWriteRows(t, qs, idBase, rowsPerEpoch, + baseTsNanos+idBase*1000) + publishedFsn, err := qs.FlushAndGetSequence(ctx) + if err != nil { + t.Fatalf("epoch %d flush: %v", epoch, err) + } + // Random pause: sometimes the server drains everything, + // sometimes not. + time.Sleep(time.Duration(r.Intn(50)) * time.Millisecond) + // Kill server BEFORE sender exits → unacked frames stay on + // disk. SIGKILL rather than the graceful SIGTERM (pause) so + // the JVM cannot flush queued ACKs through its shutdown hooks + // and full-drain the batch; blocks until the process is + // reaped, so ackedFsn is stable from this point. + srv.kill() + ackedBeforeClose := qs.AckedFsn() + cctx, ccancel := context.WithTimeout(context.Background(), 10*time.Second) + if err := qs.Close(cctx); err != nil { + // Fast close best-effort across the disconnect. 
+ t.Logf("epoch %d close (expected disconnect): %v", epoch, err) + } + ccancel() + // Durability invariant: when at least one frame was unacked + // at Close time, the slot's .sfa files must survive so the + // next epoch can adopt and replay. With kill() above, the + // expected outcome each epoch is ackedBeforeClose < + // publishedFsn — the disk-assert branch fires. The random + // sleep before kill spreads the unacked count across the + // 500-1999-row range, so different epochs exercise different + // partial-drain depths. The else branch is defensive for the + // rare OS-buffered-ACK race; the end-of-test row count keeps + // coverage there. + if ackedBeforeClose < publishedFsn { + restartFuzzAssertSegmentsOnDisk(t, sfDir, + fmt.Sprintf("epoch %d", epoch)) + } else { + t.Logf("epoch %d: full drain raced ahead of kill (publishedFsn=%d, ackedBeforeClose=%d) — skipping eager disk check", + epoch, publishedFsn, ackedBeforeClose) + } + totalRows += int64(rowsPerEpoch) + idBase += int64(rowsPerEpoch) + } + + // Final epoch: server up, default long close timeout so the drain + // sender replays any leftover unacked frames and waits for ACKs. 
+ if err := srv.start(); err != nil { + t.Fatalf("final start: %v", err) + } + confFinal := fmt.Sprintf("ws::addr=%s;sf_dir=%s;close_flush_timeout_millis=120000;", + srv.wsAddr(), sfDir) + octx, ocancel := context.WithTimeout(context.Background(), 15*time.Second) + lsFinal, err := LineSenderFromConf(octx, confFinal) + ocancel() + if err != nil { + t.Fatalf("final open: %v", err) + } + qsFinal := lsFinal.(QwpSender) + if err := qsFinal.Flush(ctx); err != nil { + t.Fatalf("final flush: %v", err) + } + cctx, ccancel := context.WithTimeout(context.Background(), 180*time.Second) + if err := qsFinal.Close(cctx); err != nil { + ccancel() + t.Fatalf("final close: %v", err) + } + ccancel() + restartFuzzAssertRowCount(t, srv, totalRows, 180*time.Second) + restartFuzzAssertValInvariant(t, srv) +} + +// TestQwpFuzzIngressServerRestartContinuousBounces — port of Java +// testSenderPushesContinuouslyWhileServerBounces. The realistic +// outage scenario: one user thread writes rows continuously through +// a single long-lived SF sender while a sibling goroutine bounces the +// server 3-5 times. Producer must not surface a failure to its caller, +// and after Close every row that was handed to At(...) must be present +// exactly once (count == count_distinct(id), ids exactly [0, n)). +// +// Skips in QDB_FUZZ_ADDR mode. The Go fixture's bounce is ~500ms +// (SIGTERM + ready-poll) versus Java's 30-79ms downtime; the long +// reconnect/close timeouts cover both. 
+func TestQwpFuzzIngressServerRestartContinuousBounces(t *testing.T) { + srv := fuzzServer(t) + if !srv.owns { + t.Skip("requires fixture-launched server (cannot bounce QDB_FUZZ_ADDR target)") + } + restartFuzzSetup(t, srv) + t.Cleanup(func() { _ = srv.start() }) + + r := newFuzzRand(t) + sfDir := t.TempDir() + bounces := 3 + r.Intn(3) // 3..5 + const ( + batchRows = 25 + batchPauseMillis = 2 + ) + baseTsNanos := int64(1_700_000_000_000_000_000) + tsStepNanos := int64(1000) // 1µs per row + + conf := fmt.Sprintf("ws::addr=%s;sf_dir=%s;"+ + "reconnect_max_duration_millis=120000;"+ + "close_flush_timeout_millis=120000;", + srv.wsAddr(), sfDir) + + var stopProducer atomic.Bool + var producerErr atomic.Value + var bouncerErr atomic.Value + var rowsProduced atomic.Int64 + + producerDone := make(chan struct{}) + bouncerDone := make(chan struct{}) + + go func() { + defer close(producerDone) + ctx := context.Background() + octx, ocancel := context.WithTimeout(context.Background(), 15*time.Second) + ls, err := LineSenderFromConf(octx, conf) + ocancel() + if err != nil { + producerErr.Store(fmt.Errorf("open: %w", err)) + return + } + qs := ls.(QwpSender) + defer func() { + cctx, ccancel := context.WithTimeout(context.Background(), 180*time.Second) + if err := qs.Close(cctx); err != nil { + producerErr.Store(fmt.Errorf("close: %w", err)) + } + ccancel() + }() + var id int64 + for !stopProducer.Load() { + for i := 0; i < batchRows; i++ { + currentId := id + id++ + ts := time.Unix(0, baseTsNanos+currentId*tsStepNanos).UTC() + qs.Table(restartFuzzTableName) + qs.Int64Column("id", currentId) + qs.Float64Column("val", float64(currentId)*1.5) + if err := qs.At(ctx, ts); err != nil { + producerErr.Store(fmt.Errorf("at id=%d: %w", currentId, err)) + return + } + } + // Publish what we just buffered into the SF cursor so a + // bounce mid-batch can't lose rows still sitting in the + // client auto-flush buffer. 
+ if err := qs.Flush(ctx); err != nil { + producerErr.Store(fmt.Errorf("flush@id=%d: %w", id, err)) + return + } + rowsProduced.Store(id) + time.Sleep(batchPauseMillis * time.Millisecond) + } + }() + + go func() { + defer close(bouncerDone) + // Let the producer get into a steady-state rhythm before the + // first bounce so we exercise the mid-flight reconnect path + // rather than first-connect. + time.Sleep(100 * time.Millisecond) + for i := 0; i < bounces; i++ { + t.Logf("bounce %d/%d", i+1, bounces) + srv.pause() + // Java sleeps 30-79ms here; the Go fixture's start() polls + // /ping so the effective downtime is longer regardless. + time.Sleep(time.Duration(30+r.Intn(50)) * time.Millisecond) + if err := srv.start(); err != nil { + bouncerErr.Store(fmt.Errorf("bounce %d start: %w", i+1, err)) + return + } + time.Sleep(time.Duration(120+r.Intn(200)) * time.Millisecond) + } + }() + + select { + case <-bouncerDone: + case <-time.After(180 * time.Second): + stopProducer.Store(true) + t.Fatalf("bouncer did not finish within 180s") + } + if v := bouncerErr.Load(); v != nil { + stopProducer.Store(true) + t.Fatalf("bouncer: %v", v) + } + + // Grace window: a few more producer batches against the now-stable + // server, then signal stop and wait for the producer to drain. 
+ time.Sleep(200 * time.Millisecond) + stopProducer.Store(true) + + select { + case <-producerDone: + case <-time.After(240 * time.Second): + t.Fatalf("producer did not finish within 240s (rowsProduced=%d)", + rowsProduced.Load()) + } + if v := producerErr.Load(); v != nil { + t.Fatalf("producer (rowsProduced=%d): %v", rowsProduced.Load(), v) + } + + expected := rowsProduced.Load() + if expected <= 0 { + t.Fatalf("producer wrote zero rows") + } + t.Logf("producer wrote %d rows across %d server bounces", expected, bounces) + restartFuzzAssertRowCount(t, srv, expected, 180*time.Second) + restartFuzzAssertDistinctIds(t, srv, expected) + restartFuzzAssertValInvariant(t, srv) +} diff --git a/qwp_integration_test.go b/qwp_integration_test.go index 2f709de0..b03f1a74 100644 --- a/qwp_integration_test.go +++ b/qwp_integration_test.go @@ -39,11 +39,19 @@ import ( ) const ( - qwpTestAddr = "localhost:9000" qwpTestWaitPeriod = 5 * time.Second qwpTestPollPeriod = 100 * time.Millisecond ) +// qwpTestAddr is the host:port the QWP integration tests target. It +// used to be a const pinned to localhost:9000 (a developer's live +// server), which caused these tests to silently skip in CI where no +// such server runs. qwpEnsureServer now boots the shared fuzz +// fixture and writes the fixture's address here, so the same tests +// run against a real QuestDB under qwp-fuzz.yml (and any QDB_FUZZ_ADDR +// the developer points at on their machine, including localhost:9000). +var qwpTestAddr string + var qwpTestHTTPClient = &http.Client{Timeout: qwpTestWaitPeriod} // qwpTableResult holds query results from QuestDB's /exec endpoint. @@ -59,22 +67,32 @@ type qwpColumnInfo struct { Type string `json:"type"` } -// qwpSkipIfNoServer skips the test if QuestDB is not available. -func qwpSkipIfNoServer(t *testing.T) { +// qwpEnsureServer ensures a real QuestDB is reachable for the +// caller's integration test and writes its host:port into the +// package-level qwpTestAddr. 
+// +// Resolution policy (matches the fuzz fixture): +// 1. QDB_FUZZ_ADDR — talk to an externally-managed server (a +// developer's live localhost:9000, or a long-lived CI box). +// 2. Otherwise boot a private QuestDB JVM from a QDB_JAR / QDB_REPO +// / sibling questdb checkout. Auto-runs under qwp-fuzz.yml. +// 3. If neither resolves, t.Skip (unless QDB_FUZZ_STRICT=1, in which +// case t.Fatal so CI loudly fails instead of silently passing). +// +// As a side effect the caller's subsequent qwpQuery / qwpDropTable / +// "ws://"+qwpTestAddr connect strings all target the resolved server. +func qwpEnsureServer(t *testing.T) { t.Helper() - ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) - defer cancel() - - s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{}, 0, 0, 0, nil) - if err != nil { - t.Skipf("QuestDB not available at %s: %v", qwpTestAddr, err) - } - s.Close(ctx) + srv := fuzzServer(t) + qwpTestAddr = srv.wsAddr() } // qwpDropTable drops a table via QuestDB's HTTP API. 
func qwpDropTable(t *testing.T, tableName string) { t.Helper() + if qwpTestAddr == "" { + t.Fatal("qwpDropTable called before qwpEnsureServer — qwpTestAddr is empty") + } u, _ := url.Parse("http://" + qwpTestAddr) u.Path = "/exec" params := url.Values{} @@ -143,14 +161,14 @@ func qwpWaitForRows(t *testing.T, tableName string, expectedRows int) qwpTableRe // --- Basic integration test --- func TestQwpIntegrationBasicTypes(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_basic_types" qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{}, time.Second, 0, 0, nil) + s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, nil) if err != nil { t.Fatal(err) } @@ -214,14 +232,14 @@ func TestQwpIntegrationBasicTypes(t *testing.T) { // --- Multi-row, multi-flush test --- func TestQwpIntegrationMultipleFlushes(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_multi_flush" qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{}, time.Second, 0, 0, nil) + s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, nil) if err != nil { t.Fatal(err) } @@ -268,14 +286,14 @@ func TestQwpIntegrationMultipleFlushes(t *testing.T) { // --- Symbol deduplication test --- func TestQwpIntegrationSymbolDedup(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_symbol_dedup" qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{}, time.Second, 0, 0, nil) + s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, nil) if err != nil { 
t.Fatal(err) } @@ -316,7 +334,7 @@ func TestQwpIntegrationSymbolDedup(t *testing.T) { // --- Multi-table batch test --- func TestQwpIntegrationMultiTable(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() table1 := "qwp_integ_multi_t1" @@ -326,7 +344,7 @@ func TestQwpIntegrationMultiTable(t *testing.T) { defer qwpDropTable(t, table1) defer qwpDropTable(t, table2) - s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{}, time.Second, 0, 0, nil) + s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, nil) if err != nil { t.Fatal(err) } @@ -365,14 +383,14 @@ func TestQwpIntegrationMultiTable(t *testing.T) { // --- Large batch test --- func TestQwpIntegrationLargeBatch(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_large_batch" qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{}, 5*time.Second, 0, 0, nil) + s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, nil) if err != nil { t.Fatal(err) } @@ -407,14 +425,16 @@ func TestQwpIntegrationLargeBatch(t *testing.T) { // --- Config string creation test --- func TestQwpIntegrationFromConf(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_from_conf" qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - confStr := fmt.Sprintf("ws::addr=%s;auto_flush=off;retry_timeout=1000;", qwpTestAddr) + // retry_timeout is HTTP-only; QWP uses reconnect_max_duration_millis + // for the per-outage budget (see the connect-string audit). 
+ confStr := fmt.Sprintf("ws::addr=%s;auto_flush=off;reconnect_max_duration_millis=1000;", qwpTestAddr) sender, err := LineSenderFromConf(ctx, confStr) if err != nil { t.Fatalf("LineSenderFromConf: %v", err) @@ -446,7 +466,7 @@ func TestQwpIntegrationFromConf(t *testing.T) { // --- Async mode integration test --- func TestQwpIntegrationAsyncMode(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_async" @@ -454,14 +474,14 @@ func TestQwpIntegrationAsyncMode(t *testing.T) { defer qwpDropTable(t, tableName) // Create sender with in-flight window = 4. - s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{}, 5*time.Second, 0, 0, nil, 4) + s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, nil, 4) if err != nil { t.Fatal(err) } defer s.Close(ctx) - if s.asyncState == nil { - t.Fatal("expected async mode with window=4") + if s.cursorEngine == nil || s.cursorSendLoop == nil { + t.Fatal("expected cursor engine + send loop to be wired") } const rowCount = 1000 @@ -492,7 +512,7 @@ func TestQwpIntegrationAsyncMode(t *testing.T) { // --- Async mode via config string --- func TestQwpIntegrationAsyncFromConf(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_async_conf" @@ -532,7 +552,7 @@ func TestQwpIntegrationAsyncFromConf(t *testing.T) { // --- Auto-flush integration test --- func TestQwpIntegrationAutoFlush(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_autoflush" @@ -540,7 +560,7 @@ func TestQwpIntegrationAutoFlush(t *testing.T) { defer qwpDropTable(t, tableName) // auto-flush every 3 rows. 
- s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{}, time.Second, 3, 0, nil) + s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{endpointPath: qwpWritePath}, 3, 0, nil) if err != nil { t.Fatal(err) } @@ -581,14 +601,14 @@ func TestQwpIntegrationAutoFlush(t *testing.T) { // sent via QWP, and stored in QuestDB. This test validates the // Phase 13 null-packing fix against the real server. func TestQwpIntegrationNullableColumns(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_nullable" qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{}, time.Second, 0, 0, nil) + s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, nil) if err != nil { t.Fatal(err) } @@ -735,14 +755,14 @@ func TestQwpIntegrationNullableColumns(t *testing.T) { // --- Long256 round-trip --- func TestQwpIntegrationLong256(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_long256" qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{}, time.Second, 0, 0, nil) + s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, nil) if err != nil { t.Fatal(err) } @@ -817,14 +837,14 @@ func TestQwpIntegrationLong256(t *testing.T) { // server fills it in at receive time. Success means the row lands and // the ts column is populated. 
func TestQwpIntegrationAtNow(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_at_now" qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{}, time.Second, 0, 0, nil) + s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, nil) if err != nil { t.Fatal(err) } @@ -855,7 +875,7 @@ func TestQwpIntegrationAtNow(t *testing.T) { // per type rather than once for the suite. func TestQwpIntegrationQwpOnlyTypes(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() ts := time.Date(2024, 6, 15, 12, 0, 0, 0, time.UTC) @@ -864,7 +884,7 @@ func TestQwpIntegrationQwpOnlyTypes(t *testing.T) { qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) s.Table(tableName) @@ -888,7 +908,7 @@ func TestQwpIntegrationQwpOnlyTypes(t *testing.T) { qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) s.Table(tableName) @@ -911,7 +931,7 @@ func TestQwpIntegrationQwpOnlyTypes(t *testing.T) { qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) s.Table(tableName) @@ -934,7 +954,7 @@ func TestQwpIntegrationQwpOnlyTypes(t *testing.T) { qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) s.Table(tableName) @@ -958,7 +978,7 @@ func TestQwpIntegrationQwpOnlyTypes(t *testing.T) { qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) s.Table(tableName) @@ -981,7 +1001,7 @@ func TestQwpIntegrationQwpOnlyTypes(t *testing.T) { qwpDropTable(t, tableName) defer qwpDropTable(t, 
tableName) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) // a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11 — borrowed from the @@ -1010,7 +1030,7 @@ func TestQwpIntegrationQwpOnlyTypes(t *testing.T) { qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) // Whole second so there's no sub-millisecond noise to worry about. @@ -1040,7 +1060,7 @@ func TestQwpIntegrationQwpOnlyTypes(t *testing.T) { qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) // Use a nanosecond-precision designated timestamp (AtNano). The @@ -1079,14 +1099,14 @@ func TestQwpIntegrationQwpOnlyTypes(t *testing.T) { // Round-trips Decimal64/128/256 with distinct scales so QuestDB // auto-creates columns typed to each of the three fixed widths. func TestQwpIntegrationDecimalColumns(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_decimal" qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) // Mirror the Java port's Decimal{64,128,256}.fromLong(unscaled, scale) @@ -1140,14 +1160,14 @@ func TestQwpIntegrationDecimalColumns(t *testing.T) { // encoding and the same buffer path (qwpColumnBuffer.addDoubleArray), // so one dimension exercises the full stack end-to-end. func TestQwpIntegrationFloat64Arrays(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_f64_array" qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) ts := time.Date(2024, 6, 15, 12, 0, 0, 0, time.UTC) @@ -1191,7 +1211,7 @@ func TestQwpIntegrationFloat64Arrays(t *testing.T) { // precision must be fixed in the schema. 
Mirroring the Java test, // pre-create the table with GEOHASH(8c) = 40 bits. func TestQwpIntegrationGeohash(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_geohash" @@ -1202,7 +1222,7 @@ func TestQwpIntegrationGeohash(t *testing.T) { "CREATE TABLE '%s' (gh GEOHASH(8c), ts TIMESTAMP) TIMESTAMP(ts) PARTITION BY DAY WAL", tableName)) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) // Any 40-bit pattern round-trips as long as the client's wire @@ -1273,14 +1293,14 @@ func qwpExec(t *testing.T, query string) { // nullable column reports null for the omitted rows while non-nullable // types (BYTE, SHORT, BOOL) fall back to their type-specific sentinel. func TestQwpIntegrationOmittedColumns(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_omitted" qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) base := time.Date(2024, 6, 15, 12, 0, 0, 0, time.UTC) @@ -1411,14 +1431,17 @@ func TestQwpIntegrationOmittedColumns(t *testing.T) { } } -// newOrSkip constructs a QWP sender for the integration suite or skips -// the test on connection failure. Used by per-subtest helpers so each -// subtest gets its own sender. -func newOrSkip(t *testing.T, ctx context.Context) QwpSender { +// newQwpIntegSender constructs a QWP sender for the integration suite. +// Used by per-subtest helpers so each subtest gets its own sender. +// Callers must have already run qwpEnsureServer(t), so a connect +// failure here is a real sender bug, not a deployment gap — Fatalf +// (also makes QDB_FUZZ_STRICT=1 in qwp-fuzz.yml fail loudly instead of +// silently passing). 
+func newQwpIntegSender(t *testing.T, ctx context.Context) QwpSender { t.Helper() - s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{}, time.Second, 0, 0, nil) + s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, nil) if err != nil { - t.Skipf("QuestDB not available at %s: %v", qwpTestAddr, err) + t.Fatalf("connect ws://%s: %v", qwpTestAddr, err) } return s } @@ -1431,14 +1454,14 @@ func newOrSkip(t *testing.T, ctx context.Context) QwpSender { // Verifies the encoder assembles a single table block with diverse // column types and the server ingests it without coercion errors. func TestQwpIntegrationWriteAllTypesInOneRow(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_all_types" qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) ts := time.Date(2022, 2, 25, 0, 0, 0, 0, time.UTC) @@ -1488,7 +1511,7 @@ func TestQwpIntegrationWriteAllTypesInOneRow(t *testing.T) { // contaminate (i.e. send a LONG payload under the DOUBLE schema or // vice versa). func TestQwpIntegrationSchemaIsolation(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableA := "qwp_integ_iso_a" @@ -1498,7 +1521,7 @@ func TestQwpIntegrationSchemaIsolation(t *testing.T) { defer qwpDropTable(t, tableA) defer qwpDropTable(t, tableB) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) base := time.Date(2024, 6, 15, 12, 0, 0, 0, time.UTC) @@ -1583,14 +1606,14 @@ func columnType(t *testing.T, tableName, column string) string { // Java test pre-creates a bare table and verifies StringColumn adds // a VARCHAR column. 
func TestQwpIntegrationAutoCreateVarcharColumn(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_auto_varchar" qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) ts := time.Date(2024, 6, 15, 12, 0, 0, 0, time.UTC) @@ -1618,12 +1641,12 @@ func TestQwpIntegrationAutoCreateVarcharColumn(t *testing.T) { // exercise the buffer path, so this is an integration test, not a // pure unit test. func TestQwpIntegrationNameValidation(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() ts := time.Date(2024, 6, 15, 12, 0, 0, 0, time.UTC) t.Run("EmptyTableName", func(t *testing.T) { - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) err := s.Table("").Int64Column("v", 1).At(ctx, ts) if err == nil { @@ -1639,7 +1662,7 @@ func TestQwpIntegrationNameValidation(t *testing.T) { qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) err := s.Table(tableName).Int64Column("", 42).At(ctx, ts) if err == nil { @@ -1658,14 +1681,14 @@ func TestQwpIntegrationNameValidation(t *testing.T) { // multi-dimensional branch that the 1D test does not. 
func TestQwpIntegrationFloat64Array2D(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_f64_array_2d" qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) ts := time.Date(2024, 6, 15, 12, 0, 0, 0, time.UTC) @@ -1696,14 +1719,14 @@ func TestQwpIntegrationFloat64Array2D(t *testing.T) { } func TestQwpIntegrationFloat64Array3D(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_f64_array_3d" qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) ts := time.Date(2024, 6, 15, 12, 0, 0, 0, time.UTC) @@ -1740,14 +1763,14 @@ func TestQwpIntegrationFloat64Array3D(t *testing.T) { // entry point goes through the websocket handshake. func TestQwpIntegrationDecimalScaleConflict(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_decimal_scale_conflict" qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) ts := time.Date(2024, 6, 15, 12, 0, 0, 0, time.UTC) @@ -1773,14 +1796,14 @@ func TestQwpIntegrationDecimalScaleConflict(t *testing.T) { } func TestQwpIntegrationColumnTypeConflict(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_type_conflict" qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) ts := time.Date(2024, 6, 15, 12, 0, 0, 0, time.UTC) @@ -1802,14 +1825,14 @@ func TestQwpIntegrationColumnTypeConflict(t *testing.T) { } func TestQwpIntegrationDuplicateColumnInRow(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_dup_col" qwpDropTable(t, 
tableName) defer qwpDropTable(t, tableName) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) ts := time.Date(2024, 6, 15, 12, 0, 0, 0, time.UTC) @@ -1828,7 +1851,7 @@ func TestQwpIntegrationDuplicateColumnInRow(t *testing.T) { } func TestQwpIntegrationGeohashPrecisionConflict(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_geohash_prec_conflict" @@ -1841,7 +1864,7 @@ func TestQwpIntegrationGeohashPrecisionConflict(t *testing.T) { "CREATE TABLE '%s' (g GEOHASH(8c), ts TIMESTAMP) TIMESTAMP(ts) PARTITION BY DAY WAL", tableName)) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) ts := time.Date(2024, 6, 15, 12, 0, 0, 0, time.UTC) @@ -1872,7 +1895,7 @@ func TestQwpIntegrationGeohashPrecisionConflict(t *testing.T) { // rows and wait for ACKs before returning. A buggy Close that only // cancels the goroutine without flushing would silently drop data. func TestQwpIntegrationAsyncCloseFlushes(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_async_close_flushes" @@ -1880,9 +1903,9 @@ func TestQwpIntegrationAsyncCloseFlushes(t *testing.T) { defer qwpDropTable(t, tableName) // Async sender (in-flight window = 4). No explicit Flush. - s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{}, 5*time.Second, 0, 0, nil, 4) + s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, nil, 4) if err != nil { - t.Skipf("connect: %v", err) + t.Fatalf("connect: %v", err) } base := time.Date(2024, 6, 15, 12, 0, 0, 0, time.UTC) @@ -1915,7 +1938,7 @@ func TestQwpIntegrationAsyncCloseFlushes(t *testing.T) { // Java uses 200 rows with autoFlushRows=2 (100 batches) — scaled to // 100 rows / 50 batches here. 
func TestQwpIntegrationAsyncStressAcks(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_async_stress_acks" @@ -1924,9 +1947,9 @@ func TestQwpIntegrationAsyncStressAcks(t *testing.T) { // autoFlushRows=2 → 50 batches in flight for 100 rows, with the // default in-flight window the sender must recycle buffers via ACKs. - s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{}, 5*time.Second, 2, 0, nil, 4) + s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{endpointPath: qwpWritePath}, 2, 0, nil, 4) if err != nil { - t.Skipf("connect: %v", err) + t.Fatalf("connect: %v", err) } defer s.Close(ctx) @@ -1956,7 +1979,7 @@ func TestQwpIntegrationAsyncStressAcks(t *testing.T) { // per-table buffers and emits one multi-table message per flush; // losing a table mid-batch would drop rows silently. func TestQwpIntegrationAsyncMultiTable(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableA := "qwp_integ_async_multi_a" @@ -1966,9 +1989,9 @@ func TestQwpIntegrationAsyncMultiTable(t *testing.T) { defer qwpDropTable(t, tableA) defer qwpDropTable(t, tableB) - s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{}, 5*time.Second, 0, 0, nil, 4) + s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, nil, 4) if err != nil { - t.Skipf("connect: %v", err) + t.Fatalf("connect: %v", err) } defer s.Close(ctx) @@ -2007,7 +2030,7 @@ func TestQwpIntegrationAsyncMultiTable(t *testing.T) { // batch every N rows without waiting for explicit Flush(). A bug in // the row-count trigger would either stall the sender or over-flush. 
func TestQwpIntegrationAsyncRowBasedFlush(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_async_row_flush" @@ -2015,9 +2038,9 @@ func TestQwpIntegrationAsyncRowBasedFlush(t *testing.T) { defer qwpDropTable(t, tableName) // autoFlushRows=10, so 50 rows → 5 automatic flushes in async mode. - s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{}, 5*time.Second, 10, 0, nil, 4) + s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{endpointPath: qwpWritePath}, 10, 0, nil, 4) if err != nil { - t.Skipf("connect: %v", err) + t.Fatalf("connect: %v", err) } defer s.Close(ctx) @@ -2052,7 +2075,7 @@ func TestQwpIntegrationAsyncRowBasedFlush(t *testing.T) { // client (e.g. a symbol map without per-sender scoping) would corrupt // ingestion under concurrency. func TestQwpIntegrationConcurrentSenders(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() t.Run("DifferentTables", func(t *testing.T) { @@ -2073,7 +2096,7 @@ func TestQwpIntegrationConcurrentSenders(t *testing.T) { for s := 0; s < senderCount; s++ { go func(idx int) { defer wg.Done() - sender, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{}, 5*time.Second, 10, 0, nil, 4) + sender, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{endpointPath: qwpWritePath}, 10, 0, nil, 4) if err != nil { errs <- fmt.Errorf("sender %d connect: %w", idx, err) return @@ -2124,7 +2147,7 @@ func TestQwpIntegrationConcurrentSenders(t *testing.T) { for s := 0; s < senderCount; s++ { go func(idx int) { defer wg.Done() - sender, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{}, 5*time.Second, 10, 0, nil, 4) + sender, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{endpointPath: qwpWritePath}, 10, 0, nil, 4) if err != nil { errs <- fmt.Errorf("sender %d connect: %w", idx, err) return @@ -2182,14 +2205,14 @@ func 
TestQwpIntegrationConcurrentSenders(t *testing.T) { // timestamps cycle through every Gorilla bucket (0-bit, 7-bit, 9-bit, // 12-bit, 32-bit) and verifies exact per-row round-trip. func TestQwpIntegrationGorillaTimestampRoundTrip(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_gorilla_ts" qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) // Build 128 timestamps whose DoDs span all five Gorilla buckets. @@ -2243,20 +2266,22 @@ func TestQwpIntegrationGorillaTimestampRoundTrip(t *testing.T) { // --- Client-focused schema evolution test --- // -// When the user adds a new column mid-session, the client must reset -// the table's schemaId so the next flush re-registers the expanded -// schema in FULL mode (not REFERENCE mode against the stale ID). -// Otherwise the server would decode subsequent rows against the wrong -// column set and either reject them or mis-map columns. +// When the user adds a new column mid-session, the next flush must +// carry the expanded column set inline so the server decodes +// subsequent rows against the right columns. The cursor encoder +// emits FULL schema mode on every frame with schema_id=0, so +// schema evolution is "just" re-sending the new column list — +// there is no per-table invalidation step to verify on the client +// side. func TestQwpIntegrationSchemaEvolution(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_schema_evolution" qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) base := time.Date(2024, 6, 15, 12, 0, 0, 0, time.UTC) @@ -2275,8 +2300,9 @@ func TestQwpIntegrationSchemaEvolution(t *testing.T) { t.Fatalf("phase1 Flush: %v", err) } - // Phase 2: 5 rows with an added column c. 
The client must reset - // the schemaId so a new FULL-mode schema is registered. + // Phase 2: 5 rows with an added column c. The client emits a + // FULL schema block carrying the new column on every frame, + // so no per-table invalidation is needed. for i := 5; i < 10; i++ { if err := s.Table(tableName). Int64Column("a", int64(i)). @@ -2329,14 +2355,14 @@ func TestQwpIntegrationSchemaEvolution(t *testing.T) { // flushes. This writes 24 rows of alternating true/false across 3 // bytes to cover two byte boundaries. func TestQwpIntegrationBoolBitPacking(t *testing.T) { - qwpSkipIfNoServer(t) + qwpEnsureServer(t) ctx := context.Background() tableName := "qwp_integ_bool_packing" qwpDropTable(t, tableName) defer qwpDropTable(t, tableName) - s := newOrSkip(t, ctx) + s := newQwpIntegSender(t, ctx) defer s.Close(ctx) base := time.Date(2024, 6, 15, 12, 0, 0, 0, time.UTC) @@ -2374,3 +2400,45 @@ func TestQwpIntegrationBoolBitPacking(t *testing.T) { } } } + +func TestQwpIntegrationConnect(t *testing.T) { + qwpEnsureServer(t) + ctx := context.Background() + + var tr qwpTransport + err := tr.connect(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{endpointPath: qwpWritePath}) + if err != nil { + t.Fatalf("connect ws://%s: %v", qwpTestAddr, err) + } + defer tr.close() + + // Send a simple QWP message with delta symbol dict (required + // by the server for symbol columns) and verify the ACK. 
+ tb := newQwpTableBuffer("qwp_transport_test") + col, _ := tb.getOrCreateColumn("value", qwpTypeLong, false) + col.addLong(42) + colTs, _ := tb.getOrCreateColumn("ts", qwpTypeTimestamp, false) + colTs.addTimestamp(1000000) + tb.commitRow() + + var enc qwpEncoder + msg := enc.encodeTable(tb) + + t.Logf("sending QWP message (%d bytes): %x", len(msg), msg) + + if err := tr.sendMessage(ctx, msg); err != nil { + t.Fatalf("sendMessage: %v", err) + } + + status, data, err := tr.readAck(ctx) + if err != nil { + t.Fatalf("readAck: %v", err) + } + + if status != QwpStatusOK { + errStr := parseAckError(data) + t.Logf("raw ACK response (%d bytes): %x", len(data), data) + t.Fatalf("expected OK, got status 0x%02X: %s", status, errStr) + } + t.Logf("ACK OK, sequence=%d", parseAckSequence(data)) +} diff --git a/qwp_max_batch_clamp_test.go b/qwp_max_batch_clamp_test.go new file mode 100644 index 00000000..4649b626 --- /dev/null +++ b/qwp_max_batch_clamp_test.go @@ -0,0 +1,864 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ******************************************************************************/ + +package questdb + +import ( + "context" + "fmt" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "github.com/coder/websocket" +) + +// newQwpTestServerWithMaxBatch returns a mock QWP server that +// advertises the supplied X-QWP-Max-Batch-Size in its upgrade +// response. A value <= 0 omits the header entirely (matches the +// older-server case the clamp must treat as "no cap"). +func newQwpTestServerWithMaxBatch(t *testing.T, maxBatchSize int) *httptest.Server { + t.Helper() + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set(qwpHeaderVersion, "1") + if maxBatchSize > 0 { + w.Header().Set(qwpHeaderMaxBatchSize, fmt.Sprintf("%d", maxBatchSize)) + } + conn, err := websocket.Accept(w, r, nil) + if err != nil { + t.Logf("websocket accept error: %v", err) + return + } + defer conn.CloseNow() + + var seq int64 + for { + _, _, err := conn.Read(context.Background()) + if err != nil { + return + } + conn.Write(context.Background(), websocket.MessageBinary, buildAckOK(seq)) + seq++ + } + })) +} + +// TestQwpServerMaxBatchSizeParsed pins the raw header→transport +// plumbing: the parsed cap lands on qwpTransport.serverMaxBatchSize +// for any positive integer value, and stays at 0 when the header +// is absent or unparseable. 
+func TestQwpServerMaxBatchSizeParsed(t *testing.T) { + cases := []struct { + name string + header string // "" means do not send the header + expected int32 + }{ + {"absent", "", 0}, + {"positive_2mb", "2097152", 2 * 1024 * 1024}, + {"positive_16mb", "16777216", 16 * 1024 * 1024}, + {"zero", "0", 0}, + {"negative", "-1", 0}, + {"garbage", "not-a-number", 0}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set(qwpHeaderVersion, "1") + if tc.header != "" { + w.Header().Set(qwpHeaderMaxBatchSize, tc.header) + } + conn, err := websocket.Accept(w, r, nil) + if err != nil { + return + } + defer conn.CloseNow() + for { + if _, _, err := conn.Read(context.Background()); err != nil { + return + } + } + })) + defer srv.Close() + + s := newQwpSenderForTest(t, srv.URL) + defer s.Close(context.Background()) + tr := s.cursorSendLoop.transport.Load() + if tr == nil { + t.Fatalf("no transport bound after initial connect") + } + if tr.serverMaxBatchSize != tc.expected { + t.Fatalf("serverMaxBatchSize = %d, want %d (header=%q)", + tr.serverMaxBatchSize, tc.expected, tc.header) + } + }) + } +} + +// TestQwpApplyServerBatchSizeLimit exercises the clamp resolution +// table in isolation. Constructed without a real transport so each +// case can dial in autoFlushBytes + the synthetic cap directly, +// matching Java's applyServerBatchSizeLimit case analysis. +// +// Also pins that s.serverMaxBatchSize mirrors the transport's cap +// regardless of the opt-out / no-cap branches — the per-row hard +// guard and the flush-time defensive guard read this mirror, +// independent of the soft auto-flush trigger. 
+func TestQwpApplyServerBatchSizeLimit(t *testing.T) { + cases := []struct { + name string + autoFlushBytes int + serverCap int32 + expectEffective int64 + expectMirrorCap int32 + passNilTransport bool + }{ + // User opt-out wins for the auto-flush trigger; the raw cap + // still mirrors so the per-row hard guard fires. + {"optout_no_cap", 0, 0, 0, 0, false}, + {"optout_with_cap", 0, 1024 * 1024, 0, 1024 * 1024, false}, + // No server cap: configured value passes through; mirror is 0. + {"no_cap_keeps_configured", 8 << 20, 0, 8 << 20, 0, false}, + {"nil_transport_keeps_configured", 8 << 20, 0, 8 << 20, 0, true}, + // Configured below safe budget: configured wins. + {"configured_below_90pct", 1 << 20, 16 << 20, 1 << 20, 16 << 20, false}, + // Configured above safe budget: clamped to floor(cap*9/10). + {"clamp_to_90pct_of_16mb", 16 << 20, 16 << 20, int64(16<<20) * 9 / 10, 16 << 20, false}, + {"clamp_to_90pct_of_2mb", 8 << 20, 2 << 20, int64(2<<20) * 9 / 10, 2 << 20, false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + s := &qwpLineSender{autoFlushBytes: tc.autoFlushBytes} + var tr *qwpTransport + if !tc.passNilTransport { + tr = &qwpTransport{serverMaxBatchSize: tc.serverCap} + } + s.applyServerBatchSizeLimit(tr) + if got := s.effectiveAutoFlushBytes.Load(); got != tc.expectEffective { + t.Fatalf("effectiveAutoFlushBytes = %d, want %d", got, tc.expectEffective) + } + if got := s.serverMaxBatchSize.Load(); got != tc.expectMirrorCap { + t.Fatalf("serverMaxBatchSize mirror = %d, want %d", got, tc.expectMirrorCap) + } + }) + } +} + +// TestQwpEffectiveAutoFlushBytesSeededOnConnect verifies that the +// end-to-end conf-driven path (LineSenderFromConf → memory mode) +// seeds the sender's effectiveAutoFlushBytes from the server's +// advertised cap on the initial connect, without relying on a +// follow-up reconnect. 
+func TestQwpEffectiveAutoFlushBytesSeededOnConnect(t *testing.T) { + // Advertise a 2 MiB cap — below the memory-mode segment cap + // (qwpSfDefaultMaxBytes, 4 MiB) so the server cap is unambiguously + // the binding term. Configured auto_flush_bytes default is 8 MiB + // (qwpDefaultAutoFlushBytes), so the clamp must reduce it to + // floor(2 MiB * 9/10). + const serverCap = 2 * 1024 * 1024 + srv := newQwpTestServerWithMaxBatch(t, serverCap) + defer srv.Close() + + addr := strings.TrimPrefix(srv.URL, "http://") + conf, err := confFromStr("ws::addr=" + addr + ";") + if err != nil { + t.Fatalf("confFromStr: %v", err) + } + if conf.autoFlushBytes != qwpDefaultAutoFlushBytes { + t.Fatalf("test precondition: autoFlushBytes default = %d, want %d", + conf.autoFlushBytes, qwpDefaultAutoFlushBytes) + } + + ls, err := LineSenderFromConf(context.Background(), "ws::addr="+addr+";") + if err != nil { + t.Fatalf("LineSenderFromConf: %v", err) + } + defer ls.Close(context.Background()) + + s, ok := ls.(*qwpLineSender) + if !ok { + t.Fatalf("LineSenderFromConf returned %T, want *qwpLineSender", ls) + } + got := s.effectiveAutoFlushBytes.Load() + want := int64(serverCap) * 9 / 10 + if got != want { + t.Fatalf("effectiveAutoFlushBytes = %d, want %d (90%% of %d)", + got, want, serverCap) + } +} + +// TestQwpEffectiveAutoFlushBytesClampedToSegmentWhenServerHasNoCap +// pins the "older server" case: when the upgrade response omits +// X-QWP-Max-Batch-Size, the per-segment frame cap is the binding +// floor. The configured 8 MiB default would otherwise let a batch +// grow past the 4 MiB memory-mode segment and wedge on flush, so the +// trigger is clamped to floor(maxFrameBytes * 9/10) regardless of the +// (absent) server cap. 
+func TestQwpEffectiveAutoFlushBytesClampedToSegmentWhenServerHasNoCap(t *testing.T) { + srv := newQwpTestServerWithMaxBatch(t, 0) // header omitted + defer srv.Close() + + addr := strings.TrimPrefix(srv.URL, "http://") + ls, err := LineSenderFromConf(context.Background(), "ws::addr="+addr+";") + if err != nil { + t.Fatalf("LineSenderFromConf: %v", err) + } + defer ls.Close(context.Background()) + + s := ls.(*qwpLineSender) + maxFrame := int64(qwpSfDefaultMaxBytes) - qwpSfHeaderSize - qwpSfFrameHeaderSize + want := maxFrame * 9 / 10 + if got := s.effectiveAutoFlushBytes.Load(); got != want { + t.Fatalf("effectiveAutoFlushBytes = %d, want %d (segment clamp with server cap unset)", + got, want) + } +} + +// TestQwpEffectiveAutoFlushBytesKeptWhenBelowSegmentCap pins that a +// configured byte trigger comfortably under the segment cap flows +// through unchanged: the segment clamp only ever reduces, never +// inflates. Companion to the segment-clamp floor test above. +func TestQwpEffectiveAutoFlushBytesKeptWhenBelowSegmentCap(t *testing.T) { + srv := newQwpTestServerWithMaxBatch(t, 0) // no server cap + defer srv.Close() + + addr := strings.TrimPrefix(srv.URL, "http://") + // 1 MiB is well under both the 4 MiB segment and its 90% clamp. + const configured = 1 * 1024 * 1024 + ls, err := LineSenderFromConf(context.Background(), + "ws::addr="+addr+";auto_flush_bytes=1m;") + if err != nil { + t.Fatalf("LineSenderFromConf: %v", err) + } + defer ls.Close(context.Background()) + + s := ls.(*qwpLineSender) + if got := s.effectiveAutoFlushBytes.Load(); got != int64(configured) { + t.Fatalf("effectiveAutoFlushBytes = %d, want %d (configured, below segment cap)", + got, configured) + } +} + +// TestQwpEffectiveAutoFlushBytesPreservesOptout pins that +// auto_flush_bytes=off survives a server cap advertisement: the +// user's explicit opt-out wins. 
+func TestQwpEffectiveAutoFlushBytesPreservesOptout(t *testing.T) { + srv := newQwpTestServerWithMaxBatch(t, 4*1024*1024) + defer srv.Close() + + addr := strings.TrimPrefix(srv.URL, "http://") + ls, err := LineSenderFromConf(context.Background(), + "ws::addr="+addr+";auto_flush_bytes=off;") + if err != nil { + t.Fatalf("LineSenderFromConf: %v", err) + } + defer ls.Close(context.Background()) + + s := ls.(*qwpLineSender) + if got := s.effectiveAutoFlushBytes.Load(); got != 0 { + t.Fatalf("effectiveAutoFlushBytes = %d, want 0 (auto_flush_bytes=off)", + got) + } +} + +// TestQwpSwapClientFiresOnTransportSwap pins the swap → callback +// wire: invoking swapClient with a fresh transport runs the +// installed onTransportSwap callback, passing the freshly bound +// transport. This is the seam the sender relies on to re-apply +// the auto_flush_bytes clamp after every reconnect, so a +// regression that severed the wire would silently strand the +// clamp at its initial value across a rolling-upgrade boundary. 
+func TestQwpSwapClientFiresOnTransportSwap(t *testing.T) { + srv := newQwpTestServerWithMaxBatch(t, 4*1024*1024) + defer srv.Close() + + engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second) + if err != nil { + t.Fatalf("qwpSfNewCursorEngine: %v", err) + } + defer func() { _ = engine.engineClose() }() + + dial := func(ctx context.Context, _ int) (*qwpTransport, error) { + var tr qwpTransport + wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") + if err := tr.connect(ctx, wsURL, qwpTransportOpts{endpointPath: qwpWritePath}); err != nil { + return nil, err + } + return &tr, nil + } + + initial, err := dial(context.Background(), 0) + if err != nil { + t.Fatalf("initial dial: %v", err) + } + loop := qwpSfNewSendLoop(engine, initial, dial, + 100*time.Microsecond, time.Second, 10*time.Millisecond, 100*time.Millisecond) + defer func() { _ = loop.sendLoopClose() }() + + // Don't start the loop — swapClient is callable independently + // of the run() loop, and avoiding sendLoopStart keeps the test + // deterministic (no reconnect-machinery races). + var ( + fired int + lastCapArg int32 + ) + loop.sendLoopSetOnTransportSwap(func(t *qwpTransport) { + fired++ + if t != nil { + lastCapArg = t.serverMaxBatchSize + } + }) + + // Swap to a fresh transport (re-dialled against the same + // server, so the same cap should re-arrive). + replacement, err := dial(context.Background(), 0) + if err != nil { + t.Fatalf("replacement dial: %v", err) + } + if err := loop.swapClient(replacement); err != nil { + t.Fatalf("swapClient: %v", err) + } + if fired != 1 { + t.Fatalf("onTransportSwap fired %d times, want 1", fired) + } + if lastCapArg != 4*1024*1024 { + t.Fatalf("callback saw cap=%d, want %d", lastCapArg, 4*1024*1024) + } + + // Second swap also fires the callback — the wire is not + // one-shot. 
+ replacement2, err := dial(context.Background(), 0) + if err != nil { + t.Fatalf("second replacement dial: %v", err) + } + if err := loop.swapClient(replacement2); err != nil { + t.Fatalf("second swapClient: %v", err) + } + if fired != 2 { + t.Fatalf("onTransportSwap fired %d times after second swap, want 2", fired) + } + + // Clearing the callback turns off the wire. + loop.sendLoopSetOnTransportSwap(nil) + replacement3, err := dial(context.Background(), 0) + if err != nil { + t.Fatalf("third replacement dial: %v", err) + } + if err := loop.swapClient(replacement3); err != nil { + t.Fatalf("third swapClient: %v", err) + } + if fired != 2 { + t.Fatalf("onTransportSwap fired %d times after clear, want 2", fired) + } +} + +// TestQwpPerRowGuardFires verifies the per-row hard guard catches a +// single row whose buffered bytes already exceed the server's wire +// cap, before commitRow makes the row visible to the batch. Uses +// auto_flush_bytes=off so the soft trigger does not race the guard. +func TestQwpPerRowGuardFires(t *testing.T) { + // 64 bytes cap — any non-trivial row trips it. + srv := newQwpTestServerWithMaxBatch(t, 64) + defer srv.Close() + + addr := strings.TrimPrefix(srv.URL, "http://") + ls, err := LineSenderFromConf(context.Background(), + "ws::addr="+addr+";auto_flush_bytes=off;") + if err != nil { + t.Fatalf("LineSenderFromConf: %v", err) + } + defer ls.Close(context.Background()) + + // 200-byte string column alone exceeds the 64-byte cap. + err = ls.Table("t"). + StringColumn("big", strings.Repeat("x", 200)). 
+ AtNow(context.Background()) + if err == nil { + t.Fatal("expected per-row guard to fire, got nil error") + } + if !strings.Contains(err.Error(), "row too large for server batch cap") { + t.Fatalf("error = %q, want substring %q", err.Error(), + "row too large for server batch cap") + } + if !strings.Contains(err.Error(), "serverMaxBatchSize=64") { + t.Fatalf("error = %q, want it to name the cap", err.Error()) + } + + // Sender stays usable: the failed row's bytes were discarded + // via cancelRow, and the next Table() call starts a clean row. + // We can't easily flush anything meaningful through a 64-byte + // cap, so just check that the sender does not latch an error. + s := ls.(*qwpLineSender) + if s.pendingRowCount != 0 { + t.Fatalf("pendingRowCount = %d after guard fire, want 0", + s.pendingRowCount) + } +} + +// TestQwpPerRowGuardPreservesPriorCommittedRows verifies the per-row +// guard rolls back ONLY the offending row — earlier rows in the +// batch stay intact and remain flushable. This is the property that +// makes the guard recoverable instead of catastrophic. +func TestQwpPerRowGuardPreservesPriorCommittedRows(t *testing.T) { + // 1024 bytes cap: small rows fit, a 2000-byte string does not. + srv := newQwpTestServerWithMaxBatch(t, 1024) + defer srv.Close() + + addr := strings.TrimPrefix(srv.URL, "http://") + ls, err := LineSenderFromConf(context.Background(), + "ws::addr="+addr+";auto_flush_bytes=off;") + if err != nil { + t.Fatalf("LineSenderFromConf: %v", err) + } + defer ls.Close(context.Background()) + + ctx := context.Background() + // Two small rows commit cleanly. + for i := 0; i < 2; i++ { + if err := ls.Table("t"). + Symbol("s", "a"). + Int64Column("x", int64(i)). + AtNow(ctx); err != nil { + t.Fatalf("AtNow[%d]: %v", i, err) + } + } + + // Third row is oversize; guard fires. + err = ls.Table("t"). + StringColumn("big", strings.Repeat("x", 2000)). 
+ AtNow(ctx) + if err == nil { + t.Fatal("expected per-row guard to fire, got nil error") + } + if !strings.Contains(err.Error(), "row too large for server batch cap") { + t.Fatalf("error = %q, want guard-fire substring", err.Error()) + } + + // The two earlier rows are still pending; flush succeeds (the + // encoded frame for two small rows + schema stays under 1024). + s := ls.(*qwpLineSender) + if s.pendingRowCount != 2 { + t.Fatalf("pendingRowCount = %d after guard fire, want 2 (prior rows preserved)", + s.pendingRowCount) + } + if err := ls.Flush(ctx); err != nil { + t.Fatalf("Flush of prior rows: %v", err) + } + if s.pendingRowCount != 0 { + t.Fatalf("pendingRowCount = %d after flush, want 0", s.pendingRowCount) + } +} + +// TestQwpPerRowGuardClearsCachedDesignatedTs is a regression test +// for a silent data-loss bug: when the per-row guard fires on the +// FIRST row of a fresh table, cancelRow removes the just-created +// designated-TS column from tb.columns (committedColumnCount is 0, +// so all uncommitted columns get wiped). But s.cachedDesignatedTs +// still holds a pointer to that now-orphaned column, with col.table +// still pointing at the same tb. On the user's retry, the cache +// staleness check at atWithTimestamp passes (col.table matches, +// typeCode matches), the orphan is reused, addTimestamp writes +// into a column that is no longer in tb.columns, and commitRow +// (which only iterates tb.columns) commits the row without a +// designated timestamp. The encoder then ships the row on the wire +// with NO designated-TS column. The fix is to nil out +// s.cachedDesignatedTs on every cancelRow path in atWithTimestamp. 
+func TestQwpPerRowGuardClearsCachedDesignatedTs(t *testing.T) { + srv := newQwpTestServerWithMaxBatch(t, 64) + defer srv.Close() + + addr := strings.TrimPrefix(srv.URL, "http://") + ls, err := LineSenderFromConf(context.Background(), + "ws::addr="+addr+";auto_flush_bytes=off;") + if err != nil { + t.Fatalf("LineSenderFromConf: %v", err) + } + defer ls.Close(context.Background()) + + ctx := context.Background() + + // First attempt on a fresh table: a 200-byte string trips the + // per-row guard. cancelRow wipes both the string column and the + // designated-TS column from tb.columns because committedColumnCount + // is 0. s.cachedDesignatedTs is left pointing at the orphaned + // "" column. + err = ls.Table("t"). + StringColumn("big", strings.Repeat("x", 200)). + At(ctx, time.Unix(0, 1_000_000_000)) + if err == nil { + t.Fatal("expected per-row guard to fire, got nil error") + } + if !strings.Contains(err.Error(), "row too large for server batch cap") { + t.Fatalf("error = %q, want guard-fire substring", err.Error()) + } + + // Retry on the same table with a small row + explicit At(ts). + // If cachedDesignatedTs was cleared, getOrCreateDesignatedTimestamp + // runs and re-creates the "" column in tb.columns. If not, the + // staleness check skips the lookup, the orphan is reused, and + // commitRow runs without a "" column in tb.columns. + if err := ls.Table("t"). + Symbol("s", "a"). + At(ctx, time.Unix(0, 2_000_000_000)); err != nil { + t.Fatalf("retry At: %v", err) + } + + s := ls.(*qwpLineSender) + tb, ok := s.tableBuffers["t"] + if !ok || tb == nil { + t.Fatal("table buffer for 't' missing after retry") + } + + // The designated-TS column lives under the empty-string key. + // If the bug is present, cancelRow removed it from columnIndex + // and the cached-orphan reuse meant nothing re-added it. 
+ if _, ok := tb.columnIndex[""]; !ok { + names := make([]string, 0, len(tb.columns)) + for _, c := range tb.columns { + names = append(names, c.name) + } + t.Fatalf("designated-TS column missing from columnIndex after retry; tb.columns=%v", names) + } + var dtCol *qwpColumnBuffer + for _, c := range tb.columns { + if c.name == "" { + dtCol = c + break + } + } + if dtCol == nil { + names := make([]string, 0, len(tb.columns)) + for _, c := range tb.columns { + names = append(names, c.name) + } + t.Fatalf("designated-TS column not present in tb.columns after retry; tb.columns=%v", names) + } + // The retry committed exactly one row; the designated-TS column + // must reflect that row in its data (i.e., be encoded with the + // rest of the table). + if dtCol.rowCount != 1 { + t.Fatalf("designated-TS column rowCount = %d, want 1 (retry row committed with timestamp)", + dtCol.rowCount) + } +} + +// TestQwpPerRowGuardNoOpWhenServerHasNoCap pins the older-server +// path: when the upgrade response omits X-QWP-Max-Batch-Size, the +// per-row guard short-circuits and an arbitrarily large row commits +// without complaint. Important so an older server in a rolling +// upgrade doesn't suddenly start rejecting rows the client was +// happily sending before. +func TestQwpPerRowGuardNoOpWhenServerHasNoCap(t *testing.T) { + srv := newQwpTestServerWithMaxBatch(t, 0) // header omitted + defer srv.Close() + + addr := strings.TrimPrefix(srv.URL, "http://") + ls, err := LineSenderFromConf(context.Background(), + "ws::addr="+addr+";auto_flush_bytes=off;") + if err != nil { + t.Fatalf("LineSenderFromConf: %v", err) + } + defer ls.Close(context.Background()) + + // Append a row with a moderately large string. Would trip the + // guard against any reasonable cap, but here the server + // advertised none. + if err := ls.Table("t"). + StringColumn("big", strings.Repeat("x", 10_000)). 
+ AtNow(context.Background()); err != nil { + t.Fatalf("AtNow with large string and no advertised cap: %v", err) + } + s := ls.(*qwpLineSender) + if s.pendingRowCount != 1 { + t.Fatalf("pendingRowCount = %d, want 1", s.pendingRowCount) + } +} + +// TestQwpPerRowGuardMidRowCapTransition is a regression test for the +// false-reject path on async-initial-connect senders that opted out +// of both byte-size triggers (autoFlushBytes=0, maxBufSize=0). The +// Table()-entry snapshot of currentTableBytesBefore was previously +// gated on at least one of {maxBufSize, autoFlushBytes, +// serverMaxBatchSize} being non-zero. If the user opted out of the +// first two and the initial connect was still pending, all three +// gate inputs were zero, so the snapshot was skipped. If +// serverMaxBatchSize then flipped from 0 to positive between Table() +// and At() (the I/O goroutine's onTransportSwap callback firing +// mid-row), the per-row guard ran with a stale baseline of 0 and +// computed rowBytes = tb.dataSize - 0 = bytes of every committed +// row in the buffer, not the in-progress row alone. A valid row +// whose true delta fit under the cap was rejected as "row too +// large". Fix is to always snapshot at Table() entry; this test +// fires the mid-row transition deterministically by flipping +// serverMaxBatchSize directly. +func TestQwpPerRowGuardMidRowCapTransition(t *testing.T) { + ctx := context.Background() + ts := time.Unix(0, 1_000_000_000) + + s := &qwpLineSender{ + tableBuffers: make(map[string]*qwpTableBuffer), + globalSymbols: make(map[string]int32), + maxSentSymbolId: -1, + batchMaxSymbolId: -1, + // autoFlushBytes / maxBufSize both 0 — user opted out of + // both byte-size triggers. + } + + // Row 1 commits while serverMaxBatchSize is still 0 (initial + // connect not yet completed). Use a moderately large string so + // row 1 alone occupies enough buffer bytes that a small cap can + // distinguish "row 2 alone" from "row 1 + row 2". 
+ if err := s.Table("t"). + Symbol("s", "a"). + StringColumn("big", strings.Repeat("x", 500)). + At(ctx, ts); err != nil { + t.Fatalf("row 1 At: %v", err) + } + if s.pendingRowCount != 1 { + t.Fatalf("after row 1: pendingRowCount = %d, want 1", s.pendingRowCount) + } + tb := s.tableBuffers["t"] + row1Bytes := tb.approxDataSize() + if row1Bytes < 500 { + t.Fatalf("row 1 buffered bytes = %d, want >= 500", row1Bytes) + } + + // Row 2: open the row first while cap is still 0, then flip the + // cap to a value that's well under the cumulative buffer total + // but well above any plausible per-row-2 delta. The mid-row + // flip simulates the async-initial-connect onTransportSwap + // callback racing the producer. + s.Table("t").Symbol("s", "b").Int64Column("v", int64(42)) + + // Pick a cap that's a safe margin above row 2's true delta but + // strictly below the cumulative buffer size. 200 bytes comfortably + // fits row 2's symbol+int delta (well under 100 bytes) while + // staying under row1Bytes (>= 500). + const cap = int32(200) + if int(cap) >= row1Bytes { + t.Fatalf("test setup: cap %d must be < row1Bytes %d", cap, row1Bytes) + } + s.serverMaxBatchSize.Store(cap) + + // With the fix in place, the snapshot taken at Table() entry + // reflects the post-row-1 buffer size, so the per-row guard + // computes rowBytes against just row 2's delta and passes. + // With the pre-fix gate, currentTableBytesBefore stayed at 0 + // and the guard would compute rowBytes = full buffer total and + // falsely reject. 
+ if err := s.At(ctx, ts.Add(time.Microsecond)); err != nil { + t.Fatalf("row 2 At (mid-row cap transition): %v", err) + } + if s.pendingRowCount != 2 { + t.Fatalf("after row 2: pendingRowCount = %d, want 2 (row 2 should have committed)", + s.pendingRowCount) + } +} + +// TestQwpFlushTimeGuardFires verifies the defensive cap check at +// encode time catches the case where individual rows fit under the +// cap but their cumulative encoded frame (schema, dict, headers, +// row data) does not. Drops all pending state in-place and surfaces +// a typed error naming the size, cap, and dropped-row count. +func TestQwpFlushTimeGuardFires(t *testing.T) { + // Small cap; many small rows; auto-flush off so we control + // exactly when the flush triggers. + const serverCap = 256 + srv := newQwpTestServerWithMaxBatch(t, serverCap) + defer srv.Close() + + addr := strings.TrimPrefix(srv.URL, "http://") + ls, err := LineSenderFromConf(context.Background(), + "ws::addr="+addr+";auto_flush_bytes=off;") + if err != nil { + t.Fatalf("LineSenderFromConf: %v", err) + } + defer ls.Close(context.Background()) + + ctx := context.Background() + const rows = 100 + for i := 0; i < rows; i++ { + if err := ls.Table("t"). + Symbol("s", "abc"). + Int64Column("x", int64(i)). 
+ AtNow(ctx); err != nil { + t.Fatalf("AtNow[%d]: %v", i, err) + } + } + s := ls.(*qwpLineSender) + if s.pendingRowCount != rows { + t.Fatalf("pendingRowCount before flush = %d, want %d (per-row guard misfire?)", + s.pendingRowCount, rows) + } + + err = ls.Flush(ctx) + if err == nil { + t.Fatalf("expected flush-time defensive guard to fire, got nil error") + } + if !strings.Contains(err.Error(), "batch too large for server batch cap") { + t.Fatalf("error = %q, want guard-fire substring", err.Error()) + } + wantDroppedSub := fmt.Sprintf("droppedRows=%d", rows) + if !strings.Contains(err.Error(), wantDroppedSub) { + t.Fatalf("error = %q, want %q substring", err.Error(), wantDroppedSub) + } + if !strings.Contains(err.Error(), fmt.Sprintf("serverMaxBatchSize=%d", serverCap)) { + t.Fatalf("error = %q, want serverMaxBatchSize=%d substring", err.Error(), serverCap) + } +} + +// TestQwpFlushTimeGuardResetsPendingState verifies the sender is +// usable after the defensive guard fires: pendingRowCount returns +// to 0, table buffers are cleared, and a subsequent small flush +// goes through cleanly. +func TestQwpFlushTimeGuardResetsPendingState(t *testing.T) { + const serverCap = 256 + srv := newQwpTestServerWithMaxBatch(t, serverCap) + defer srv.Close() + + addr := strings.TrimPrefix(srv.URL, "http://") + ls, err := LineSenderFromConf(context.Background(), + "ws::addr="+addr+";auto_flush_bytes=off;") + if err != nil { + t.Fatalf("LineSenderFromConf: %v", err) + } + defer ls.Close(context.Background()) + + ctx := context.Background() + for i := 0; i < 100; i++ { + if err := ls.Table("t"). + Symbol("s", "abc"). + Int64Column("x", int64(i)). 
+ AtNow(ctx); err != nil { + t.Fatalf("AtNow[%d]: %v", i, err) + } + } + if err := ls.Flush(ctx); err == nil { + t.Fatal("expected flush-time guard to fire") + } + + s := ls.(*qwpLineSender) + if s.pendingRowCount != 0 { + t.Fatalf("pendingRowCount = %d after guard fire, want 0", s.pendingRowCount) + } + if s.pendingBytes != 0 { + t.Fatalf("pendingBytes = %d after guard fire, want 0", s.pendingBytes) + } + + // Sender should still accept new rows. The encoded frame for + // a single small row fits under the cap. + if err := ls.Table("t"). + Int64Column("x", 1). + AtNow(ctx); err != nil { + t.Fatalf("AtNow after guard reset: %v", err) + } + if err := ls.Flush(ctx); err != nil { + t.Fatalf("Flush of single row after reset: %v", err) + } +} + +// TestQwpFlushTimeGuardNoOpWhenServerHasNoCap is the +// flush-time equivalent of TestQwpPerRowGuardNoOpWhenServerHasNoCap: +// no advertised cap means the encoder's output flows straight to +// engineAppendBlocking, regardless of how large the encoded frame +// is. +func TestQwpFlushTimeGuardNoOpWhenServerHasNoCap(t *testing.T) { + srv := newQwpTestServerWithMaxBatch(t, 0) + defer srv.Close() + + addr := strings.TrimPrefix(srv.URL, "http://") + ls, err := LineSenderFromConf(context.Background(), + "ws::addr="+addr+";auto_flush_bytes=off;") + if err != nil { + t.Fatalf("LineSenderFromConf: %v", err) + } + defer ls.Close(context.Background()) + + ctx := context.Background() + for i := 0; i < 100; i++ { + if err := ls.Table("t"). + Symbol("s", "abc"). + Int64Column("x", int64(i)). + AtNow(ctx); err != nil { + t.Fatalf("AtNow[%d]: %v", i, err) + } + } + if err := ls.Flush(ctx); err != nil { + t.Fatalf("Flush with no advertised cap: %v", err) + } +} + +// TestQwpClampDrivesAutoFlushTrigger demonstrates the end-to-end +// behavior change: with auto_flush_bytes configured to 8 MiB and +// the server advertising a 256 KiB cap, the per-row trigger fires +// at the clamped threshold (~230 KiB) instead of the configured +// 8 MiB. 
Verified by counting flushes after writing enough bytes +// to cross the clamped threshold but stay under the configured +// one. +func TestQwpClampDrivesAutoFlushTrigger(t *testing.T) { + // Force a tiny server cap so a small number of rows crosses + // the clamped threshold within a reasonable test runtime. + const serverCap = 256 * 1024 // 256 KiB + srv := newQwpTestServerWithMaxBatch(t, serverCap) + defer srv.Close() + + addr := strings.TrimPrefix(srv.URL, "http://") + // Configured value (8 MiB default) is well above the clamp, + // so any flush observed here is the clamp's doing. + ls, err := LineSenderFromConf(context.Background(), "ws::addr="+addr+";") + if err != nil { + t.Fatalf("LineSenderFromConf: %v", err) + } + defer ls.Close(context.Background()) + + s := ls.(*qwpLineSender) + wantThreshold := int64(serverCap) * 9 / 10 + if got := s.effectiveAutoFlushBytes.Load(); got != wantThreshold { + t.Fatalf("effectiveAutoFlushBytes = %d, want %d", got, wantThreshold) + } + + // Write rows until pendingBytes would cross the clamped + // threshold. Each row is small (under 1 KiB once encoded), so + // the trigger fires partway through the loop. After auto-flush, + // pendingRowCount resets to 0; we assert it did at least once. + ctx := context.Background() + flushed := false + for i := 0; i < 4096 && !flushed; i++ { + if err := s.Table("clamp_test"). + Symbol("host", "h1"). + Int64Column("v", int64(i)). + At(ctx, time.Unix(0, int64(i+1)*1_000_000)); err != nil { + t.Fatalf("At[%d]: %v", i, err) + } + if s.pendingRowCount == 0 { + // auto-flush triggered and reset state. 
+ flushed = true + } + } + if !flushed { + t.Fatalf("auto-flush never triggered: pendingBytes=%d, threshold=%d", + s.pendingBytes, wantThreshold) + } +} diff --git a/qwp_query_batch.go b/qwp_query_batch.go new file mode 100644 index 00000000..040fe5ab --- /dev/null +++ b/qwp_query_batch.go @@ -0,0 +1,1226 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "encoding/binary" + "fmt" + "math" + "slices" + "unsafe" +) + +// qwpColumnSchemaInfo captures the per-column metadata carried in the +// schema section of a RESULT_BATCH frame. One instance per column; +// the decoder parses it from the first batch of a query (batch_seq == +// 0) and reuses it across that query's continuation batches. +// +// Named with the "Schema" infix to avoid colliding with the +// `qwpColumnInfo` struct already defined in `qwp_integration_test.go` +// (which is the JSON shape returned by QuestDB's /exec endpoint). 
+type qwpColumnSchemaInfo struct { + name string + wireType qwpTypeCode +} + +// qwpSymbolEntry points to one entry in a connection-scoped symbol +// dictionary: (offset, length) into the heap, packed into two uint32s +// so the aggregate entries slice has predictable per-element size. +type qwpSymbolEntry struct { + offset uint32 + length uint32 +} + +// qwpSymbolDictView is a snapshot view over the decoder's connection- +// scoped symbol dictionary. The underlying heap is append-only, so a +// snapshot taken at the time a column is decoded remains valid after +// subsequent batches extend the dict. The view is two slice headers +// whose lengths are frozen at snapshot time; len(entries) is the +// snapshot's dictionary size. +type qwpSymbolDictView struct { + heap []byte + entries []qwpSymbolEntry +} + +// qwpColumnLayout is the per-column parsed state for one RESULT_BATCH. +// All slice fields alias into the frame's payload (the WebSocket recv +// buffer) except `timestampBuf`, which the decoder owns because the +// Gorilla-decoded int64 values cannot be produced in-place. +// +// Lifetime: layouts live on the enclosing QwpColumnBatch and are +// grown in place by the decoder across decodes into the SAME batch. +// Two QwpBatchBuffers that the I/O goroutine alternates between +// therefore never share layout storage, so emitting batch N while +// decoding batch N+1 does not corrupt N's view. `clear` nil-s the +// slice headers but preserves backing arrays on the non-aliasing +// fields (`nonNullIdx`, `symbolRowIds`, `timestampBuf`, +// `arrayRowStart`, `arrayElems`), so subsequent decodes into the +// same batch with the same column width avoid reallocation. +type qwpColumnLayout struct { + info *qwpColumnSchemaInfo + + // scale is the decimal scale for DECIMAL64/128/256 columns. Read + // from the DATA section per batch; zero for non-decimal columns. 
+ // Stored per-layout (not on the shared qwpColumnSchemaInfo) so the + // decoder's write is exclusive to the dispatcher's per-batch + // storage — the consumer reads its own batch's layout, which + // cannot be mutated concurrently. + scale uint8 + + // precisionBits is the precision in bits for GEOHASH columns. Read + // from the DATA section per batch; zero for non-GEOHASH columns. + // See `scale` for the per-layout placement rationale. + precisionBits uint16 + + // null bitmap (LSB-first; 1 = NULL). Nil when the column has no + // nulls in this batch; the decoder skips allocating `nonNullIdx` + // on this branch and typed accessors fall back to identity indexing. + nullBitmap []byte + + // Count of non-null rows in this batch (== rowCount when nullBitmap + // is nil, else rowCount - popcount(nullBitmap)). + nonNullCount int + + // Dense index per row: nonNullIdx[row] is the position of row + // within the non-null values region, or -1 if the row is NULL. + // Nil when nullBitmap == nil (identity mapping row → row). + nonNullIdx []int32 + + // Dense values region. For fixed-width types this is nonNullCount * + // sizeBytes bytes of packed values. For STRING/VARCHAR/BINARY this + // is the (nonNullCount+1)*4-byte offsets array; the concatenated + // bytes live in `stringBytes`. For SYMBOL/ARRAY this is left nil + // and per-cell readers go through `symbolRowIds` / arrayRow*. + // For Gorilla TIMESTAMP, this aliases `timestampBuf` reinterpreted + // as bytes so the Int64 accessor path stays uniform. + values []byte + + // Concatenated UTF-8 / opaque byte region for STRING/VARCHAR/BINARY. + stringBytes []byte + + // Per-row symbol dictionary id (size rowCount; NULL rows hold + // undefined values — callers must null-check first). + symbolRowIds []int32 + + // Connection-scoped dictionary snapshot for this column's SYMBOL + // values. Zero value (nil heap) for non-SYMBOL columns. 
+ symbolDict qwpSymbolDictView + + // arrayRowStart is the byte offset within `values` where each + // array row's nDims byte begins. Size rowCount; NULL rows hold 0. + arrayRowStart []int32 + // arrayElems is the precomputed element count for each array row, + // cached at decode time so per-cell accessors avoid re-walking the + // shape header. Bounded by qwpMaxArrayElements (fits in int32). + // Size rowCount; NULL rows hold 0. + arrayElems []int32 + + // Decoder-owned decode buffer for Gorilla-encoded TIMESTAMP columns. + // Sized to nonNullCount; `values` aliases this as bytes. + timestampBuf []int64 +} + +// clear resets the layout between reuses. Backing arrays on the +// non-aliasing fields are kept so the decoder amortises allocation +// across batches of the same column width. +func (l *qwpColumnLayout) clear() { + l.info = nil + l.scale = 0 + l.precisionBits = 0 + l.nullBitmap = nil + l.nonNullCount = 0 + l.nonNullIdx = l.nonNullIdx[:0] + l.values = nil + l.stringBytes = nil + l.symbolRowIds = l.symbolRowIds[:0] + l.symbolDict = qwpSymbolDictView{} + l.arrayRowStart = l.arrayRowStart[:0] + l.arrayElems = l.arrayElems[:0] + l.timestampBuf = l.timestampBuf[:0] +} + +// denseIndex maps a logical row index to the dense-values index. The +// typed accessors call this after a null-check to find the byte offset +// of a non-null value in `values`. +func (l *qwpColumnLayout) denseIndex(row int) int { + if l.nullBitmap == nil { + return row + } + return int(l.nonNullIdx[row]) +} + +// isNull reports whether the cell at `row` is NULL in this batch. +func (l *qwpColumnLayout) isNull(row int) bool { + if l.nullBitmap == nil { + return false + } + b := l.nullBitmap[row>>3] + return b&(1<<(row&7)) != 0 +} + +// requireFixedWidth panics with a typed message when the column's wire +// type is not a fixed-width type of exactly `size` bytes. 
The bulk +// *Range accessors call this once per range — amortized over the row +// span — so a mis-typed call fails with a clear message instead of an +// opaque slice-bounds panic deep in the memmove path (e.g. Int64Range +// on a bit-packed BOOLEAN column, whose dense region is far shorter than +// toRow*8). Same-width reinterpretation is intentionally permitted: +// Int64Range on a DOUBLE column passes the guard (both are 8-byte) and +// yields the raw bits decoded as the target type. +func (l *qwpColumnLayout) requireFixedWidth(method string, size int) { + if qwpFixedTypeSize(l.info.wireType) != size { + panic(fmt.Sprintf("%s: column %q is %s, not a fixed-width %d-byte type", + method, l.info.name, qwpTypeName(l.info.wireType), size)) + } +} + +// requireArray panics with a typed message when the column is not an +// array type. The array accessors index arrayRowStart / arrayElems, +// which the decoder populates only for DOUBLE_ARRAY / LONG_ARRAY +// columns (clear() rewinds them to :0), so without this guard a +// mis-typed call panics with an opaque "index out of range [n] with +// length 0" from arrayRowStart. One byte comparison, amortized against +// the per-call shape walk / allocation. +func (l *qwpColumnLayout) requireArray(method string) { + if !qwpIsArrayType(l.info.wireType) { + panic(fmt.Sprintf("%s: column %q is %s, not an array type", + method, l.info.name, qwpTypeName(l.info.wireType))) + } +} + +// QwpColumnBatch is a column-major view over one decoded RESULT_BATCH +// frame. The batch is valid only for the duration of the current +// iteration of a *QwpQuery's `Batches()` range — its accessors return +// slice views that alias the underlying WebSocket recv buffer. Do not +// retain any returned slice or string beyond the loop iteration; use +// `CopyAll` (once implemented in the I/O-goroutine slab) if you need +// persistent copies. 
+// +// All typed accessors are safe to call on NULL rows: they return the +// zero value of their return type (0, false, "", nil). Use `IsNull` +// first if you need to distinguish. +type QwpColumnBatch struct { + payload []byte + requestId int64 + batchSeq int64 + rowCount int + columnCount int + columns []qwpColumnSchemaInfo // alias into the current query's schema + layouts []qwpColumnLayout // one per column; pool-owned + + // zstdScratch holds the decompressed body when the owning + // RESULT_BATCH carried FLAG_ZSTD. The decoder grows it to match + // the frame's content size and reuses the backing array across + // decodes into the SAME batch. The layout byte-slices alias into + // this buffer when the batch is compressed; `payload` is pointed + // at it by the decoder so the existing aliasing invariants hold + // without duplicating state. + // + // Lives on the batch (not on the decoder) for the same reason the + // layout pool does: two qwpBatchBuffers that the I/O goroutine + // alternates between must not share scratch storage, else + // decoding batch N+1 would clobber batch N's view. + zstdScratch []byte +} + +// Payload returns the raw frame payload that backs this batch. Exposed +// for byte-counting / metrics — callers must not mutate or retain it. +func (b *QwpColumnBatch) Payload() []byte { return b.payload } + +// RequestId returns the client-assigned 64-bit id from the originating +// QUERY_REQUEST. All frames for one query share the same id. +func (b *QwpColumnBatch) RequestId() int64 { return b.requestId } + +// BatchSeq returns the monotonic per-request sequence number (starts at +// 0 for the first batch of a query, increments by 1 per RESULT_BATCH). +func (b *QwpColumnBatch) BatchSeq() int64 { return b.batchSeq } + +// RowCount returns the number of rows in this batch. +func (b *QwpColumnBatch) RowCount() int { return b.rowCount } + +// ColumnCount returns the number of columns. 
+func (b *QwpColumnBatch) ColumnCount() int { return b.columnCount } + +// ColumnName returns the server-reported column name. +func (b *QwpColumnBatch) ColumnName(col int) string { return b.columns[col].name } + +// ColumnType returns the wire-type byte for the column (one of the +// `QwpType*` constants, e.g. QwpTypeInt for INT). Callers dispatch on +// this to pick the right typed accessor. +func (b *QwpColumnBatch) ColumnType(col int) byte { return byte(b.columns[col].wireType) } + +// DecimalScale returns the decimal scale for DECIMAL64/128/256 columns. +// Not meaningful for other types; returns 0. +func (b *QwpColumnBatch) DecimalScale(col int) int { return int(b.layouts[col].scale) } + +// GeohashPrecisionBits returns the precision in bits for a GEOHASH +// column. Not meaningful for other types; returns 0. +func (b *QwpColumnBatch) GeohashPrecisionBits(col int) int { + return int(b.layouts[col].precisionBits) +} + +// IsNull reports whether the cell at (col, row) is NULL in this batch. +// +// Note: QuestDB uses sentinel values for several primitive types (e.g. +// Long.MinValue for LONG, NaN for FLOAT/DOUBLE, -1 for GEOHASH). Those +// rows also return true from IsNull — the server encodes them via the +// null bitmap, so "real NaN" and "explicit NULL" are indistinguishable +// over the wire. +func (b *QwpColumnBatch) IsNull(col, row int) bool { + return b.layouts[col].isNull(row) +} + +// NonNullCount returns the count of non-null rows in a column — +// i.e. the size of the dense values region before row-level dispatch. +func (b *QwpColumnBatch) NonNullCount(col int) int { + return b.layouts[col].nonNullCount +} + +// --- Fixed-width typed accessors --- +// +// Each accessor assumes the caller knows the column's wire type. Call +// ColumnType(col) for generic dispatch; in a schema-aware query runner +// the caller already knows. NULL rows return the zero value of the +// accessor's return type. 
+// +// These per-cell accessors do not validate the wire type — that check +// stays off the hot path. A mis-typed per-cell call is undefined: +// depending on the column's element width it either reinterprets the +// underlying bytes (an 8-byte DOUBLE read through Int64 yields numeric +// noise) or panics with an out-of-range index (Int64 on a 1-byte BYTE +// or a bit-packed BOOLEAN slices past the dense region). The bulk +// *Range and array accessors DO guard — there the check amortizes over +// the call — and panic with a typed message; see their contract notes. +// +// The QwpColumn handle (`Column(col)`) duplicates each accessor body. +// Routing the batch surface through `b.Column(col).X(row)` would halve +// the maintenance surface but ~doubles per-cell latency on Go 1.26 — +// the inliner does not flatten the by-value receiver chain, so the +// QwpColumn struct construction stays on the hot path. When adding a +// new wire type, mirror it on both surfaces. + +// Bool returns the BOOLEAN value at (col, row). BOOLEAN is bit-packed +// on the wire: 8 non-null values per byte, LSB-first. +func (b *QwpColumnBatch) Bool(col, row int) bool { + l := &b.layouts[col] + if l.isNull(row) { + return false + } + idx := l.denseIndex(row) + return l.values[idx>>3]&(1<<(idx&7)) != 0 +} + +// Int8 returns the BYTE value at (col, row). +func (b *QwpColumnBatch) Int8(col, row int) int8 { + l := &b.layouts[col] + if l.isNull(row) { + return 0 + } + return int8(l.values[l.denseIndex(row)]) +} + +// Int16 returns the SHORT value at (col, row). +func (b *QwpColumnBatch) Int16(col, row int) int16 { + l := &b.layouts[col] + if l.isNull(row) { + return 0 + } + i := l.denseIndex(row) * 2 + return int16(binary.LittleEndian.Uint16(l.values[i : i+2])) +} + +// Char returns the CHAR value at (col, row) as a rune. 
The wire format +// stores CHAR as a 2-byte UTF-16 code unit — code points outside the +// BMP are not representable and the encoder refuses to emit them, so +// the returned value always fits in a uint16. +func (b *QwpColumnBatch) Char(col, row int) rune { + l := &b.layouts[col] + if l.isNull(row) { + return 0 + } + i := l.denseIndex(row) * 2 + return rune(binary.LittleEndian.Uint16(l.values[i : i+2])) +} + +// Int32 returns the INT or IPv4 value at (col, row). Both share the +// 4-byte LE wire layout; IPv4's four octets are the four bytes of the +// int32 in network-independent little-endian order. +func (b *QwpColumnBatch) Int32(col, row int) int32 { + l := &b.layouts[col] + if l.isNull(row) { + return 0 + } + i := l.denseIndex(row) * 4 + return int32(binary.LittleEndian.Uint32(l.values[i : i+4])) +} + +// Int64 returns an 8-byte column value at (col, row). Applicable to +// LONG, DATE, TIMESTAMP, TIMESTAMP_NANOS, and DECIMAL64 columns — +// they all share the int64 LE wire format. +func (b *QwpColumnBatch) Int64(col, row int) int64 { + l := &b.layouts[col] + if l.isNull(row) { + return 0 + } + i := l.denseIndex(row) * 8 + return int64(binary.LittleEndian.Uint64(l.values[i : i+8])) +} + +// Float32 returns the FLOAT value at (col, row). NULL rows return 0, +// NOT NaN — callers who want to distinguish explicit NaN from NULL +// must check IsNull first (see the note on IsNull about QuestDB's +// sentinel-based null encoding). +func (b *QwpColumnBatch) Float32(col, row int) float32 { + l := &b.layouts[col] + if l.isNull(row) { + return 0 + } + i := l.denseIndex(row) * 4 + return math.Float32frombits(binary.LittleEndian.Uint32(l.values[i : i+4])) +} + +// Float64 returns the DOUBLE value at (col, row). 
+func (b *QwpColumnBatch) Float64(col, row int) float64 { + l := &b.layouts[col] + if l.isNull(row) { + return 0 + } + i := l.denseIndex(row) * 8 + return math.Float64frombits(binary.LittleEndian.Uint64(l.values[i : i+8])) +} + +// --- Wide fixed-width: UUID, LONG256, DECIMAL128, DECIMAL256 --- + +// UuidLo returns the low 64 bits of a UUID (byte offset 0 within the +// 16-byte cell). +func (b *QwpColumnBatch) UuidLo(col, row int) int64 { + l := &b.layouts[col] + if l.isNull(row) { + return 0 + } + i := l.denseIndex(row) * 16 + return int64(binary.LittleEndian.Uint64(l.values[i : i+8])) +} + +// UuidHi returns the high 64 bits of a UUID (byte offset 8). +func (b *QwpColumnBatch) UuidHi(col, row int) int64 { + l := &b.layouts[col] + if l.isNull(row) { + return 0 + } + i := l.denseIndex(row)*16 + 8 + return int64(binary.LittleEndian.Uint64(l.values[i : i+8])) +} + +// Decimal128Lo returns the low 64 bits of a DECIMAL128 unscaled value. +// Pair with `DecimalScale(col)` to reconstruct the full decimal. +func (b *QwpColumnBatch) Decimal128Lo(col, row int) int64 { + return b.UuidLo(col, row) // same wire layout: 16 LE bytes +} + +// Decimal128Hi returns the high 64 bits of a DECIMAL128 unscaled value. +func (b *QwpColumnBatch) Decimal128Hi(col, row int) int64 { + return b.UuidHi(col, row) +} + +// Long256Word returns word `word` of a LONG256 or DECIMAL256 value at +// (col, row). word=0 is the least-significant 64 bits, word=3 the most. +// Panics on word out of [0,3] regardless of whether the row is NULL — +// that is always programmer error and should not be masked by a NULL. 
+func (b *QwpColumnBatch) Long256Word(col, row, word int) int64 { + if word < 0 || word > 3 { + panic(fmt.Sprintf("QwpColumnBatch.Long256Word: word %d out of [0,3]", word)) + } + l := &b.layouts[col] + if l.isNull(row) { + return 0 + } + i := l.denseIndex(row)*32 + word*8 + return int64(binary.LittleEndian.Uint64(l.values[i : i+8])) +} + +// --- Strings, varchars, binary --- +// +// Each zero-copy accessor returns a []byte sub-slice of the frame +// payload. The slice is valid only for the current iteration of +// `*QwpQuery.Batches()`; the next iteration reuses the same underlying +// recv buffer and the bytes are no longer stable. Call `bytes.Clone` +// (or the materializing helper) if you need to retain a value. +// +// The Java client carries two parallel "A/B" view objects per string +// column because each accessor re-points one mutable DirectUtf8String +// and the user needs two slots to hold two views at once. Go slices +// are independent value-copies of a {ptr, len, cap} triple, so every +// call produces an independent view — no A/B distinction needed. + +// Str returns the UTF-8 bytes of a STRING, VARCHAR, SYMBOL, or BINARY +// cell. Returns nil for NULL rows and for any column whose wire type +// is not one of those four — there is no way to distinguish "the row +// is NULL" from "this column is not a string" through the return value +// alone, so callers that care must know the column type up front (e.g. +// from ColumnType). The returned slice aliases the payload; do not +// retain it past the current batch iteration. 
+func (b *QwpColumnBatch) Str(col, row int) []byte { + l := &b.layouts[col] + if l.isNull(row) { + return nil + } + wt := l.info.wireType + if wt == qwpTypeSymbol { + rowIdx := l.symbolRowIds[row] + if int(rowIdx) >= len(l.symbolDict.entries) { + return nil + } + e := l.symbolDict.entries[rowIdx] + return l.symbolDict.heap[e.offset : e.offset+e.length] + } + if wt == qwpTypeVarchar || wt == qwpTypeBinary { + // Treat BINARY under Str as an opaque byte-bag view for + // callers that want the bytes without explicit BINARY + // typing. The dedicated Binary accessor is the idiomatic + // entry point for BINARY columns. + return qwpStringSlice(l, row) + } + return nil +} + +// String returns the cell at (col, row) as a newly-allocated string. +// Applicable to STRING, VARCHAR, and SYMBOL columns. Returns "" for +// NULL rows. +func (b *QwpColumnBatch) String(col, row int) string { + s := b.Str(col, row) + if s == nil { + return "" + } + return string(s) +} + +// Binary returns the opaque bytes of a BINARY cell. Returns nil for +// NULL rows. The returned slice aliases the payload. +func (b *QwpColumnBatch) Binary(col, row int) []byte { + l := &b.layouts[col] + if l.isNull(row) { + return nil + } + if l.info.wireType != qwpTypeBinary { + return nil + } + return qwpStringSlice(l, row) +} + +// qwpStringSlice implements the shared offset-decode for STRING / +// VARCHAR / BINARY. The `values` region holds a (nonNullCount+1) * +// 4-byte array of uint32 offsets into `stringBytes`; row i covers +// bytes [off[dense], off[dense+1]). 
+func qwpStringSlice(l *qwpColumnLayout, row int) []byte { + dense := l.denseIndex(row) + start := binary.LittleEndian.Uint32(l.values[dense*4:]) + end := binary.LittleEndian.Uint32(l.values[dense*4+4:]) + return l.stringBytes[start:end] +} + +// --- Arrays --- +// +// The array accessors (ArrayNDims, ArrayDim, Float64Array, Int64Array, +// and the QwpColumn *ArrayInto variants) require a DOUBLE_ARRAY or +// LONG_ARRAY column: they index the decoder's per-array side tables, +// which exist only for those types. Calling one on a non-array column +// panics with a typed message. The element accessors do not distinguish +// DOUBLE_ARRAY from LONG_ARRAY — Int64Array on a DOUBLE_ARRAY column +// reinterprets the 8-byte elements as int64 (numeric noise), the same +// same-width reinterpretation the *Range accessors allow. + +// ArrayNDims returns the dimensionality of the array value at (col, row), +// or 0 for NULL rows. +func (b *QwpColumnBatch) ArrayNDims(col, row int) int { + l := &b.layouts[col] + l.requireArray("QwpColumnBatch.ArrayNDims") + if l.isNull(row) { + return 0 + } + start := l.arrayRowStart[row] + return int(l.values[start]) +} + +// ArrayDim returns the extent of dimension `dim` of the array at +// (col, row). `dim` must be in [0, ArrayNDims(col, row)). +func (b *QwpColumnBatch) ArrayDim(col, row, dim int) int { + l := &b.layouts[col] + l.requireArray("QwpColumnBatch.ArrayDim") + if l.isNull(row) { + return 0 + } + start := int(l.arrayRowStart[row]) + nDims := int(l.values[start]) + if dim < 0 || dim >= nDims { + panic(fmt.Sprintf("QwpColumnBatch.ArrayDim: dim %d out of [0, %d)", dim, nDims)) + } + off := start + 1 + dim*4 + return int(int32(binary.LittleEndian.Uint32(l.values[off : off+4]))) +} + +// arrayElementCount returns the cached element count for the array at +// row `row` in layout `l`, plus the byte offset within `l.values` +// where the flattened data region begins (one byte past the shape +// header). 
The decoder precomputes the element count into l.arrayElems +// at parse time so per-cell accessors do not re-walk the shape header +// on every call. The decoder also bounds-checks the per-dimension +// extents against qwpMaxArrayElements, so callers that reach this +// helper know the row is non-null and the product fits in int. +func arrayElementCount(l *qwpColumnLayout, row int) (elems, dataBase int) { + start := int(l.arrayRowStart[row]) + nDims := int(l.values[start]) + elems = int(l.arrayElems[row]) + dataBase = start + 1 + nDims*4 + return elems, dataBase +} + +// Float64Array returns the flattened (row-major) elements of a +// DOUBLE_ARRAY cell. Returns nil for NULL rows. The returned slice is a +// fresh []float64 owned by the caller; the payload is memmove'd from +// the wire bytes via an unsafe reinterpretation. Use `ArrayDim` to +// reshape. +// +// Safety: float64 is 8 bytes on every supported architecture and Go +// stores them little-endian on all targets questdb-client supports, so +// the wire layout matches the in-memory layout. The reinterpreted +// source slice is only ever read by `copy`, which lowers to memmove — +// no 8-byte-aligned load is issued against the unaligned payload. +func (b *QwpColumnBatch) Float64Array(col, row int) []float64 { + l := &b.layouts[col] + l.requireArray("QwpColumnBatch.Float64Array") + if l.isNull(row) { + return nil + } + elems, base := arrayElementCount(l, row) + out := make([]float64, elems) + if elems > 0 { + src := unsafe.Slice((*float64)(unsafe.Pointer(&l.values[base])), elems) + copy(out, src) + } + return out +} + +// Int64Array returns the flattened (row-major) elements of a LONG_ARRAY +// cell. Returns nil for NULL rows. See `Float64Array` for the memmove / +// endianness contract. 
+func (b *QwpColumnBatch) Int64Array(col, row int) []int64 { + l := &b.layouts[col] + l.requireArray("QwpColumnBatch.Int64Array") + if l.isNull(row) { + return nil + } + elems, base := arrayElementCount(l, row) + out := make([]int64, elems) + if elems > 0 { + src := unsafe.Slice((*int64)(unsafe.Pointer(&l.values[base])), elems) + copy(out, src) + } + return out +} + +// QwpColumn is a cached view over a single column of a QwpColumnBatch. +// It captures the column's layout pointer once so per-row accessors +// avoid the per-cell bounds-checked indexing into the batch's layout +// slice. Use this when iterating many rows of one column — the common +// shape for row-major consumers. +// +// Lifetime matches the parent QwpColumnBatch: valid only inside the +// current iteration of *QwpQuery.Batches(). Do not retain past the +// iteration. Returned by value (a layout pointer plus the row count) so +// storing the handle is allocation-free. +type QwpColumn struct { + layout *qwpColumnLayout + rowCount int +} + +// Column returns a cached handle over column `col`. Prefer the handle's +// typed accessors (`Int64(row)`, `Str(row)`, …) when iterating many +// rows of the same column; it eliminates the per-cell `&b.layouts[col]` +// bounds check and slice re-derivation the batch-level accessors pay. +func (b *QwpColumnBatch) Column(col int) QwpColumn { + return QwpColumn{layout: &b.layouts[col], rowCount: b.rowCount} +} + +// Name returns the server-reported column name. +func (c QwpColumn) Name() string { return c.layout.info.name } + +// Type returns the wire-type byte for this column (one of the +// `qwpType*` constants). +func (c QwpColumn) Type() byte { return byte(c.layout.info.wireType) } + +// RowCount returns the row count of the owning batch. +func (c QwpColumn) RowCount() int { return c.rowCount } + +// NonNullCount returns the count of non-null rows in this column. 
+func (c QwpColumn) NonNullCount() int { return c.layout.nonNullCount } + +// DecimalScale returns the scale for DECIMAL64/128/256 columns; 0 otherwise. +func (c QwpColumn) DecimalScale() int { return int(c.layout.scale) } + +// GeohashPrecisionBits returns the precision in bits for GEOHASH columns. +func (c QwpColumn) GeohashPrecisionBits() int { return int(c.layout.precisionBits) } + +// HasNulls reports whether this column carries a null bitmap in the +// current batch. When false, every per-cell null check resolves to +// false in one branch and Range accessors take the bulk-memmove path. +func (c QwpColumn) HasNulls() bool { return c.layout.nullBitmap != nil } + +// IsNull reports whether the cell at row is NULL. +func (c QwpColumn) IsNull(row int) bool { return c.layout.isNull(row) } + +// Bool returns the BOOLEAN value at row. +func (c QwpColumn) Bool(row int) bool { + l := c.layout + if l.isNull(row) { + return false + } + idx := l.denseIndex(row) + return l.values[idx>>3]&(1<<(idx&7)) != 0 +} + +// Int8 returns the BYTE value at row. +func (c QwpColumn) Int8(row int) int8 { + l := c.layout + if l.isNull(row) { + return 0 + } + return int8(l.values[l.denseIndex(row)]) +} + +// Int16 returns the SHORT value at row. +func (c QwpColumn) Int16(row int) int16 { + l := c.layout + if l.isNull(row) { + return 0 + } + i := l.denseIndex(row) * 2 + return int16(binary.LittleEndian.Uint16(l.values[i : i+2])) +} + +// Char returns the CHAR value at row as a rune (2-byte UTF-16 code unit). +func (c QwpColumn) Char(row int) rune { + l := c.layout + if l.isNull(row) { + return 0 + } + i := l.denseIndex(row) * 2 + return rune(binary.LittleEndian.Uint16(l.values[i : i+2])) +} + +// Int32 returns the INT or IPv4 value at row. 
+func (c QwpColumn) Int32(row int) int32 { + l := c.layout + if l.isNull(row) { + return 0 + } + i := l.denseIndex(row) * 4 + return int32(binary.LittleEndian.Uint32(l.values[i : i+4])) +} + +// Int64 returns an 8-byte column value at row (LONG, DATE, TIMESTAMP, +// TIMESTAMP_NANOS, DECIMAL64). +func (c QwpColumn) Int64(row int) int64 { + l := c.layout + if l.isNull(row) { + return 0 + } + i := l.denseIndex(row) * 8 + return int64(binary.LittleEndian.Uint64(l.values[i : i+8])) +} + +// Float32 returns the FLOAT value at row. +func (c QwpColumn) Float32(row int) float32 { + l := c.layout + if l.isNull(row) { + return 0 + } + i := l.denseIndex(row) * 4 + return math.Float32frombits(binary.LittleEndian.Uint32(l.values[i : i+4])) +} + +// Float64 returns the DOUBLE value at row. +func (c QwpColumn) Float64(row int) float64 { + l := c.layout + if l.isNull(row) { + return 0 + } + i := l.denseIndex(row) * 8 + return math.Float64frombits(binary.LittleEndian.Uint64(l.values[i : i+8])) +} + +// UuidLo returns the low 64 bits of a UUID at row. +func (c QwpColumn) UuidLo(row int) int64 { + l := c.layout + if l.isNull(row) { + return 0 + } + i := l.denseIndex(row) * 16 + return int64(binary.LittleEndian.Uint64(l.values[i : i+8])) +} + +// UuidHi returns the high 64 bits of a UUID at row. +func (c QwpColumn) UuidHi(row int) int64 { + l := c.layout + if l.isNull(row) { + return 0 + } + i := l.denseIndex(row)*16 + 8 + return int64(binary.LittleEndian.Uint64(l.values[i : i+8])) +} + +// Decimal128Lo returns the low 64 bits of a DECIMAL128 unscaled value. +func (c QwpColumn) Decimal128Lo(row int) int64 { return c.UuidLo(row) } + +// Decimal128Hi returns the high 64 bits of a DECIMAL128 unscaled value. +func (c QwpColumn) Decimal128Hi(row int) int64 { return c.UuidHi(row) } + +// Long256Word returns word `word` of a LONG256 or DECIMAL256 value at row. 
+// Panics on word out of [0,3] regardless of whether the row is NULL — +// that is always programmer error and should not be masked by a NULL. +func (c QwpColumn) Long256Word(row, word int) int64 { + if word < 0 || word > 3 { + panic(fmt.Sprintf("QwpColumn.Long256Word: word %d out of [0,3]", word)) + } + l := c.layout + if l.isNull(row) { + return 0 + } + i := l.denseIndex(row)*32 + word*8 + return int64(binary.LittleEndian.Uint64(l.values[i : i+8])) +} + +// Str returns the UTF-8 bytes of a STRING, VARCHAR, SYMBOL, or BINARY +// cell. Returns nil for NULL rows and for any column whose wire type +// is not one of those four — there is no way to distinguish "the row +// is NULL" from "this column is not a string" through the return value +// alone, so callers that care must know the column type up front (e.g. +// from QwpColumn.Type). The returned slice aliases the payload; do not +// retain past the batch iteration. +func (c QwpColumn) Str(row int) []byte { + l := c.layout + if l.isNull(row) { + return nil + } + wt := l.info.wireType + if wt == qwpTypeSymbol { + rowIdx := l.symbolRowIds[row] + if int(rowIdx) >= len(l.symbolDict.entries) { + return nil + } + e := l.symbolDict.entries[rowIdx] + return l.symbolDict.heap[e.offset : e.offset+e.length] + } + if wt == qwpTypeVarchar || wt == qwpTypeBinary { + return qwpStringSlice(l, row) + } + return nil +} + +// String returns the cell at row as a newly-allocated string. +func (c QwpColumn) String(row int) string { + s := c.Str(row) + if s == nil { + return "" + } + return string(s) +} + +// Binary returns the opaque bytes of a BINARY cell. Returns nil for +// NULL rows. The returned slice aliases the payload. +func (c QwpColumn) Binary(row int) []byte { + l := c.layout + if l.isNull(row) { + return nil + } + if l.info.wireType != qwpTypeBinary { + return nil + } + return qwpStringSlice(l, row) +} + +// ArrayNDims returns the dimensionality of the array at row, or 0 for NULL. 
+func (c QwpColumn) ArrayNDims(row int) int { + l := c.layout + l.requireArray("QwpColumn.ArrayNDims") + if l.isNull(row) { + return 0 + } + start := l.arrayRowStart[row] + return int(l.values[start]) +} + +// ArrayDim returns the extent of dimension `dim` of the array at row. +func (c QwpColumn) ArrayDim(row, dim int) int { + l := c.layout + l.requireArray("QwpColumn.ArrayDim") + if l.isNull(row) { + return 0 + } + start := int(l.arrayRowStart[row]) + nDims := int(l.values[start]) + if dim < 0 || dim >= nDims { + panic(fmt.Sprintf("QwpColumn.ArrayDim: dim %d out of [0, %d)", dim, nDims)) + } + off := start + 1 + dim*4 + return int(int32(binary.LittleEndian.Uint32(l.values[off : off+4]))) +} + +// Float64Array returns the flattened (row-major) elements of a +// DOUBLE_ARRAY cell. Returns nil for NULL rows. +func (c QwpColumn) Float64Array(row int) []float64 { + l := c.layout + l.requireArray("QwpColumn.Float64Array") + if l.isNull(row) { + return nil + } + elems, base := arrayElementCount(l, row) + out := make([]float64, elems) + if elems > 0 { + src := unsafe.Slice((*float64)(unsafe.Pointer(&l.values[base])), elems) + copy(out, src) + } + return out +} + +// Int64Array returns the flattened (row-major) elements of a LONG_ARRAY +// cell. Returns nil for NULL rows. +func (c QwpColumn) Int64Array(row int) []int64 { + l := c.layout + l.requireArray("QwpColumn.Int64Array") + if l.isNull(row) { + return nil + } + elems, base := arrayElementCount(l, row) + out := make([]int64, elems) + if elems > 0 { + src := unsafe.Slice((*int64)(unsafe.Pointer(&l.values[base])), elems) + copy(out, src) + } + return out +} + +// Float64ArrayInto appends the flattened (row-major) elements of a +// DOUBLE_ARRAY cell at row to dst and returns the extended slice. NULL +// rows contribute nothing — dst is returned unchanged. Use this in hot +// loops where the per-cell allocation of Float64Array would dominate; +// reuse dst across rows by truncating with `dst = dst[:0]` between +// calls. 
+func (c QwpColumn) Float64ArrayInto(row int, dst []float64) []float64 { + l := c.layout + l.requireArray("QwpColumn.Float64ArrayInto") + if l.isNull(row) { + return dst + } + elems, base := arrayElementCount(l, row) + if elems == 0 { + return dst + } + dstBase := len(dst) + dst = slices.Grow(dst, elems)[:dstBase+elems] + src := unsafe.Slice((*float64)(unsafe.Pointer(&l.values[base])), elems) + copy(dst[dstBase:], src) + return dst +} + +// Int64ArrayInto appends the flattened (row-major) elements of a +// LONG_ARRAY cell at row to dst and returns the extended slice. See +// Float64ArrayInto for the contract — NULL rows contribute nothing. +func (c QwpColumn) Int64ArrayInto(row int, dst []int64) []int64 { + l := c.layout + l.requireArray("QwpColumn.Int64ArrayInto") + if l.isNull(row) { + return dst + } + elems, base := arrayElementCount(l, row) + if elems == 0 { + return dst + } + dstBase := len(dst) + dst = slices.Grow(dst, elems)[:dstBase+elems] + src := unsafe.Slice((*int64)(unsafe.Pointer(&l.values[base])), elems) + copy(dst[dstBase:], src) + return dst +} + +// --- Bulk row-range accessors --- +// +// Each *Range method appends values for rows [fromRow, toRow) onto dst +// and returns the extended slice (the append pattern). NULL rows become +// the zero value of the element type. When the column has no nulls the +// dense region is bulk-copied via a single memmove (identity indexing); +// otherwise the accessor walks the null bitmap once, writing a zero for +// NULL rows and a decoded value for non-NULL rows. +// +// Preallocate dst (e.g. `dst := make([]int64, 0, batch.RowCount())`) to +// keep the common row-sweep path allocation-free. When dst's remaining +// capacity is short, slices.Grow performs one resize. +// +// Each method requires the column to be a fixed-width type of the +// matching element width — 8 bytes for Int64Range / Float64Range, 4 for +// Int32Range / Float32Range. 
A column of a different width (a bit-packed +// BOOLEAN, a variable-width SYMBOL / VARCHAR / BINARY, or an array) +// panics with a typed message instead of reading past the values +// buffer. Same-width reinterpretation is permitted: Int64Range on a +// DOUBLE column passes the guard and yields the raw 8-byte bits decoded +// as int64 ("numeric noise"), so the caller still owns the +// type-to-semantics match — only the memory-safety failure mode is +// converted into a clear panic. + +// Int64Range appends int64 values for rows [fromRow, toRow). +func (c QwpColumn) Int64Range(fromRow, toRow int, dst []int64) []int64 { + c.layout.requireFixedWidth("QwpColumn.Int64Range", 8) + n := toRow - fromRow + if n <= 0 { + return dst + } + l := c.layout + base := len(dst) + dst = slices.Grow(dst, n)[:base+n] + if l.nullBitmap == nil { + // Bounds-checked sub-slice first so caller misuse panics the + // same way as the per-cell accessor (l.values[i:i+8]); only + // then reinterpret as []int64. + chunk := l.values[fromRow*8 : toRow*8] + src := unsafe.Slice((*int64)(unsafe.Pointer(&chunk[0])), n) + copy(dst[base:], src) + return dst + } + for i := 0; i < n; i++ { + row := fromRow + i + if l.nullBitmap[row>>3]&(1<<(row&7)) != 0 { + dst[base+i] = 0 + continue + } + idx := int(l.nonNullIdx[row]) * 8 + dst[base+i] = int64(binary.LittleEndian.Uint64(l.values[idx : idx+8])) + } + return dst +} + +// Float64Range appends float64 values for rows [fromRow, toRow). 
+func (c QwpColumn) Float64Range(fromRow, toRow int, dst []float64) []float64 { + c.layout.requireFixedWidth("QwpColumn.Float64Range", 8) + n := toRow - fromRow + if n <= 0 { + return dst + } + l := c.layout + base := len(dst) + dst = slices.Grow(dst, n)[:base+n] + if l.nullBitmap == nil { + chunk := l.values[fromRow*8 : toRow*8] + src := unsafe.Slice((*float64)(unsafe.Pointer(&chunk[0])), n) + copy(dst[base:], src) + return dst + } + for i := 0; i < n; i++ { + row := fromRow + i + if l.nullBitmap[row>>3]&(1<<(row&7)) != 0 { + dst[base+i] = 0 + continue + } + idx := int(l.nonNullIdx[row]) * 8 + dst[base+i] = math.Float64frombits(binary.LittleEndian.Uint64(l.values[idx : idx+8])) + } + return dst +} + +// Int32Range appends int32 values for rows [fromRow, toRow). +func (c QwpColumn) Int32Range(fromRow, toRow int, dst []int32) []int32 { + c.layout.requireFixedWidth("QwpColumn.Int32Range", 4) + n := toRow - fromRow + if n <= 0 { + return dst + } + l := c.layout + base := len(dst) + dst = slices.Grow(dst, n)[:base+n] + if l.nullBitmap == nil { + chunk := l.values[fromRow*4 : toRow*4] + src := unsafe.Slice((*int32)(unsafe.Pointer(&chunk[0])), n) + copy(dst[base:], src) + return dst + } + for i := 0; i < n; i++ { + row := fromRow + i + if l.nullBitmap[row>>3]&(1<<(row&7)) != 0 { + dst[base+i] = 0 + continue + } + idx := int(l.nonNullIdx[row]) * 4 + dst[base+i] = int32(binary.LittleEndian.Uint32(l.values[idx : idx+4])) + } + return dst +} + +// Float32Range appends float32 values for rows [fromRow, toRow). 
+func (c QwpColumn) Float32Range(fromRow, toRow int, dst []float32) []float32 { + c.layout.requireFixedWidth("QwpColumn.Float32Range", 4) + n := toRow - fromRow + if n <= 0 { + return dst + } + l := c.layout + base := len(dst) + dst = slices.Grow(dst, n)[:base+n] + if l.nullBitmap == nil { + chunk := l.values[fromRow*4 : toRow*4] + src := unsafe.Slice((*float32)(unsafe.Pointer(&chunk[0])), n) + copy(dst[base:], src) + return dst + } + for i := 0; i < n; i++ { + row := fromRow + i + if l.nullBitmap[row>>3]&(1<<(row&7)) != 0 { + dst[base+i] = 0 + continue + } + idx := int(l.nonNullIdx[row]) * 4 + dst[base+i] = math.Float32frombits(binary.LittleEndian.Uint32(l.values[idx : idx+4])) + } + return dst +} + +// --- Materializing escape hatch --- + +// CopyAll materialises the batch into a heap-owned *QwpColumnBatch that +// the caller may retain past the current iteration of +// *QwpQuery.Batches(). The I/O goroutine's decoder reuses its per-column +// layout pool on the next frame, so a batch yielded by Batches() is only +// valid for the current iteration; CopyAll is the escape hatch. Every +// typed accessor (Int64, Str, Float64Array, …) works identically on the +// copy. +// +// The copy differs from a live batch in two ways, both invisible to +// callers: +// +// 1. The pool-owned layout arrays (nonNullIdx, symbolRowIds, +// arrayRowStart, arrayElems, timestampBuf) are freshly-allocated +// heap slices, not aliases into the decoder's reused pool. +// 2. The payload bytes are deep-cloned, and every layout slice that +// aliased the source payload (values, stringBytes, nullBitmap) is +// re-pointed at the clone via offset translation, so the copy is +// independent of the source's backing buffer. 
+// +// Both transport paths produce copies that survive reuse: the zstd +// path's `payload` aliased the per-batch decompression scratch the +// decoder reuses across decodes into the same QwpColumnBatch, and the +// raw path's `payload` aliased the recycled WS read buffer the egress +// I/O loop returns to qwpEgressIO.readBufPool on releaseBuffer (see +// qwp_query_io.go). Cloning covers both. +// +// Cost: one []qwpColumnLayout slice + one fresh backing slice per +// pool-owned layout field, plus a one-shot deep clone of the payload +// bytes. +func (b *QwpColumnBatch) CopyAll() *QwpColumnBatch { + sb := &QwpColumnBatch{ + requestId: b.requestId, + batchSeq: b.batchSeq, + rowCount: b.rowCount, + columnCount: b.columnCount, + columns: b.columns, + layouts: make([]qwpColumnLayout, b.columnCount), + } + // Both transport paths recycle the buffer payload aliases — the + // per-batch zstdScratch on the compressed path, the readBufPool WS + // read buffer on the raw path. Clone the whole payload once and + // translate every aliasing layout slice onto the clone, so the + // snapshot is independent of later decodes / pool reuse. + srcPayload := b.payload + clonedPayload := slices.Clone(srcPayload) + sb.payload = clonedPayload + if len(b.zstdScratch) > 0 { + // Mirror the source's shape: a snapshot built from a compressed + // batch keeps its payload addressable as zstdScratch too, since + // on the source the two slice headers pointed at the same bytes. + sb.zstdScratch = clonedPayload + } + for i := 0; i < b.columnCount; i++ { + src := &b.layouts[i] + dst := &sb.layouts[i] + dst.info = src.info + dst.scale = src.scale + dst.precisionBits = src.precisionBits + // nullBitmap: aliases payload for server-sent bitmaps; owned heap + // buffer after array nDims=0 NULL promotion. Either way, retaining + // the slice header keeps the backing array reachable for the life + // of the copied batch. 
+ dst.nullBitmap = rebindIfAliased(src.nullBitmap, srcPayload, clonedPayload) + dst.nonNullCount = src.nonNullCount + dst.nonNullIdx = slices.Clone(src.nonNullIdx) + dst.values = rebindIfAliased(src.values, srcPayload, clonedPayload) + dst.stringBytes = rebindIfAliased(src.stringBytes, srcPayload, clonedPayload) + dst.symbolRowIds = slices.Clone(src.symbolRowIds) + // symbolDict snapshot: heap + entries lengths are frozen at + // snapshot time and the decoder only ever append-extends them, + // so the view stays valid without copying. + dst.symbolDict = src.symbolDict + dst.arrayRowStart = slices.Clone(src.arrayRowStart) + dst.arrayElems = slices.Clone(src.arrayElems) + dst.timestampBuf = slices.Clone(src.timestampBuf) + // Gorilla TIMESTAMP: values aliases timestampBuf (not payload). + // Re-point at the cloned buffer so the snapshot survives the + // decoder reusing the source's timestampBuf on a later decode. + // Detected by timestampBuf being non-empty — parseTimestamp's + // non-Gorilla branches leave it cleared to :0. + if len(src.timestampBuf) > 0 { + dst.values = int64sAsBytes(dst.timestampBuf) + } + } + return sb +} + +// rebindIfAliased returns src unchanged when it doesn't alias +// srcPayload — heap-owned slices (`int64sAsBytes(timestampBuf)`, +// promoted array null bitmaps) fall through as-is so the caller's +// follow-up branches can re-point them explicitly. When src does +// alias, the function translates its offset+length onto clonedPayload +// so the snapshot references the clone rather than the source's +// reusable buffer. The empty-src early return guards the &src[0] +// address read below. 
+func rebindIfAliased(src, srcPayload, clonedPayload []byte) []byte { + if len(src) == 0 { + return src + } + if !aliases(src, srcPayload) { + return src + } + offset := int(uintptr(unsafe.Pointer(&src[0])) - uintptr(unsafe.Pointer(&srcPayload[0]))) + return clonedPayload[offset : offset+len(src)] +} + +// aliases reports whether sub points into the backing array of parent. +// Compares addresses directly — the slice headers may have different +// lengths, so len-based checks are not sufficient. +func aliases(sub, parent []byte) bool { + if len(sub) == 0 || len(parent) == 0 { + return false + } + subAddr := uintptr(unsafe.Pointer(&sub[0])) + parentAddr := uintptr(unsafe.Pointer(&parent[0])) + parentEnd := parentAddr + uintptr(len(parent)) + return subAddr >= parentAddr && subAddr+uintptr(len(sub)) <= parentEnd +} diff --git a/qwp_query_batch_perf_test.go b/qwp_query_batch_perf_test.go new file mode 100644 index 00000000..84e5e5e3 --- /dev/null +++ b/qwp_query_batch_perf_test.go @@ -0,0 +1,274 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 *
 ******************************************************************************/

package questdb

import (
	"encoding/binary"
	"testing"
)

// perfN is the row count of the single-column benchmark fixtures.
const perfN = 10000

// perfFixedInt64Batch builds a one-column LONG batch of n rows with no
// null bitmap; row i holds the value i encoded little-endian.
func perfFixedInt64Batch(n int) *QwpColumnBatch {
	info := qwpColumnSchemaInfo{name: "v", wireType: qwpTypeLong}
	values := make([]byte, 8*n)
	for i := 0; i < n; i++ {
		binary.LittleEndian.PutUint64(values[i*8:], uint64(i))
	}
	layout := buildFixedLayout(&info, values, n)
	return newSingleColumnBatch(info, layout, n)
}

// perfNullableInt64Batch builds a one-column LONG batch of n rows in
// which every row with i%4 == 0 is NULL and every other row holds the
// value i.
func perfNullableInt64Batch(n int) *QwpColumnBatch {
	info := qwpColumnSchemaInfo{name: "v", wireType: qwpTypeLong}
	rowBytes := make([][]byte, n)
	for i := 0; i < n; i++ {
		if i%4 == 0 {
			// A nil entry marks the row as NULL for buildNullableLayout.
			rowBytes[i] = nil
		} else {
			rowBytes[i] = binary.LittleEndian.AppendUint64(nil, uint64(i))
		}
	}
	layout := buildNullableLayout(&info, rowBytes)
	return newSingleColumnBatch(info, layout, n)
}

// BenchmarkBatchInt64PerCell measures the batch-level (col, row) accessor.
// After the delegation refactor this routes through Column(col).Int64(row);
// the question is whether the Go inliner fully elides the handle-construction.
func BenchmarkBatchInt64PerCell(b *testing.B) {
	batch := perfFixedInt64Batch(perfN)
	b.ReportAllocs()
	// Fixture construction above is excluded from the timed region.
	b.ResetTimer()
	var sink int64
	for i := 0; i < b.N; i++ {
		for r := 0; r < perfN; r++ {
			sink ^= batch.Int64(0, r)
		}
	}
	// Reference the accumulator after the loops.
	_ = sink
}

// BenchmarkColumnInt64PerCell is the control: Column-handle path, identical
// before and after. Differences here would point at noise in the harness.
func BenchmarkColumnInt64PerCell(b *testing.B) {
	batch := perfFixedInt64Batch(perfN)
	// The handle is captured once, outside the timed region.
	col := batch.Column(0)
	b.ReportAllocs()
	b.ResetTimer()
	var sink int64
	for i := 0; i < b.N; i++ {
		for r := 0; r < perfN; r++ {
			sink ^= col.Int64(r)
		}
	}
	_ = sink
}

// BenchmarkInt64RangeNoNulls measures the bulk fast path.
// The fix replaced
// raw `unsafe.Slice` from `&l.values[byteStart]` with a bounds-checked
// sub-slice expression — should be a wash, but worth confirming.
func BenchmarkInt64RangeNoNulls(b *testing.B) {
	batch := perfFixedInt64Batch(perfN)
	col := batch.Column(0)
	// dst is allocated once with full capacity and reused per iteration.
	dst := make([]int64, 0, perfN)
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// Truncate so each iteration appends into the same backing array.
		dst = dst[:0]
		dst = col.Int64Range(0, perfN, dst)
	}
	_ = dst
}

// BenchmarkInt64RangeWithNulls measures the per-row scalar loop. Untouched
// by the changes; included as a second control.
func BenchmarkInt64RangeWithNulls(b *testing.B) {
	batch := perfNullableInt64Batch(perfN)
	col := batch.Column(0)
	dst := make([]int64, 0, perfN)
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		dst = dst[:0]
		dst = col.Int64Range(0, perfN, dst)
	}
	_ = dst
}

// --- Multi-column / wide-row patterns ---
//
// The single-column benchmarks above let the Go inliner hoist
// b.layouts[0] out of the inner loop because the column index is a
// loop-invariant literal — so they understate the column-handle's
// theoretical win. The benchmarks below measure shapes where the
// compiler cannot do that lift.

// perfRows and perfCols are the dimensions of the multi-column fixture.
const (
	perfRows = 1000
	perfCols = 16
)

// perfMultiColInt64Batch builds a batch of `cols` LONG columns with
// `rows` rows each and no null bitmaps. Every column's layout points at
// the same values buffer, in which row i holds the value i.
func perfMultiColInt64Batch(rows, cols int) *QwpColumnBatch {
	infos := make([]qwpColumnSchemaInfo, cols)
	layouts := make([]qwpColumnLayout, cols)
	values := make([]byte, 8*rows)
	for i := 0; i < rows; i++ {
		binary.LittleEndian.PutUint64(values[i*8:], uint64(i))
	}
	for c := 0; c < cols; c++ {
		infos[c] = qwpColumnSchemaInfo{name: "v", wireType: qwpTypeLong}
		layouts[c] = qwpColumnLayout{
			info:         &infos[c],
			values:       values,
			nonNullCount: rows,
		}
	}
	return &QwpColumnBatch{
		requestId:   1,
		rowCount:    rows,
		columnCount: cols,
		columns:     infos,
		layouts:     layouts,
	}
}

// BenchmarkBatchMultiColRowMajor: row-major full-batch scan via the
// (col, row) batch surface.
// Column index varies inside the inner loop,
// so b.layouts[c] is rebound every cell — the workload the original
// review comment described.
func BenchmarkBatchMultiColRowMajor(b *testing.B) {
	batch := perfMultiColInt64Batch(perfRows, perfCols)
	b.ReportAllocs()
	// Fixture construction above is excluded from the timed region.
	b.ResetTimer()
	var sink int64
	for i := 0; i < b.N; i++ {
		// Rows outer, columns inner: every cell changes the column index.
		for r := 0; r < perfRows; r++ {
			for c := 0; c < perfCols; c++ {
				sink ^= batch.Int64(c, r)
			}
		}
	}
	_ = sink
}

// BenchmarkColumnMultiColRowMajor: same access pattern, but each
// column's QwpColumn handle is captured once up-front so the inner
// loop hits a hoisted *qwpColumnLayout. This is the "use the handle"
// variant of the row-major scan.
func BenchmarkColumnMultiColRowMajor(b *testing.B) {
	batch := perfMultiColInt64Batch(perfRows, perfCols)
	// One handle per column, built before the timer is reset.
	cols := make([]QwpColumn, perfCols)
	for c := 0; c < perfCols; c++ {
		cols[c] = batch.Column(c)
	}
	b.ReportAllocs()
	b.ResetTimer()
	var sink int64
	for i := 0; i < b.N; i++ {
		for r := 0; r < perfRows; r++ {
			for c := 0; c < perfCols; c++ {
				sink ^= cols[c].Int64(r)
			}
		}
	}
	_ = sink
}

// BenchmarkBatchColumnMajor: column-major scan via the batch surface.
// The column index is invariant in the inner loop; the compiler may or
// may not hoist b.layouts[c] out — this shows whether it does.
func BenchmarkBatchColumnMajor(b *testing.B) {
	batch := perfMultiColInt64Batch(perfRows, perfCols)
	b.ReportAllocs()
	b.ResetTimer()
	var sink int64
	for i := 0; i < b.N; i++ {
		// Columns outer, rows inner.
		for c := 0; c < perfCols; c++ {
			for r := 0; r < perfRows; r++ {
				sink ^= batch.Int64(c, r)
			}
		}
	}
	_ = sink
}

// BenchmarkColumnMajorHandle: column-major scan via QwpColumn handles
// captured per outer iteration — the textbook fast path.
func BenchmarkColumnMajorHandle(b *testing.B) {
	batch := perfMultiColInt64Batch(perfRows, perfCols)
	b.ReportAllocs()
	// Fixture construction above is excluded from the timed region.
	b.ResetTimer()
	var sink int64
	for i := 0; i < b.N; i++ {
		for c := 0; c < perfCols; c++ {
			// The handle is rebuilt once per column, inside the timed region.
			col := batch.Column(c)
			for r := 0; r < perfRows; r++ {
				sink ^= col.Int64(r)
			}
		}
	}
	_ = sink
}

// BenchmarkColumnMajorRange: column-major scan via Int64Range with a
// per-row consumer (XOR sum). Realistic: caller does *something* with
// each value, so this is "Range plus typical processing."
func BenchmarkColumnMajorRange(b *testing.B) {
	batch := perfMultiColInt64Batch(perfRows, perfCols)
	// One scratch slice with full capacity, reused for every column.
	dst := make([]int64, 0, perfRows)
	b.ReportAllocs()
	b.ResetTimer()
	var sink int64
	for i := 0; i < b.N; i++ {
		for c := 0; c < perfCols; c++ {
			col := batch.Column(c)
			dst = dst[:0]
			dst = col.Int64Range(0, perfRows, dst)
			for _, v := range dst {
				sink ^= v
			}
		}
	}
	_ = sink
}

// BenchmarkColumnMajorRangePure: column-major Range with NO per-row
// consumer. Measures the bulk read in isolation — the upper bound for
// "Range vs per-cell" speedup.
func BenchmarkColumnMajorRangePure(b *testing.B) {
	batch := perfMultiColInt64Batch(perfRows, perfCols)
	// One scratch slice with full capacity, reused for every column.
	dst := make([]int64, 0, perfRows)
	b.ReportAllocs()
	// Fixture construction above is excluded from the timed region.
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		for c := 0; c < perfCols; c++ {
			col := batch.Column(c)
			// Truncate so the append lands in the same backing array.
			dst = dst[:0]
			dst = col.Int64Range(0, perfRows, dst)
		}
	}
	_ = dst
}
diff --git a/qwp_query_batch_test.go b/qwp_query_batch_test.go
new file mode 100644
index 00000000..49020a88
--- /dev/null
+++ b/qwp_query_batch_test.go
@@ -0,0 +1,1566 @@
/*+*****************************************************************************
 * ___ _ ____ ____
 * / _ \ _ _ ___ ___| |_| _ \| __ )
 * | | | | | | |/ _ \/ __| __| | | | _ \
 * | |_| | |_| | __/\__ \ |_| |_| | |_) |
 * \__\_\\__,_|\___||___/\__|____/|____/
 *
 * Copyright (c) 2014-2019 Appsicle
 * Copyright (c) 2019-2026 QuestDB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package questdb

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"math"
	"strings"
	"sync"
	"testing"
)

// buildFixedLayout produces a qwpColumnLayout with no nulls and the
// given values region. Used as a helper across the fixed-width tests.
+func buildFixedLayout(info *qwpColumnSchemaInfo, values []byte, rowCount int) qwpColumnLayout { + return qwpColumnLayout{ + info: info, + values: values, + nonNullCount: rowCount, + } +} + +// buildNullableLayout produces a qwpColumnLayout with the given null +// pattern (true = NULL) and a dense values region assembled from the +// non-null rows of `rowBytes`. `rowBytes` must contain one entry per +// row (nil for NULL rows, fixed-size bytes for non-null). +func buildNullableLayout(info *qwpColumnSchemaInfo, rowBytes [][]byte) qwpColumnLayout { + rowCount := len(rowBytes) + bitmap := make([]byte, (rowCount+7)>>3) + nonNullIdx := make([]int32, rowCount) + var dense int32 + var values []byte + for i, b := range rowBytes { + if b == nil { + bitmap[i>>3] |= 1 << (i & 7) + nonNullIdx[i] = -1 + } else { + nonNullIdx[i] = dense + dense++ + values = append(values, b...) + } + } + return qwpColumnLayout{ + info: info, + nullBitmap: bitmap, + nonNullIdx: nonNullIdx, + values: values, + nonNullCount: int(dense), + } +} + +// newSingleColumnBatch assembles a QwpColumnBatch with one column for +// tests that only care about a single accessor path. +func newSingleColumnBatch(info qwpColumnSchemaInfo, layout qwpColumnLayout, rowCount int) *QwpColumnBatch { + return &QwpColumnBatch{ + requestId: 1, + batchSeq: 0, + rowCount: rowCount, + columnCount: 1, + columns: []qwpColumnSchemaInfo{info}, + layouts: []qwpColumnLayout{layout}, + } +} + +// --- Fixed-width accessor coverage --- + +func TestQwpColumnBatchFixedWidth(t *testing.T) { + t.Run("Bool_bitpacked", func(t *testing.T) { + info := qwpColumnSchemaInfo{name: "b", wireType: qwpTypeBoolean} + // 10 rows, pattern: T F T F T F T F T F. + // Packed: byte 0 bits 0..7 = 0b01010101 = 0x55, byte 1 bits 0..1 = 0b01 = 0x01. 
+ layout := buildFixedLayout(&info, []byte{0x55, 0x01}, 10) + batch := newSingleColumnBatch(info, layout, 10) + for i := 0; i < 10; i++ { + want := i%2 == 0 + if got := batch.Bool(0, i); got != want { + t.Fatalf("Bool(0, %d) = %v, want %v", i, got, want) + } + } + }) + + t.Run("Int8", func(t *testing.T) { + info := qwpColumnSchemaInfo{name: "b", wireType: qwpTypeByte} + layout := buildFixedLayout(&info, []byte{0x01, 0xFF, 0x7F}, 3) + batch := newSingleColumnBatch(info, layout, 3) + if got := batch.Int8(0, 0); got != 1 { + t.Fatalf("Int8(0, 0) = %d", got) + } + if got := batch.Int8(0, 1); got != -1 { + t.Fatalf("Int8(0, 1) = %d", got) + } + if got := batch.Int8(0, 2); got != 127 { + t.Fatalf("Int8(0, 2) = %d", got) + } + }) + + t.Run("Int16", func(t *testing.T) { + info := qwpColumnSchemaInfo{name: "s", wireType: qwpTypeShort} + values := make([]byte, 4) + var negShort int16 = -1000 + binary.LittleEndian.PutUint16(values[0:], uint16(negShort)) + binary.LittleEndian.PutUint16(values[2:], 32767) + layout := buildFixedLayout(&info, values, 2) + batch := newSingleColumnBatch(info, layout, 2) + if got := batch.Int16(0, 0); got != -1000 { + t.Fatalf("Int16[0] = %d", got) + } + if got := batch.Int16(0, 1); got != 32767 { + t.Fatalf("Int16[1] = %d", got) + } + }) + + t.Run("Char", func(t *testing.T) { + info := qwpColumnSchemaInfo{name: "c", wireType: qwpTypeChar} + values := make([]byte, 4) + binary.LittleEndian.PutUint16(values[0:], 0x0041) // 'A' + binary.LittleEndian.PutUint16(values[2:], 0x00E9) // 'é' + layout := buildFixedLayout(&info, values, 2) + batch := newSingleColumnBatch(info, layout, 2) + if got := batch.Char(0, 0); got != 'A' { + t.Fatalf("Char[0] = %c (%d)", got, got) + } + if got := batch.Char(0, 1); got != 'é' { + t.Fatalf("Char[1] = %c (%d)", got, got) + } + }) + + t.Run("Int32_and_IPv4", func(t *testing.T) { + // INT and IPv4 share the 4-byte LE wire layout. 
+ values := make([]byte, 8) + var negInt int32 = -42 + binary.LittleEndian.PutUint32(values[0:], uint32(negInt)) + binary.LittleEndian.PutUint32(values[4:], 0x7F_00_00_01) // 127.0.0.1 LE + for _, wt := range []qwpTypeCode{qwpTypeInt, qwpTypeIPv4} { + info := qwpColumnSchemaInfo{name: "i", wireType: wt} + layout := buildFixedLayout(&info, values, 2) + batch := newSingleColumnBatch(info, layout, 2) + if got := batch.Int32(0, 0); got != -42 { + t.Fatalf("Int32 (%#x) [0] = %d", wt, got) + } + if got := batch.Int32(0, 1); got != int32(0x7F_00_00_01) { + t.Fatalf("Int32 (%#x) [1] = %#x", wt, got) + } + } + }) + + t.Run("Int64", func(t *testing.T) { + // LONG, DATE, TIMESTAMP, TIMESTAMP_NANOS, DECIMAL64 all share + // the int64 LE layout. Spot-check the dispatch through the + // single accessor. + values := make([]byte, 16) + var negLong int64 = -1 + binary.LittleEndian.PutUint64(values[0:], uint64(negLong)) + binary.LittleEndian.PutUint64(values[8:], uint64(math.MaxInt64)) + for _, wt := range []qwpTypeCode{qwpTypeLong, qwpTypeDate, qwpTypeTimestamp, qwpTypeTimestampNano, qwpTypeDecimal64} { + info := qwpColumnSchemaInfo{name: "l", wireType: wt} + layout := buildFixedLayout(&info, values, 2) + batch := newSingleColumnBatch(info, layout, 2) + if got := batch.Int64(0, 0); got != -1 { + t.Fatalf("Int64 (%#x) [0] = %d", wt, got) + } + if got := batch.Int64(0, 1); got != math.MaxInt64 { + t.Fatalf("Int64 (%#x) [1] = %d", wt, got) + } + } + }) + + t.Run("Float32", func(t *testing.T) { + info := qwpColumnSchemaInfo{name: "f", wireType: qwpTypeFloat} + values := make([]byte, 8) + binary.LittleEndian.PutUint32(values[0:], math.Float32bits(3.14)) + binary.LittleEndian.PutUint32(values[4:], math.Float32bits(-0.5)) + layout := buildFixedLayout(&info, values, 2) + batch := newSingleColumnBatch(info, layout, 2) + if got := batch.Float32(0, 0); got != 3.14 { + t.Fatalf("Float32[0] = %v", got) + } + if got := batch.Float32(0, 1); got != -0.5 { + t.Fatalf("Float32[1] = %v", got) + } + 
}) + + t.Run("Float64", func(t *testing.T) { + info := qwpColumnSchemaInfo{name: "d", wireType: qwpTypeDouble} + values := make([]byte, 16) + binary.LittleEndian.PutUint64(values[0:], math.Float64bits(1.3)) + binary.LittleEndian.PutUint64(values[8:], math.Float64bits(-2.5)) + layout := buildFixedLayout(&info, values, 2) + batch := newSingleColumnBatch(info, layout, 2) + if got := batch.Float64(0, 0); got != 1.3 { + t.Fatalf("Float64[0] = %v", got) + } + if got := batch.Float64(0, 1); got != -2.5 { + t.Fatalf("Float64[1] = %v", got) + } + }) + + t.Run("Uuid", func(t *testing.T) { + info := qwpColumnSchemaInfo{name: "u", wireType: qwpTypeUuid} + values := make([]byte, 16) + binary.LittleEndian.PutUint64(values[0:], 0x0706050403020100) + binary.LittleEndian.PutUint64(values[8:], 0x0F0E0D0C0B0A0908) + layout := buildFixedLayout(&info, values, 1) + batch := newSingleColumnBatch(info, layout, 1) + if lo := batch.UuidLo(0, 0); lo != 0x0706050403020100 { + t.Fatalf("UuidLo = %#x", lo) + } + if hi := batch.UuidHi(0, 0); hi != 0x0F0E0D0C0B0A0908 { + t.Fatalf("UuidHi = %#x", hi) + } + }) + + t.Run("Decimal128", func(t *testing.T) { + info := qwpColumnSchemaInfo{name: "d128", wireType: qwpTypeDecimal128} + values := make([]byte, 16) + binary.LittleEndian.PutUint64(values[0:], 0xAAAA_BBBB_CCCC_DDDD) + binary.LittleEndian.PutUint64(values[8:], 0x1111_2222_3333_4444) + layout := buildFixedLayout(&info, values, 1) + layout.scale = 4 + batch := newSingleColumnBatch(info, layout, 1) + if got := batch.Decimal128Lo(0, 0); uint64(got) != 0xAAAA_BBBB_CCCC_DDDD { + t.Fatalf("Decimal128Lo = %#x", uint64(got)) + } + if got := batch.Decimal128Hi(0, 0); uint64(got) != 0x1111_2222_3333_4444 { + t.Fatalf("Decimal128Hi = %#x", uint64(got)) + } + if s := batch.DecimalScale(0); s != 4 { + t.Fatalf("DecimalScale = %d, want 4", s) + } + }) + + t.Run("Long256_and_Decimal256", func(t *testing.T) { + for _, wt := range []qwpTypeCode{qwpTypeLong256, qwpTypeDecimal256} { + info := 
qwpColumnSchemaInfo{name: "l256", wireType: wt} + values := make([]byte, 32) + for i := 0; i < 4; i++ { + binary.LittleEndian.PutUint64(values[i*8:], uint64(i+1)*0x1111111111111111) + } + layout := buildFixedLayout(&info, values, 1) + batch := newSingleColumnBatch(info, layout, 1) + for w := 0; w < 4; w++ { + want := int64(uint64(w+1) * 0x1111111111111111) + if got := batch.Long256Word(0, 0, w); got != want { + t.Fatalf("%#x word %d = %#x", wt, w, got) + } + } + } + }) +} + +// --- Null handling --- + +func TestQwpColumnBatchNullsDenseIndex(t *testing.T) { + // Pattern N V V N V (rowCount=5, denseCount=3). Non-null values: + // int32 values 100, 200, 300 at dense indices 0, 1, 2. + info := qwpColumnSchemaInfo{name: "i", wireType: qwpTypeInt} + values := make([]byte, 12) + binary.LittleEndian.PutUint32(values[0:], 100) + binary.LittleEndian.PutUint32(values[4:], 200) + binary.LittleEndian.PutUint32(values[8:], 300) + rowBytes := [][]byte{ + nil, // row 0 NULL + values[0:4], + values[4:8], + nil, // row 3 NULL + values[8:12], + } + layout := buildNullableLayout(&info, rowBytes) + batch := newSingleColumnBatch(info, layout, 5) + + if !batch.IsNull(0, 0) || !batch.IsNull(0, 3) { + t.Fatal("row 0 and 3 should be NULL") + } + if batch.IsNull(0, 1) || batch.IsNull(0, 2) || batch.IsNull(0, 4) { + t.Fatal("non-null rows must not report as NULL") + } + want := []int32{0, 100, 200, 0, 300} + for i, w := range want { + if got := batch.Int32(0, i); got != w { + t.Fatalf("Int32(0, %d) = %d, want %d", i, got, w) + } + } + if c := batch.NonNullCount(0); c != 3 { + t.Fatalf("NonNullCount = %d, want 3", c) + } +} + +func TestQwpColumnBatchNullableAllNulls(t *testing.T) { + // Every row NULL: nonNullCount=0, every accessor returns zero. 
+ info := qwpColumnSchemaInfo{name: "x", wireType: qwpTypeLong} + rowBytes := [][]byte{nil, nil, nil} + layout := buildNullableLayout(&info, rowBytes) + batch := newSingleColumnBatch(info, layout, 3) + for i := 0; i < 3; i++ { + if !batch.IsNull(0, i) { + t.Fatalf("row %d should be NULL", i) + } + if v := batch.Int64(0, i); v != 0 { + t.Fatalf("Int64(0, %d) = %d, want 0", i, v) + } + } + if c := batch.NonNullCount(0); c != 0 { + t.Fatalf("NonNullCount = %d, want 0", c) + } +} + +// --- Strings, varchars, binary --- + +func buildStringLayout(info *qwpColumnSchemaInfo, values []string) qwpColumnLayout { + // Offsets array: (len(values)+1) uint32 LE, then concatenated bytes. + offsets := make([]byte, 4*(len(values)+1)) + var heap []byte + var cur uint32 + for i, s := range values { + binary.LittleEndian.PutUint32(offsets[i*4:], cur) + heap = append(heap, s...) + cur += uint32(len(s)) + } + binary.LittleEndian.PutUint32(offsets[len(values)*4:], cur) + return qwpColumnLayout{ + info: info, + values: offsets, + stringBytes: heap, + nonNullCount: len(values), + } +} + +func TestQwpColumnBatchStringsAndVarcharsAndBinary(t *testing.T) { + for _, tc := range []struct { + name string + wt qwpTypeCode + }{ + {"VARCHAR", qwpTypeVarchar}, + {"BINARY", qwpTypeBinary}, + } { + t.Run(tc.name, func(t *testing.T) { + info := qwpColumnSchemaInfo{name: "s", wireType: tc.wt} + vals := []string{"", "hello", "日本語", "x"} + layout := buildStringLayout(&info, vals) + batch := newSingleColumnBatch(info, layout, len(vals)) + for i, v := range vals { + var got []byte + if tc.wt == qwpTypeBinary { + got = batch.Binary(0, i) + } else { + got = batch.Str(0, i) + } + if !bytes.Equal(got, []byte(v)) { + t.Fatalf("%s row %d: got %q, want %q", tc.name, i, got, v) + } + } + // Views of rows 1 and 2, held at the same time, must read + // different contents ("hello" vs "日本語"). This compares bytes + // only; it does not inspect slice headers or aliasing. 
+ if tc.wt == qwpTypeVarchar { + a := batch.Str(0, 1) + b := batch.Str(0, 2) + if bytes.Equal(a, b) { + t.Fatalf("independent views should differ: a=%q b=%q", a, b) + } + } + }) + } +} + +func TestQwpColumnBatchStringAllocatingHelper(t *testing.T) { + info := qwpColumnSchemaInfo{name: "s", wireType: qwpTypeVarchar} + vals := []string{"hello", "", "world"} + layout := buildStringLayout(&info, vals) + batch := newSingleColumnBatch(info, layout, len(vals)) + if got := batch.String(0, 0); got != "hello" { + t.Fatalf("String[0] = %q", got) + } + if got := batch.String(0, 2); got != "world" { + t.Fatalf("String[2] = %q", got) + } +} + +// --- Symbol --- + +func TestQwpColumnBatchSymbol(t *testing.T) { + info := qwpColumnSchemaInfo{name: "sy", wireType: qwpTypeSymbol} + // Dict: ["alpha", "beta", "gamma"], one heap region with packed + // (offset, length) entries. + heap := []byte("alphabetagamma") + entries := []qwpSymbolEntry{ + {offset: 0, length: 5}, + {offset: 5, length: 4}, + {offset: 9, length: 5}, + } + dict := qwpSymbolDictView{heap: heap, entries: entries} + + // Four rows: alpha, beta, NULL, gamma. + rowCount := 4 + bitmap := make([]byte, 1) + bitmap[0] = 1 << 2 // row 2 NULL + nonNullIdx := []int32{0, 1, -1, 2} + symbolRowIds := []int32{0, 1, 0 /* stale, row is NULL */, 2} + + layout := qwpColumnLayout{ + info: &info, + nullBitmap: bitmap, + nonNullIdx: nonNullIdx, + nonNullCount: 3, + symbolRowIds: symbolRowIds, + symbolDict: dict, + } + batch := newSingleColumnBatch(info, layout, rowCount) + + want := []string{"alpha", "beta", "", "gamma"} + for i, w := range want { + if got := batch.String(0, i); got != w { + t.Fatalf("Symbol row %d: got %q, want %q", i, got, w) + } + } + if !batch.IsNull(0, 2) { + t.Fatalf("row 2 must be NULL") + } +} + +// --- Arrays --- + +func TestQwpColumnBatchFloat64Array1D(t *testing.T) { + // One row: 1D array [1.5, 2.5, 3.5]. 
+ info := qwpColumnSchemaInfo{name: "a", wireType: qwpTypeDoubleArray} + var buf bytes.Buffer + buf.WriteByte(1) // nDims + _ = binary.Write(&buf, binary.LittleEndian, int32(3)) + _ = binary.Write(&buf, binary.LittleEndian, 1.5) + _ = binary.Write(&buf, binary.LittleEndian, 2.5) + _ = binary.Write(&buf, binary.LittleEndian, 3.5) + values := buf.Bytes() + + layout := qwpColumnLayout{ + info: &info, + values: values, + arrayRowStart: []int32{0}, + arrayElems: []int32{3}, + nonNullCount: 1, + } + batch := newSingleColumnBatch(info, layout, 1) + + if n := batch.ArrayNDims(0, 0); n != 1 { + t.Fatalf("ArrayNDims = %d", n) + } + if d := batch.ArrayDim(0, 0, 0); d != 3 { + t.Fatalf("ArrayDim(0) = %d", d) + } + got := batch.Float64Array(0, 0) + want := []float64{1.5, 2.5, 3.5} + for i := range want { + if got[i] != want[i] { + t.Fatalf("Float64Array[%d] = %v, want %v", i, got[i], want[i]) + } + } +} + +func TestQwpColumnBatchInt64Array2D(t *testing.T) { + // One row: 2×3 array, row-major: [[1,2,3],[4,5,6]]. 
+ info := qwpColumnSchemaInfo{name: "a", wireType: qwpTypeLongArray} + var buf bytes.Buffer + buf.WriteByte(2) // nDims + _ = binary.Write(&buf, binary.LittleEndian, int32(2)) + _ = binary.Write(&buf, binary.LittleEndian, int32(3)) + for _, v := range []int64{1, 2, 3, 4, 5, 6} { + _ = binary.Write(&buf, binary.LittleEndian, v) + } + values := buf.Bytes() + + layout := qwpColumnLayout{ + info: &info, + values: values, + arrayRowStart: []int32{0}, + arrayElems: []int32{6}, + nonNullCount: 1, + } + batch := newSingleColumnBatch(info, layout, 1) + + if n := batch.ArrayNDims(0, 0); n != 2 { + t.Fatalf("ArrayNDims = %d", n) + } + if d0, d1 := batch.ArrayDim(0, 0, 0), batch.ArrayDim(0, 0, 1); d0 != 2 || d1 != 3 { + t.Fatalf("ArrayDim = %dx%d", d0, d1) + } + got := batch.Int64Array(0, 0) + want := []int64{1, 2, 3, 4, 5, 6} + for i := range want { + if got[i] != want[i] { + t.Fatalf("Int64Array[%d] = %d", i, got[i]) + } + } +} + +func TestQwpColumnBatchEmptyArrayViaZeroShape(t *testing.T) { + // A non-null 1-D empty array is encoded as (nDims=1, dim0=0): 5 + // bytes of shape, 0 bytes of elements. Distinct from a NULL row + // (null bitmap bit set, no inline bytes) — accessors should + // report a real 1-D array with zero length. 
+ info := qwpColumnSchemaInfo{name: "a", wireType: qwpTypeDoubleArray} + var buf bytes.Buffer + buf.WriteByte(1) // nDims + _ = binary.Write(&buf, binary.LittleEndian, int32(0)) + values := buf.Bytes() + layout := qwpColumnLayout{ + info: &info, + values: values, + arrayRowStart: []int32{0}, + arrayElems: []int32{0}, + nonNullCount: 1, + } + batch := newSingleColumnBatch(info, layout, 1) + if n := batch.ArrayNDims(0, 0); n != 1 { + t.Fatalf("ArrayNDims = %d, want 1", n) + } + if d := batch.ArrayDim(0, 0, 0); d != 0 { + t.Fatalf("ArrayDim(0) = %d, want 0", d) + } + if got := batch.Float64Array(0, 0); len(got) != 0 { + t.Fatalf("Float64Array len = %d, want 0", len(got)) + } +} + +// TestQwpColumnFloat64ArrayInto exercises the append-into-dst variant +// of Float64Array: it must extend dst with the row's elements, leave +// dst unchanged on a NULL row, and reuse dst's backing array across +// successive calls (the hot-loop pattern this accessor exists for). +func TestQwpColumnFloat64ArrayInto(t *testing.T) { + // Two non-null rows back-to-back: row 0 = [1.5, 2.5], row 1 = [3.5]. 
+ info := qwpColumnSchemaInfo{name: "a", wireType: qwpTypeDoubleArray} + var buf bytes.Buffer + buf.WriteByte(1) // row 0 nDims + _ = binary.Write(&buf, binary.LittleEndian, int32(2)) + _ = binary.Write(&buf, binary.LittleEndian, 1.5) + _ = binary.Write(&buf, binary.LittleEndian, 2.5) + row1Start := int32(buf.Len()) + buf.WriteByte(1) // row 1 nDims + _ = binary.Write(&buf, binary.LittleEndian, int32(1)) + _ = binary.Write(&buf, binary.LittleEndian, 3.5) + values := buf.Bytes() + + layout := qwpColumnLayout{ + info: &info, + values: values, + arrayRowStart: []int32{0, row1Start}, + arrayElems: []int32{2, 1}, + nonNullCount: 2, + } + batch := newSingleColumnBatch(info, layout, 2) + col := batch.Column(0) + + dst := make([]float64, 0, 8) + dst = col.Float64ArrayInto(0, dst) + if len(dst) != 2 || dst[0] != 1.5 || dst[1] != 2.5 { + t.Fatalf("row 0 into dst = %v", dst) + } + // Append-style: a second call without truncating extends dst. + dst = col.Float64ArrayInto(1, dst) + if len(dst) != 3 || dst[2] != 3.5 { + t.Fatalf("row 1 appended dst = %v", dst) + } + // Hot-loop pattern: truncate before each row to reuse the backing + // array. Capacity must be preserved across the truncation. + beforeCap := cap(dst) + dst = dst[:0] + dst = col.Float64ArrayInto(0, dst) + if len(dst) != 2 || cap(dst) != beforeCap { + t.Fatalf("reuse: len=%d cap=%d (was %d)", len(dst), cap(dst), beforeCap) + } +} + +// TestQwpColumnFloat64ArrayIntoNull verifies that a NULL row leaves +// dst unchanged (no zero-fill, no truncation) — distinct from the +// per-cell Float64Array which returns nil for NULL. +func TestQwpColumnFloat64ArrayIntoNull(t *testing.T) { + info := qwpColumnSchemaInfo{name: "a", wireType: qwpTypeDoubleArray} + // Null bitmap has bit 0 set → row 0 is NULL. 
+ layout := qwpColumnLayout{ + info: &info, + values: []byte{}, + arrayRowStart: []int32{0}, + arrayElems: []int32{0}, + nullBitmap: []byte{0x01}, + nonNullCount: 0, + } + batch := newSingleColumnBatch(info, layout, 1) + col := batch.Column(0) + + dst := []float64{99.0, 99.0} + got := col.Float64ArrayInto(0, dst) + if len(got) != 2 || got[0] != 99.0 || got[1] != 99.0 { + t.Fatalf("NULL row mutated dst = %v", got) + } +} + +// TestQwpColumnInt64ArrayInto mirrors the Float64ArrayInto test for +// LONG_ARRAY columns. +func TestQwpColumnInt64ArrayInto(t *testing.T) { + info := qwpColumnSchemaInfo{name: "a", wireType: qwpTypeLongArray} + var buf bytes.Buffer + buf.WriteByte(1) + _ = binary.Write(&buf, binary.LittleEndian, int32(3)) + for _, v := range []int64{10, 20, 30} { + _ = binary.Write(&buf, binary.LittleEndian, v) + } + values := buf.Bytes() + + layout := qwpColumnLayout{ + info: &info, + values: values, + arrayRowStart: []int32{0}, + arrayElems: []int32{3}, + nonNullCount: 1, + } + batch := newSingleColumnBatch(info, layout, 1) + col := batch.Column(0) + + dst := col.Int64ArrayInto(0, nil) + want := []int64{10, 20, 30} + if len(dst) != len(want) { + t.Fatalf("Int64ArrayInto len = %d, want %d", len(dst), len(want)) + } + for i, w := range want { + if dst[i] != w { + t.Fatalf("Int64ArrayInto[%d] = %d, want %d", i, dst[i], w) + } + } +} + +// --- CopyAll --- + +// TestQwpColumnBatchCopyAllSurvivesPoolReuse is the contract CopyAll +// exists to satisfy: a snapshot taken from batch N remains valid and +// correct after batch N's pool-owned layout slices are reused for +// batch N+1. The live batch aliases the decoder's layout pool, so +// without the copy the snapshot's nonNullIdx / symbolRowIds / +// timestampBuf entries would read batch N+1 data. +func TestQwpColumnBatchCopyAllSurvivesPoolReuse(t *testing.T) { + // Build a nullable Int64 column so nonNullIdx is non-trivial and + // we can observe it getting overwritten. 
+ info := qwpColumnSchemaInfo{name: "v", wireType: qwpTypeLong} + rowBytes := [][]byte{ + binary.LittleEndian.AppendUint64(nil, uint64(100)), + nil, // NULL + binary.LittleEndian.AppendUint64(nil, uint64(300)), + } + layout := buildNullableLayout(&info, rowBytes) + batch := newSingleColumnBatch(info, layout, 3) + + snapshot := batch.CopyAll() + + // Simulate the decoder overwriting the pool-owned fields in place, + // the same way qwpColumnLayout.clear() + parseNullSection would. + for i := range batch.layouts[0].nonNullIdx { + batch.layouts[0].nonNullIdx[i] = 0xBAD + } + batch.layouts[0].values = []byte{0xDE, 0xAD, 0xBE, 0xEF, 0, 0, 0, 0} + + // Snapshot must still see the original values. + if got := snapshot.Int64(0, 0); got != 100 { + t.Fatalf("snapshot.Int64(0,0) = %d, want 100", got) + } + if !snapshot.IsNull(0, 1) { + t.Fatal("snapshot row 1 should be NULL") + } + if got := snapshot.Int64(0, 2); got != 300 { + t.Fatalf("snapshot.Int64(0,2) = %d, want 300", got) + } + if snapshot.RowCount() != 3 || snapshot.ColumnCount() != 1 { + t.Fatalf("snapshot row/col count = (%d, %d), want (3, 1)", + snapshot.RowCount(), snapshot.ColumnCount()) + } + if snapshot.ColumnName(0) != "v" { + t.Fatalf("snapshot column name = %q", snapshot.ColumnName(0)) + } +} + +// TestQwpColumnBatchCopyAllGorillaTimestampSurvivesPoolReuse covers +// the Gorilla-TIMESTAMP corner of CopyAll. For Gorilla-encoded +// columns the decoder sets layout.values to alias layout.timestampBuf +// (see parseTimestamp), so the snapshot must re-point values at the +// CLONED timestampBuf. Without that re-point, decoding a second frame +// into the same QwpColumnBatch overwrites the source's timestampBuf +// in place and the snapshot's Int64 accessor starts reading batch +// N+1 values. 
+func TestQwpColumnBatchCopyAllGorillaTimestampSurvivesPoolReuse(t *testing.T) { + // Small, regular DoDs push the encoder onto the Gorilla path; + // nonNullCount >= 3 is required for Gorilla (parseTimestamp + // rejects otherwise). + orig := []int64{1_000_000, 1_000_100, 1_000_200, 1_000_310, 1_000_520} + origRows := make([]func(*qwpColumnBuffer), len(orig)) + for i, v := range orig { + v := v + origRows[i] = func(c *qwpColumnBuffer) { c.addLong(v) } + } + frame1 := encodeSingleColumnBatch(t, "ts", qwpTypeTimestamp, false, origRows) + + // A second batch whose values are nowhere near the first, so a + // stale alias produces obviously-wrong reads rather than + // coincidentally-matching values. + fresh := []int64{5_000_000, 5_000_999, 5_001_888, 5_002_555, 5_003_333} + freshRows := make([]func(*qwpColumnBuffer), len(fresh)) + for i, v := range fresh { + v := v + freshRows[i] = func(c *qwpColumnBuffer) { c.addLong(v) } + } + frame2 := encodeSingleColumnBatch(t, "ts", qwpTypeTimestamp, false, freshRows) + + dec := newTestQueryDecoder() + var batch QwpColumnBatch + if err := dec.decode(frame1, &batch); err != nil { + t.Fatalf("decode 1: %v", err) + } + // Precondition: the first decode must actually have taken the + // Gorilla path. If encoder heuristics change and this falls back + // to the uncompressed branch, the test no longer covers the bug. + if len(batch.layouts[0].timestampBuf) == 0 { + t.Fatal("test precondition: expected Gorilla path to populate timestampBuf") + } + + snapshot := batch.CopyAll() + + // Decode a second frame into the SAME batch. The decoder reuses + // batch.layouts[0].timestampBuf in place, so the source's backing + // array is now clobbered. 
+ if err := dec.decode(frame2, &batch); err != nil { + t.Fatalf("decode 2: %v", err) + } + + for i, w := range orig { + if got := snapshot.Int64(0, i); got != w { + t.Fatalf("snapshot.Int64(0, %d) = %d, want %d", i, got, w) + } + } +} + +// TestQwpColumnBatchCopyAllRawSurvivesPayloadReuse covers the raw +// (non-zstd) sibling of TestQwpColumnBatchCopyAllZstdSurvivesPoolReuse. +// The egress I/O loop reads each WS frame into a buffer borrowed from +// qwpEgressIO.readBufPool; on the raw path the decoded batch's column +// slices (values, stringBytes, nullBitmap) alias that pooled buffer +// directly. releaseBuffer returns the buffer to the pool, and the next +// inbound frame is decoded into the same backing array in place. A +// CopyAll result the caller retained from the released batch must +// remain valid across that recycle — i.e. CopyAll must deep-clone the +// payload bytes on the raw path the same way it already does on the +// zstd path. +// +// Reproduces the in-place clobber without touching the I/O loop: +// allocate one backing array, write frame 1 into it, hand the slice to +// the decoder, snapshot, then overwrite the array's bytes with frame 2. +// snapshot.Int64 reads its values from the same backing array the +// decoder aliased; without the fix the post-clobber read returns the +// frame-2 little-endian word at that offset, not the original. 
+func TestQwpColumnBatchCopyAllRawSurvivesPayloadReuse(t *testing.T) { + frame1 := encodeSingleColumnBatch(t, "v", qwpTypeLong, false, + []func(*qwpColumnBuffer){ + func(c *qwpColumnBuffer) { c.addLong(111) }, + func(c *qwpColumnBuffer) { c.addLong(222) }, + }) + frame2 := encodeSingleColumnBatch(t, "v", qwpTypeLong, false, + []func(*qwpColumnBuffer){ + func(c *qwpColumnBuffer) { c.addLong(-9999) }, + func(c *qwpColumnBuffer) { c.addLong(-8888) }, + }) + if len(frame2) < len(frame1) { + t.Fatalf("test precondition: frame2 (%d) must be >= frame1 (%d) so the clobber overlaps the column data", len(frame2), len(frame1)) + } + + // One backing array that stands in for a recycled readBufPool + // buffer: it holds frame1 first, then the next frame is read into + // the same memory in place. + pooled := make([]byte, len(frame2)) + copy(pooled, frame1) + payload := pooled[:len(frame1)] + + dec := newTestQueryDecoder() + var b QwpColumnBatch + if err := dec.decode(payload, &b); err != nil { + t.Fatalf("decode 1: %v", err) + } + if len(b.zstdScratch) != 0 { + t.Fatalf("test precondition: expected raw (non-zstd) path; zstdScratch=%d", len(b.zstdScratch)) + } + + snapshot := b.CopyAll() + if got := snapshot.Int64(0, 0); got != 111 { + t.Fatalf("pre-clobber snapshot.Int64(0,0) = %d, want 111", got) + } + if got := snapshot.Int64(0, 1); got != 222 { + t.Fatalf("pre-clobber snapshot.Int64(0,1) = %d, want 222", got) + } + + // Recycle: the I/O loop hands the buffer back to readBufPool and + // the reader's qwpReadFrameInto writes the next frame into the + // same backing array. Simulate that with a copy(). + copy(pooled, frame2) + + // Snapshot must still report frame-1 values. 
+ if got := snapshot.Int64(0, 0); got != 111 { + t.Fatalf("post-clobber snapshot.Int64(0,0) = %d, want 111 (CopyAll didn't clone the raw payload)", got) + } + if got := snapshot.Int64(0, 1); got != 222 { + t.Fatalf("post-clobber snapshot.Int64(0,1) = %d, want 222 (CopyAll didn't clone the raw payload)", got) + } +} + +// buildDecimalGeohashFrame produces a one-row RESULT_BATCH frame with +// a DECIMAL64 column (given scale) and a GEOHASH column (given precision +// bits). The decoder reads the per-batch scale / precision off the DATA +// section and stores them on qwpColumnLayout, which is what the race +// test below observes concurrently. +func buildDecimalGeohashFrame(t *testing.T, scale uint32, precision int8, unscaled int64) []byte { + t.Helper() + tb := newQwpTableBuffer("t") + dcol, err := tb.getOrCreateColumn("d", qwpTypeDecimal64, false) + if err != nil { + t.Fatalf("getOrCreateColumn d: %v", err) + } + if err := dcol.addDecimal(NewDecimalFromInt64(unscaled, scale)); err != nil { + t.Fatalf("addDecimal: %v", err) + } + gcol, err := tb.getOrCreateColumn("g", qwpTypeGeohash, false) + if err != nil { + t.Fatalf("getOrCreateColumn g: %v", err) + } + if err := gcol.addGeohash(uint64(unscaled), precision); err != nil { + t.Fatalf("addGeohash: %v", err) + } + tb.commitRow() + var enc qwpEncoder + ingress := enc.encodeTable(tb) + return wrapAsResultBatch(ingress, 1, 0) +} + +// TestQwpColumnBatchCopyAllScaleAndPrecisionAreRaceFree exercises the +// concurrency invariant that commit 58e1915 ("Fix data race on decimal +// scale and geohash precision") added: a held CopyAll snapshot +// must be safe to read while the decoder writes the next batch's scale +// / precision into the source QwpColumnBatch. +// +// Before that fix both fields lived on the connection-scoped +// qwpColumnSchemaInfo, which the decoder mutated per batch and which +// every snapshot aliased via layouts[i].info — so this test paired +// with `go test -race` flagged the write/read overlap. 
Post-fix the +// fields are on qwpColumnLayout and CopyAll takes value copies, so the +// snapshot's accessors read memory the decoder never touches again. +// +// Without -race this test is still meaningful: a snapshot must keep +// its frame-A values even after frame B is decoded into the source +// batch. +func TestQwpColumnBatchCopyAllScaleAndPrecisionAreRaceFree(t *testing.T) { + frameA := buildDecimalGeohashFrame(t, 2, 20, 12345) + frameB := buildDecimalGeohashFrame(t, 7, 40, 99999) + + dec := newTestQueryDecoder() + var batch QwpColumnBatch + if err := dec.decode(frameA, &batch); err != nil { + t.Fatalf("decode A: %v", err) + } + if s := batch.DecimalScale(0); s != 2 { + t.Fatalf("A scale = %d, want 2", s) + } + if p := batch.GeohashPrecisionBits(1); p != 20 { + t.Fatalf("A precision = %d, want 20", p) + } + + snapshot := batch.CopyAll() + + const readers = 4 + var wg sync.WaitGroup + stop := make(chan struct{}) + for r := 0; r < readers; r++ { + wg.Add(1) + go func() { + defer wg.Done() + for { + select { + case <-stop: + return + default: + } + if s := snapshot.DecimalScale(0); s != 2 { + t.Errorf("snapshot.DecimalScale = %d, want 2", s) + return + } + if p := snapshot.GeohashPrecisionBits(1); p != 20 { + t.Errorf("snapshot.GeohashPrecisionBits = %d, want 20", p) + return + } + } + }() + } + + // Repeatedly re-decode frame B into the same batch. Each decode + // writes frame-B scale / precision into the layout; -race catches + // any overlap with the readers above. 
+ for i := 0; i < 200; i++ { + if err := dec.decode(frameB, &batch); err != nil { + close(stop) + wg.Wait() + t.Fatalf("decode B [%d]: %v", i, err) + } + if s := batch.DecimalScale(0); s != 7 { + close(stop) + wg.Wait() + t.Fatalf("live batch scale = %d, want 7", s) + } + if p := batch.GeohashPrecisionBits(1); p != 40 { + close(stop) + wg.Wait() + t.Fatalf("live batch precision = %d, want 40", p) + } + } + + close(stop) + wg.Wait() +} + +// --- Column handle --- + +// TestQwpColumnHandleMirrorsBatchAccessors asserts the captured column +// handle's IsNull/Int64 match the batch-level (col, row) accessors on a +// nullable LONG column (NULL rows included); VARCHAR String/Str are spot-checked. +func TestQwpColumnHandleMirrorsBatchAccessors(t *testing.T) { + // Nullable Int64 column: 5 rows (V N V V N), values 100/300/400. + intInfo := qwpColumnSchemaInfo{name: "v", wireType: qwpTypeLong} + rowBytes := [][]byte{ + binary.LittleEndian.AppendUint64(nil, 100), + nil, + binary.LittleEndian.AppendUint64(nil, 300), + binary.LittleEndian.AppendUint64(nil, 400), + nil, + } + intLayout := buildNullableLayout(&intInfo, rowBytes) + + // VARCHAR column: 3 rows, no nulls. + strInfo := qwpColumnSchemaInfo{name: "s", wireType: qwpTypeVarchar} + strLayout := buildStringLayout(&strInfo, []string{"foo", "bar", "baz"}) + + // Build a two-column batch manually (same rowCount across columns + // isn't a hard invariant here — the string accessor only indexes + // into its own column's values/offsets). 
+ batch := &QwpColumnBatch{ + requestId: 1, + rowCount: 5, + columnCount: 2, + columns: []qwpColumnSchemaInfo{intInfo, strInfo}, + layouts: []qwpColumnLayout{intLayout, strLayout}, + } + + icol := batch.Column(0) + if icol.Name() != "v" { + t.Fatalf("Name = %q", icol.Name()) + } + if icol.Type() != byte(qwpTypeLong) { + t.Fatalf("Type = %#x", icol.Type()) + } + if icol.RowCount() != 5 { + t.Fatalf("RowCount = %d", icol.RowCount()) + } + if icol.NonNullCount() != 3 { + t.Fatalf("NonNullCount = %d", icol.NonNullCount()) + } + if !icol.HasNulls() { + t.Fatal("HasNulls should be true for nullable column") + } + for row := 0; row < 5; row++ { + if icol.IsNull(row) != batch.IsNull(0, row) { + t.Fatalf("IsNull mismatch at %d", row) + } + if icol.Int64(row) != batch.Int64(0, row) { + t.Fatalf("Int64 mismatch at %d: col=%d batch=%d", + row, icol.Int64(row), batch.Int64(0, row)) + } + } + + scol := batch.Column(1) + if scol.HasNulls() { + t.Fatal("HasNulls should be false for non-nullable column") + } + for row, want := range []string{"foo", "bar", "baz"} { + if got := scol.String(row); got != want { + t.Fatalf("String(%d) = %q, want %q", row, got, want) + } + if !bytes.Equal(scol.Str(row), []byte(want)) { + t.Fatalf("Str(%d) mismatch", row) + } + } +} + +// --- Bulk range accessors --- + +func TestQwpColumnRangeNoNulls(t *testing.T) { + intInfo := qwpColumnSchemaInfo{name: "v", wireType: qwpTypeLong} + // 6 rows of 8 bytes, values 10..60 step 10. 
+ values := make([]byte, 48) + for i := 0; i < 6; i++ { + binary.LittleEndian.PutUint64(values[i*8:], uint64((i+1)*10)) + } + layout := buildFixedLayout(&intInfo, values, 6) + batch := newSingleColumnBatch(intInfo, layout, 6) + + col := batch.Column(0) + got := col.Int64Range(1, 5, nil) + want := []int64{20, 30, 40, 50} + if len(got) != len(want) { + t.Fatalf("len = %d, want %d", len(got), len(want)) + } + for i, w := range want { + if got[i] != w { + t.Fatalf("Int64Range[%d] = %d, want %d", i, got[i], w) + } + } + + // Empty / reversed ranges return dst unchanged. + if out := col.Int64Range(3, 3, []int64{7}); len(out) != 1 || out[0] != 7 { + t.Fatalf("empty range altered dst: %v", out) + } + if out := col.Int64Range(5, 2, nil); len(out) != 0 { + t.Fatalf("reversed range should return empty, got %v", out) + } + + // Append into a prealloc'd buffer: no realloc should happen. + dst := make([]int64, 0, 6) + dst = col.Int64Range(0, 6, dst) + if cap(dst) != 6 { + t.Fatalf("cap grew unexpectedly: %d", cap(dst)) + } + for i, w := range []int64{10, 20, 30, 40, 50, 60} { + if dst[i] != w { + t.Fatalf("full range [%d] = %d, want %d", i, dst[i], w) + } + } +} + +func TestQwpColumnInt64RangeWithNulls(t *testing.T) { + info := qwpColumnSchemaInfo{name: "v", wireType: qwpTypeLong} + rowBytes := [][]byte{ + binary.LittleEndian.AppendUint64(nil, 100), + nil, + binary.LittleEndian.AppendUint64(nil, 300), + binary.LittleEndian.AppendUint64(nil, 400), + nil, + } + layout := buildNullableLayout(&info, rowBytes) + batch := newSingleColumnBatch(info, layout, 5) + + col := batch.Column(0) + dst := col.Int64Range(0, 5, nil) + // NULL rows become 0 (matching the per-cell Int64 accessor). 
+ want := []int64{100, 0, 300, 400, 0} + for i, w := range want { + if dst[i] != w { + t.Fatalf("Int64Range[%d] = %d, want %d", i, dst[i], w) + } + } +} + +func TestQwpColumnFloat64Range(t *testing.T) { + info := qwpColumnSchemaInfo{name: "d", wireType: qwpTypeDouble} + values := make([]byte, 24) + binary.LittleEndian.PutUint64(values[0:], math.Float64bits(1.1)) + binary.LittleEndian.PutUint64(values[8:], math.Float64bits(2.2)) + binary.LittleEndian.PutUint64(values[16:], math.Float64bits(3.3)) + layout := buildFixedLayout(&info, values, 3) + batch := newSingleColumnBatch(info, layout, 3) + + col := batch.Column(0) + dst := col.Float64Range(0, 3, nil) + want := []float64{1.1, 2.2, 3.3} + for i, w := range want { + if dst[i] != w { + t.Fatalf("Float64Range[%d] = %v, want %v", i, dst[i], w) + } + } +} + +func TestQwpColumnInt32Range(t *testing.T) { + info := qwpColumnSchemaInfo{name: "i", wireType: qwpTypeInt} + values := make([]byte, 16) + for i := 0; i < 4; i++ { + binary.LittleEndian.PutUint32(values[i*4:], uint32(i*111)) + } + layout := buildFixedLayout(&info, values, 4) + batch := newSingleColumnBatch(info, layout, 4) + + col := batch.Column(0) + dst := col.Int32Range(1, 4, nil) + want := []int32{111, 222, 333} + for i, w := range want { + if dst[i] != w { + t.Fatalf("Int32Range[%d] = %d, want %d", i, dst[i], w) + } + } +} + +func TestQwpColumnFloat32Range(t *testing.T) { + info := qwpColumnSchemaInfo{name: "f", wireType: qwpTypeFloat} + values := make([]byte, 12) + binary.LittleEndian.PutUint32(values[0:], math.Float32bits(1.5)) + binary.LittleEndian.PutUint32(values[4:], math.Float32bits(-2.5)) + binary.LittleEndian.PutUint32(values[8:], math.Float32bits(3.25)) + layout := buildFixedLayout(&info, values, 3) + batch := newSingleColumnBatch(info, layout, 3) + + col := batch.Column(0) + dst := col.Float32Range(0, 3, nil) + want := []float32{1.5, -2.5, 3.25} + for i, w := range want { + if dst[i] != w { + t.Fatalf("Float32Range[%d] = %v, want %v", i, dst[i], w) + 
} + } +} + +// TestQwpColumnRangeOOBPanicsInNoNullsPath pins the safety contract +// of the no-nulls fast path: misuse with toRow > rowCount must panic +// the same way the per-cell accessor does, instead of silently reading +// past the values buffer via unsafe.Slice. +func TestQwpColumnRangeOOBPanicsInNoNullsPath(t *testing.T) { + cases := []struct { + name string + wireType qwpTypeCode + rowBytes int + run func(col QwpColumn) + }{ + {"Int64Range", qwpTypeLong, 8, func(col QwpColumn) { col.Int64Range(0, 5, nil) }}, + {"Float64Range", qwpTypeDouble, 8, func(col QwpColumn) { col.Float64Range(0, 5, nil) }}, + {"Int32Range", qwpTypeInt, 4, func(col QwpColumn) { col.Int32Range(0, 5, nil) }}, + {"Float32Range", qwpTypeFloat, 4, func(col QwpColumn) { col.Float32Range(0, 5, nil) }}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + info := qwpColumnSchemaInfo{name: "v", wireType: tc.wireType} + values := make([]byte, 2*tc.rowBytes) // exactly 2 rows wide + layout := buildFixedLayout(&info, values, 2) + batch := newSingleColumnBatch(info, layout, 2) + col := batch.Column(0) + + defer func() { + if r := recover(); r == nil { + t.Fatalf("%s: expected panic for toRow > rowCount, got none", tc.name) + } + }() + tc.run(col) + }) + } +} + +// TestQwpColumnRangeZeroAllocWhenPrealloc asserts Range accessors +// don't allocate when dst has sufficient capacity — the intended usage +// pattern for steady-state row sweeps. 
+func TestQwpColumnRangeZeroAllocWhenPrealloc(t *testing.T) { + info := qwpColumnSchemaInfo{name: "v", wireType: qwpTypeLong} + values := make([]byte, 8*100) + for i := 0; i < 100; i++ { + binary.LittleEndian.PutUint64(values[i*8:], uint64(i)) + } + layout := buildFixedLayout(&info, values, 100) + batch := newSingleColumnBatch(info, layout, 100) + + col := batch.Column(0) + buf := make([]int64, 0, 100) + allocs := testing.AllocsPerRun(100, func() { + buf = buf[:0] + buf = col.Int64Range(0, 100, buf) + }) + if allocs != 0 { + t.Fatalf("Int64Range with prealloc dst allocated %v/run, want 0", allocs) + } +} + +// --- Zero-alloc contract --- + +func TestQwpColumnBatchZeroAlloc(t *testing.T) { + // The Int64, Float64, and Str accessors must not allocate on the + // hot path. Str allocates only when crossing into String (the + // materialising helper) — we exclude that here. + intInfo := qwpColumnSchemaInfo{name: "i", wireType: qwpTypeLong} + intValues := make([]byte, 8) + binary.LittleEndian.PutUint64(intValues, 42) + intLayout := buildFixedLayout(&intInfo, intValues, 1) + + strInfo := qwpColumnSchemaInfo{name: "s", wireType: qwpTypeVarchar} + strLayout := buildStringLayout(&strInfo, []string{"hello"}) + + batch := &QwpColumnBatch{ + requestId: 1, + rowCount: 1, + columnCount: 2, + columns: []qwpColumnSchemaInfo{intInfo, strInfo}, + layouts: []qwpColumnLayout{intLayout, strLayout}, + } + + allocs := testing.AllocsPerRun(100, func() { + _ = batch.Int64(0, 0) + _ = batch.Str(1, 0) + _ = batch.IsNull(0, 0) + _ = batch.NonNullCount(0) + }) + if allocs != 0 { + t.Fatalf("hot-path accessors allocated %v times/run, want 0", allocs) + } +} + +// --- Mis-typed / out-of-bounds accessor contract --- + +// assertPanics runs fn, fails if it does not panic, and (when wantSubstr +// is non-empty) fails if the recovered value's string form does not +// contain wantSubstr. Pinning the substring stops a test from passing on +// an unrelated panic (e.g. 
a nil deref) instead of the guard it targets. +func assertPanics(t *testing.T, wantSubstr string, fn func()) { + t.Helper() + defer func() { + r := recover() + if r == nil { + t.Fatalf("expected panic containing %q, got none", wantSubstr) + } + if msg := fmt.Sprintf("%v", r); wantSubstr != "" && !strings.Contains(msg, wantSubstr) { + t.Fatalf("panic %q does not contain %q", msg, wantSubstr) + } + }() + fn() +} + +// TestQwpColumnRangeTypeMismatchPanics pins the *Range width guard: a +// Range accessor on a column whose wire type is not the matching fixed +// width panics with a typed message (carrying the column name + wire +// type) rather than the opaque slice-bounds panic the unguarded memmove +// path produced — most visibly Int64Range on a bit-packed BOOLEAN, +// whose dense region is far shorter than toRow*8. +func TestQwpColumnRangeTypeMismatchPanics(t *testing.T) { + mkCol := func(wt qwpTypeCode, rows int) QwpColumn { + info := qwpColumnSchemaInfo{name: "c", wireType: wt} + // 8 bytes/row of backing storage regardless of the column's real + // width, so a too-narrow read could *silently* succeed without + // the guard. That proves it is the guard, not an incidental OOB, + // that fires. 
+ layout := buildFixedLayout(&info, make([]byte, rows*8), rows) + return newSingleColumnBatch(info, layout, rows).Column(0) + } + for _, tc := range []struct { + name string + wt qwpTypeCode + run func(c QwpColumn) + }{ + {"Int64Range/BOOLEAN", qwpTypeBoolean, func(c QwpColumn) { c.Int64Range(0, 4, nil) }}, + {"Int64Range/INT", qwpTypeInt, func(c QwpColumn) { c.Int64Range(0, 4, nil) }}, + {"Int64Range/SYMBOL", qwpTypeSymbol, func(c QwpColumn) { c.Int64Range(0, 4, nil) }}, + {"Float64Range/FLOAT", qwpTypeFloat, func(c QwpColumn) { c.Float64Range(0, 4, nil) }}, + {"Int32Range/LONG", qwpTypeLong, func(c QwpColumn) { c.Int32Range(0, 4, nil) }}, + {"Int32Range/BOOLEAN", qwpTypeBoolean, func(c QwpColumn) { c.Int32Range(0, 4, nil) }}, + {"Float32Range/DOUBLE", qwpTypeDouble, func(c QwpColumn) { c.Float32Range(0, 4, nil) }}, + } { + t.Run(tc.name, func(t *testing.T) { + c := mkCol(tc.wt, 4) + assertPanics(t, "fixed-width", func() { tc.run(c) }) + }) + } +} + +// TestQwpColumnRangeSameWidthReinterpretAllowed pins the deliberately +// permitted case: a Range accessor on a different type of the SAME +// element width passes the guard and reinterprets the raw bits, so the +// documented "numeric noise" contract for Int64Range on a DOUBLE column +// still holds. 
+func TestQwpColumnRangeSameWidthReinterpretAllowed(t *testing.T) { + info := qwpColumnSchemaInfo{name: "d", wireType: qwpTypeDouble} + values := make([]byte, 16) + binary.LittleEndian.PutUint64(values[0:], math.Float64bits(1.5)) + binary.LittleEndian.PutUint64(values[8:], math.Float64bits(2.5)) + layout := buildFixedLayout(&info, values, 2) + col := newSingleColumnBatch(info, layout, 2).Column(0) + + got := col.Int64Range(0, 2, nil) // 8-byte DOUBLE read as int64: allowed + if len(got) != 2 || + uint64(got[0]) != math.Float64bits(1.5) || + uint64(got[1]) != math.Float64bits(2.5) { + t.Fatalf("Int64Range on DOUBLE = %v, want raw float64 bits", got) + } +} + +// TestQwpArrayAccessorsOnNonArrayPanic pins the array-type guard: every +// array accessor, on both the QwpColumnBatch and QwpColumn surfaces, +// panics with a typed message when the column is not an array — instead +// of the opaque "index out of range [n] with length 0" from indexing +// the empty arrayRowStart side table. +func TestQwpArrayAccessorsOnNonArrayPanic(t *testing.T) { + info := qwpColumnSchemaInfo{name: "v", wireType: qwpTypeLong} + values := make([]byte, 8) + binary.LittleEndian.PutUint64(values, 42) + layout := buildFixedLayout(&info, values, 1) + batch := newSingleColumnBatch(info, layout, 1) + col := batch.Column(0) + + for _, tc := range []struct { + name string + run func() + }{ + {"batch.Float64Array", func() { batch.Float64Array(0, 0) }}, + {"batch.Int64Array", func() { batch.Int64Array(0, 0) }}, + {"batch.ArrayNDims", func() { batch.ArrayNDims(0, 0) }}, + {"batch.ArrayDim", func() { batch.ArrayDim(0, 0, 0) }}, + {"col.Float64Array", func() { col.Float64Array(0) }}, + {"col.Int64Array", func() { col.Int64Array(0) }}, + {"col.ArrayNDims", func() { col.ArrayNDims(0) }}, + {"col.ArrayDim", func() { col.ArrayDim(0, 0) }}, + {"col.Float64ArrayInto", func() { col.Float64ArrayInto(0, nil) }}, + {"col.Int64ArrayInto", func() { col.Int64ArrayInto(0, nil) }}, + } { + t.Run(tc.name, func(t 
*testing.T) { + assertPanics(t, "not an array type", tc.run) + }) + } +} + +// TestQwpArrayElementTypeReinterpretAllowed pins the permitted same-width +// reinterpretation across the two array element types: the guard checks +// "is an array", not "is THIS array type", so Int64Array on a +// DOUBLE_ARRAY column decodes the 8-byte elements as raw int64 bits +// rather than panicking — the array analogue of the *Range reinterpret. +func TestQwpArrayElementTypeReinterpretAllowed(t *testing.T) { + info := qwpColumnSchemaInfo{name: "a", wireType: qwpTypeDoubleArray} + var buf bytes.Buffer + buf.WriteByte(1) // nDims + _ = binary.Write(&buf, binary.LittleEndian, int32(2)) + _ = binary.Write(&buf, binary.LittleEndian, 1.5) + _ = binary.Write(&buf, binary.LittleEndian, 2.5) + layout := qwpColumnLayout{ + info: &info, + values: buf.Bytes(), + arrayRowStart: []int32{0}, + arrayElems: []int32{2}, + nonNullCount: 1, + } + batch := newSingleColumnBatch(info, layout, 1) + got := batch.Int64Array(0, 0) // must not panic + if len(got) != 2 || + uint64(got[0]) != math.Float64bits(1.5) || + uint64(got[1]) != math.Float64bits(2.5) { + t.Fatalf("Int64Array on DOUBLE_ARRAY = %v, want raw float64 bits", got) + } +} + +// TestQwpColumnBatchPerCellMistypeAndOOB characterises the per-cell +// fixed-width accessors under misuse — the behavior the package +// documents as "undefined" but which must never silently read out of +// bounds. Two regimes: a same-width mis-type reinterprets the bytes (no +// panic); a too-narrow column or an out-of-range row slices past the +// dense values region and surfaces Go's bounds-check panic rather than +// returning adjacent memory. Pinned so a future "optimisation" to +// unsafe per-cell indexing that drops the bounds check is caught. 
+func TestQwpColumnBatchPerCellMistypeAndOOB(t *testing.T) { + t.Run("same_width_reinterpret_no_panic", func(t *testing.T) { + info := qwpColumnSchemaInfo{name: "d", wireType: qwpTypeDouble} + values := make([]byte, 8) + binary.LittleEndian.PutUint64(values, math.Float64bits(1.5)) + layout := buildFixedLayout(&info, values, 1) + batch := newSingleColumnBatch(info, layout, 1) + if got := batch.Int64(0, 0); uint64(got) != math.Float64bits(1.5) { + t.Fatalf("Int64 on DOUBLE = %#x, want float64 bits %#x", + uint64(got), math.Float64bits(1.5)) + } + }) + + t.Run("too_narrow_type_panics", func(t *testing.T) { + // BYTE column: 1 byte/value, so an 8-byte Int64 read slices past + // the 2-byte dense region. + info := qwpColumnSchemaInfo{name: "b", wireType: qwpTypeByte} + layout := buildFixedLayout(&info, []byte{0x01, 0x02}, 2) + batch := newSingleColumnBatch(info, layout, 2) + assertPanics(t, "", func() { _ = batch.Int64(0, 0) }) + }) + + t.Run("oob_row_no_nulls_panics", func(t *testing.T) { + info := qwpColumnSchemaInfo{name: "v", wireType: qwpTypeLong} + layout := buildFixedLayout(&info, make([]byte, 8), 1) + batch := newSingleColumnBatch(info, layout, 1) + assertPanics(t, "", func() { _ = batch.Int64(0, 5) }) + }) + + t.Run("oob_row_nullable_panics", func(t *testing.T) { + info := qwpColumnSchemaInfo{name: "v", wireType: qwpTypeLong} + rowBytes := [][]byte{ + binary.LittleEndian.AppendUint64(nil, 100), + nil, + } + layout := buildNullableLayout(&info, rowBytes) + batch := newSingleColumnBatch(info, layout, 2) + assertPanics(t, "", func() { _ = batch.Int64(0, 99) }) + }) +} + +// --- CopyAll: symbol dict + array metadata --- + +// TestQwpColumnBatchCopyAllSymbolSurvivesPoolReuse covers the SYMBOL +// corner of CopyAll. A snapshot must keep resolving its rows to the +// right strings after the decoder (a) reuses the batch's pool-owned +// symbolRowIds for the next frame and (b) append-grows the +// connection-scoped dict. 
CopyAll clones symbolRowIds and snapshots the +// append-only dict view, so both survive. +func TestQwpColumnBatchCopyAllSymbolSurvivesPoolReuse(t *testing.T) { + globalDict := []string{"alpha", "beta", "gamma", "delta", "epsilon"} + + // frame1 (batch_seq 0): rows alpha,beta,alpha (ids 0,1,0); advertises + // dict ids 0..1. + tb1 := newQwpTableBuffer("t") + for _, id := range []int32{0, 1, 0} { + col, _ := tb1.getOrCreateColumn("s", qwpTypeSymbol, false) + col.addSymbolID(id) + tb1.commitRow() + } + var enc qwpEncoder + frame1 := wrapAsResultBatch(enc.encodeTableWithDeltaDict(tb1, globalDict, -1, 1), 1, 0) + + // frame2 (continuation, batch_seq 1): rows beta,epsilon (ids 1,4). + // Row 0's id differs from frame1's, so the snapshot reading "alpha" + // at row 0 proves it walks its own cloned symbolRowIds rather than + // the reused pool slice. Advertising ids 2..4 append-grows the dict + // heap past frame1's frozen prefix. + tb2 := newQwpTableBuffer("t") + for _, id := range []int32{1, 4} { + col, _ := tb2.getOrCreateColumn("s", qwpTypeSymbol, false) + col.addSymbolID(id) + tb2.commitRow() + } + frame2 := wrapAsResultBatch(enc.encodeTableWithDeltaDict(tb2, globalDict, 1, 4), 1, 1) + + dec := newTestQueryDecoder() + var b QwpColumnBatch + if err := dec.decode(frame1, &b); err != nil { + t.Fatalf("decode 1: %v", err) + } + want := []string{"alpha", "beta", "alpha"} + for i, w := range want { + if got := b.String(0, i); got != w { + t.Fatalf("live batch1 row %d = %q, want %q", i, got, w) + } + } + + snapshot := b.CopyAll() + + // Decode the continuation into the SAME batch: reuses b's pool-owned + // symbolRowIds in place and append-extends the decoder's dict. 
+ if err := dec.decode(frame2, &b); err != nil { + t.Fatalf("decode 2: %v", err) + } + if got := b.String(0, 0); got != "beta" { + t.Fatalf("live batch2 row 0 = %q, want %q", got, "beta") + } + if got := b.String(0, 1); got != "epsilon" { + t.Fatalf("live batch2 row 1 = %q, want %q", got, "epsilon") + } + + // Snapshot must still resolve frame1's per-row symbols. + for i, w := range want { + if got := snapshot.String(0, i); got != w { + t.Fatalf("snapshot row %d = %q, want %q (CopyAll didn't snapshot symbol state)", i, got, w) + } + } +} + +// TestQwpColumnBatchCopyAllArraySurvivesPoolReuse covers the ARRAY +// corner of CopyAll: a snapshot must keep its shape + elements after the +// decoder reuses the batch for the next frame. That clobbers two kinds +// of state at once — the pool-owned arrayRowStart / arrayElems side +// tables (overwritten in place) and the array bytes in `values` (which +// alias the recycled payload buffer). CopyAll clones the side tables and +// rebinds values onto a private payload clone. +func TestQwpColumnBatchCopyAllArraySurvivesPoolReuse(t *testing.T) { + // frame1: two 1-D DOUBLE_ARRAY rows of different lengths, so the + // arrayRowStart / arrayElems side tables carry distinct per-row values. + frame1 := encodeSingleColumnBatch(t, "a", qwpTypeDoubleArray, false, + []func(*qwpColumnBuffer){ + func(c *qwpColumnBuffer) { c.addDoubleArray(1, []int32{3}, []float64{1.5, 2.5, 3.5}) }, + func(c *qwpColumnBuffer) { c.addDoubleArray(1, []int32{2}, []float64{4.5, 5.5}) }, + }) + // frame2: different shapes and a larger byte footprint, so writing it + // into the recycled buffer fully overwrites frame1's bytes and the + // re-decode rewrites arrayRowStart / arrayElems with new values. 
+ frame2 := encodeSingleColumnBatch(t, "a", qwpTypeDoubleArray, false, + []func(*qwpColumnBuffer){ + func(c *qwpColumnBuffer) { c.addDoubleArray(1, []int32{5}, []float64{-1, -2, -3, -4, -5}) }, + func(c *qwpColumnBuffer) { c.addDoubleArray(1, []int32{4}, []float64{-6, -7, -8, -9}) }, + }) + if len(frame2) < len(frame1) { + t.Fatalf("precondition: frame2 (%d) must be >= frame1 (%d)", len(frame2), len(frame1)) + } + + // One backing array recycled across two decodes, standing in for the + // egress I/O loop's readBufPool buffer. + pooled := make([]byte, len(frame2)) + copy(pooled, frame1) + + dec := newTestQueryDecoder() + var b QwpColumnBatch + if err := dec.decode(pooled[:len(frame1)], &b); err != nil { + t.Fatalf("decode 1: %v", err) + } + if len(b.zstdScratch) != 0 { + t.Fatalf("precondition: expected raw (non-zstd) path; zstdScratch=%d", len(b.zstdScratch)) + } + + snapshot := b.CopyAll() + + assertArrayRow := func(label string, row int, wantDim int, want []float64) { + t.Helper() + if n := snapshot.ArrayNDims(0, row); n != 1 { + t.Fatalf("%s: snapshot ArrayNDims(row %d) = %d, want 1", label, row, n) + } + if d := snapshot.ArrayDim(0, row, 0); d != wantDim { + t.Fatalf("%s: snapshot ArrayDim(row %d) = %d, want %d", label, row, d, wantDim) + } + got := snapshot.Float64Array(0, row) + if len(got) != len(want) { + t.Fatalf("%s: snapshot Float64Array(row %d) len = %d, want %d", label, row, len(got), len(want)) + } + for i := range want { + if got[i] != want[i] { + t.Fatalf("%s: snapshot Float64Array(row %d)[%d] = %v, want %v", label, row, i, got[i], want[i]) + } + } + } + + assertArrayRow("pre-clobber", 0, 3, []float64{1.5, 2.5, 3.5}) + assertArrayRow("pre-clobber", 1, 2, []float64{4.5, 5.5}) + + // Recycle the buffer and re-decode into the SAME batch: overwrites the + // payload bytes the live batch aliased and rewrites its arrayRowStart + // / arrayElems in place. 
+ copy(pooled, frame2) + if err := dec.decode(pooled[:len(frame2)], &b); err != nil { + t.Fatalf("decode 2: %v", err) + } + if d := b.ArrayDim(0, 0, 0); d != 5 { + t.Fatalf("live batch2 ArrayDim(row 0) = %d, want 5", d) + } + + // The snapshot keeps frame1's shape + elements. + assertArrayRow("post-clobber", 0, 3, []float64{1.5, 2.5, 3.5}) + assertArrayRow("post-clobber", 1, 2, []float64{4.5, 5.5}) +} diff --git a/qwp_query_client.go b/qwp_query_client.go new file mode 100644 index 00000000..4b9335c6 --- /dev/null +++ b/qwp_query_client.go @@ -0,0 +1,1260 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "context" + "encoding/base64" + "errors" + "fmt" + "iter" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/klauspost/compress/zstd" +) + +// qwpQueryCleanupDrainTimeout bounds the drain that happens on +// close-path cleanup (QwpQuery.Close, iterator break-out, Exec-on- +// SELECT misuse). 
Deliberately independent of the caller's context so +// the dispatcher returns to idle and the client stays usable for a +// follow-up Query/Exec even when the caller's ctx has already expired +// by the time cleanup runs. 5s matches the Java client's +// shutdownJoinMs default. +const qwpQueryCleanupDrainTimeout = 5 * time.Second + +// QwpQueryClient is a QuestDB query-side (egress) client. It opens one +// WebSocket connection to /read/v1, runs a dedicated I/O goroutine +// pair (reader + dispatcher), and streams result batches to the caller +// via Query/Exec. The I/O goroutines read and decode ahead of the +// consumer up to the configured buffer-pool depth. +// +// Thread safety: not safe for concurrent Query or Exec calls on the +// same client. Open one client per query-issuing goroutine. Cancel +// (on the returned *QwpQuery) and Close are safe to call from other +// goroutines. +type QwpQueryClient struct { + cfg *qwpQueryClientConfig + + // transportPtr and ioPtr are atomically replaced by the failover + // orchestrator on reconnect. The session reads through the + // transport() / io() accessors so a swap mid-Query is observed + // as a clean generation boundary. Both pointers are set during + // construction (newQwpQueryClient) and never nil while the + // client is live. + transportPtr atomic.Pointer[qwpTransport] + ioPtr atomic.Pointer[qwpEgressIO] + + // genMu serialises generation lifecycle transitions: the + // destroy-old / build-new pair in reconnectAndReplay, and Close's + // set-closed + snapshot of the bound (transport, io) pair. nextEvent + // reads the atomic pointers under no lock; reconnect and Close grab + // this mutex so a transport fault cannot publish a fresh generation + // that a concurrent Close would never observe (and so leak forever), + // and so Close always tears down a consistent generation pair rather + // than a torn read straddling publishGeneration. 
Held only across the + // reconnect critical section and Close's flag-set+snapshot — never + // across a user-facing wait, since the I/O shutdown in both runs + // after the mutex is released. + genMu sync.Mutex + + // hostTracker is the failover.md §2 host-health / zone tracker + // shared by the initial connect and every failover reconnect. It + // drives endpoint selection via the (state, zone) priority lattice + // — the `zone=` locality hint is effective here (the SF ingress + // tracker is zone-blind by contrast). Constructed once in + // newQwpQueryClient and never replaced; its state (sticky-Healthy, + // topology classifications) deliberately persists across + // reconnects for the client's lifetime. Thread-safe internally. + hostTracker *qwpHostTracker + + // currentEndpointIdx tracks the index in cfg.endpoints currently + // bound. -1 before construction completes, set by connectWalk and + // updated by reconnectAndReplay. Read by the failover orchestrator + // to feed RecordMidStreamFailure with the just-failed index before + // the reconnect walk. + currentEndpointIdx atomic.Int32 + // serverInfo holds the SERVER_INFO from the bound generation. + // Nil when it was not consumed (serverInfoTimeout disabled or no + // parseable frame). Written by connectWalk and reconnectAndReplay; + // read via the public ServerInfo() accessor. + serverInfo atomic.Pointer[QwpServerInfo] + + // nextRequestId is the monotonic client-assigned request id + // handed to the I/O goroutine on each submit. Assigned from the + // user goroutine inside Query/Exec; not accessed from other + // goroutines (one query at a time). + nextRequestId int64 + + // binds is the reusable typed bind-parameter sink. Populated on + // the user goroutine by the QwpBindFunc passed to Query / Exec. + // buildRequest copies the encoded bytes into a fresh per-request + // slice before handing the request to the I/O goroutine, so a + // follow-up query's reset + re-encode cannot race the dispatcher. 
+ binds QwpBinds + + // closed guards Close against double-close and later Query/Exec. + closed atomic.Bool + // closeOnce ensures the teardown side effects (I/O shutdown, + // transport close) run at most once even under concurrent Close + // callers. + closeOnce sync.Once +} + +// transport returns the bound generation's transport. Callers should +// re-load on every use rather than caching, since the pointer is +// swapped atomically on transparent failover. Never returns nil for +// a live client; Close stores nil but the closed flag short-circuits +// any subsequent call before transport() is read. +func (c *QwpQueryClient) transport() *qwpTransport { + return c.transportPtr.Load() +} + +// io returns the bound generation's I/O goroutine pair. See transport(). +func (c *QwpQueryClient) io() *qwpEgressIO { + return c.ioPtr.Load() +} + +// publishGeneration installs a new generation by storing the bound +// transport, I/O pair, endpoint index and SERVER_INFO, in that order. +// Used by both the initial connect path and the failover reconnect +// path so the publish ordering stays consistent across both. Each +// store is individually atomic; the four are not published as a +// single unit. +// +// NOTE(review): this function does not lock genMu itself, so the +// "two concurrent transport faults cannot both spawn a new +// generation" guarantee has to come from the caller holding genMu +// (see the genMu field comment) — confirm at every call site. +func (c *QwpQueryClient) publishGeneration(r *qwpConnectResult) { + c.transportPtr.Store(r.transport) + c.ioPtr.Store(r.io) + c.currentEndpointIdx.Store(int32(r.endpointIdx)) + c.serverInfo.Store(r.serverInfo) +} + +// ServerInfo returns the SERVER_INFO frame consumed during the bound +// generation's WebSocket handshake, or nil if the client did not +// consume one (serverInfoTimeout disabled or no parseable frame). The +// returned pointer is owned by the client and is replaced atomically +// on each transparent failover reconnect; callers that need to retain +// a value across a possible reconnect should copy out the fields. +func (c *QwpQueryClient) ServerInfo() *QwpServerInfo { + return c.serverInfo.Load() +} + +// CurrentEndpoint returns the host:port string of the endpoint the +// client is currently bound to.
Updated atomically on each transparent +// failover reconnect. Returns the empty string before the constructor +// has completed. +func (c *QwpQueryClient) CurrentEndpoint() string { + idx := int(c.currentEndpointIdx.Load()) + if idx < 0 || idx >= len(c.cfg.endpoints) { + return "" + } + return c.cfg.endpoints[idx].String() +} + +// QwpBindFunc populates the typed bind parameters for a single Query +// or Exec call. The function is invoked on the caller's goroutine +// before the query is submitted. Setters must be invoked in strictly +// ascending index order starting at 0; the latched error on QwpBinds +// is surfaced as the query's first result. +type QwpBindFunc func(*QwpBinds) + +// QwpQueryOption is a functional option for Query / Exec that attaches +// per-call settings — currently just bind parameters. Named for prefix +// consistency with QwpQueryClientOption (the constructor option type). +type QwpQueryOption func(*qwpQueryOptions) + +// qwpQueryOptions collects the effective settings for a single Query +// or Exec invocation. Private so the public surface is the option +// constructors, not the struct itself. +type qwpQueryOptions struct { + bindFn QwpBindFunc +} + +// WithQwpQueryBinds attaches a bind-parameter setter to a Query or Exec +// call. The setter runs on the caller's goroutine and receives a reusable +// *QwpBinds sink. Placeholders in the SQL text are $1, $2, ...; the +// corresponding setter calls use 0-based indexes. Setters must be +// invoked in strictly ascending index order with no gaps; a duplicate +// or out-of-order index surfaces the error through the query result. +func WithQwpQueryBinds(fn QwpBindFunc) QwpQueryOption { + return func(o *qwpQueryOptions) { o.bindFn = fn } +} + +// QwpQueryClientOption is a functional option for NewQwpQueryClient. +// Deliberately a distinct type from LineSenderOption — the two clients +// share no transport code above qwpTransport, and using a different +// option type prevents misuse (e.g. 
passing an ingest option to the +// query constructor). +type QwpQueryClientOption func(*qwpQueryClientConfig) + +// WithQwpQueryAddress overrides the default "localhost:9000" server +// address. Accepts a single "host:port" or a comma-separated list of +// endpoints; the latter is equivalent to WithQwpQueryEndpoints. The +// connect walk uses the first endpoint matching the target= filter. +// Errors during parsing are deferred to validate(), so a malformed +// addr surfaces from the client constructor. +func WithQwpQueryAddress(addr string) QwpQueryClientOption { + return func(c *qwpQueryClientConfig) { + eps, err := parseEndpointList(addr, qwpDefaultPort) + if err != nil { + // Stash a sentinel single-entry list with the bad address + // so validate() surfaces a useful error from the + // originating field; the err itself is not wired through + // the options API. Keep at least one entry so validate's + // "no endpoints" path is not also tripped. + c.endpoints = []qwpEndpoint{{host: addr, port: 0}} + return + } + c.endpoints = eps + } +} + +// WithQwpQueryEndpoints sets the ordered list of endpoints the connect +// walk attempts. Each entry is a "host[:port]" string; missing port +// defaults to qwpDefaultPort. Errors during parsing are deferred to +// validate() so the client constructor surfaces them. Use this option +// when the configured endpoints are typed at the call site (e.g., a +// service-discovery layer); WithQwpQueryAddress with a comma-separated +// list is equivalent. +func WithQwpQueryEndpoints(addrs ...string) QwpQueryClientOption { + return func(c *qwpQueryClientConfig) { + joined := strings.Join(addrs, ",") + eps, err := parseEndpointList(joined, qwpDefaultPort) + if err != nil { + c.endpoints = []qwpEndpoint{{host: joined, port: 0}} + return + } + c.endpoints = eps + } +} + +// WithQwpQueryEndpointPath overrides the default "/read/v1" WebSocket +// upgrade path. 
Rarely needed — present for parity with Java's +// withEndpointPath and to support reverse-proxy rewrites. +func WithQwpQueryEndpointPath(path string) QwpQueryClientOption { + return func(c *qwpQueryClientConfig) { c.endpointPath = path } +} + +// WithQwpQueryAuth sets the raw Authorization HTTP header value sent +// on the WebSocket upgrade. Mutually exclusive with +// WithQwpQueryBasicAuth and WithQwpQueryBearerToken. +func WithQwpQueryAuth(authHeader string) QwpQueryClientOption { + return func(c *qwpQueryClientConfig) { c.authorization = authHeader } +} + +// WithQwpQueryBasicAuth enables HTTP Basic authentication. The server +// validates against the same user store that the Postgres wire +// protocol uses — a user created via CREATE USER ... WITH PASSWORD ... +// works unchanged. +func WithQwpQueryBasicAuth(username, password string) QwpQueryClientOption { + return func(c *qwpQueryClientConfig) { + c.httpUser = username + c.httpPass = password + } +} + +// WithQwpQueryBearerToken enables HTTP Bearer authentication with an +// OIDC access token. The server verifies the token via its configured +// OIDC provider. +func WithQwpQueryBearerToken(token string) QwpQueryClientOption { + return func(c *qwpQueryClientConfig) { c.httpToken = token } +} + +// WithQwpQueryClientID overrides the default X-QWP-Client-Id header +// sent on the WebSocket upgrade. Empty uses the module default. +func WithQwpQueryClientID(id string) QwpQueryClientOption { + return func(c *qwpQueryClientConfig) { c.clientID = id } +} + +// WithQwpQueryBufferPoolSize overrides the decode buffer pool depth. +// Larger pools let the dispatcher decode further ahead of a slow +// consumer; smaller pools reduce memory but stall the dispatcher +// sooner. Must be >= 1. +func WithQwpQueryBufferPoolSize(size int) QwpQueryClientOption { + return func(c *qwpQueryClientConfig) { c.bufferPoolSize = size } +} + +// WithQwpQueryMaxBatchRows asks the server to cap each RESULT_BATCH +// at the given row count. 
0 omits the header and lets the server use +// its own cap. Useful for latency-sensitive streaming consumers that +// want the first rows sooner. +func WithQwpQueryMaxBatchRows(rows int) QwpQueryClientOption { + return func(c *qwpQueryClientConfig) { c.maxBatchRows = rows } +} + +// WithQwpQueryInitialCredit opts the next query into credit-based +// egress flow control with the given initial byte budget. The server +// streams at most `bytes` of result payload before pausing; the +// client auto-replenishes by the size of each batch after the +// consumer releases it. 0 (the default) disables flow control. +func WithQwpQueryInitialCredit(bytes int64) QwpQueryClientOption { + return func(c *qwpQueryClientConfig) { c.initialCredit = bytes } +} + +// WithQwpQueryCompression selects the compression codec advertised to +// the server on the WebSocket upgrade. Accepted values: "raw" (default, +// no compression, accept-encoding header omitted), "zstd" (demand zstd, +// fall back to raw if the server cannot), "auto" (advertise both and +// let the server pick). Anything else surfaces as an error from the +// constructor. Matches Java QwpQueryClient.withCompression's +// preference argument. +func WithQwpQueryCompression(preference string) QwpQueryClientOption { + return func(c *qwpQueryClientConfig) { c.compression = preference } +} + +// WithQwpQueryCompressionLevel overrides the zstd compression level +// hint the client sends in the accept-encoding header. Ignored when +// the compression preference is "raw". Accepts [1, 22] matching +// Java; the server clamps down to its own supported range. +func WithQwpQueryCompressionLevel(level int) QwpQueryClientOption { + return func(c *qwpQueryClientConfig) { c.compressionLevel = level } +} + +// WithQwpQueryTls enables TLS with full certificate validation against +// the system cert pool. 
+func WithQwpQueryTls() QwpQueryClientOption { + return func(c *qwpQueryClientConfig) { c.tlsMode = tlsEnabled } +} + +// WithQwpQueryTarget restricts the connect walk to endpoints whose +// SERVER_INFO.role passes the given filter: QwpTargetAny (default, +// matches any role), QwpTargetPrimary (STANDALONE | PRIMARY | +// PRIMARY_CATCHUP), or QwpTargetReplica (REPLICA only). Mirrors Java's +// withTarget. An out-of-range value is surfaced by the client +// constructor via validate(). +// +// QwpTargetPrimary or QwpTargetReplica requires the server role from +// SERVER_INFO; if the client does not consume SERVER_INFO the role is +// unknown and a role-specific filter cannot be satisfied. +func WithQwpQueryTarget(target QwpTargetFilter) QwpQueryClientOption { + return func(c *qwpQueryClientConfig) { + c.target = target + } +} + +// WithQwpQueryFailover toggles transparent reconnect-and-replay on +// transport-terminal failure mid-query. Default true; matches Java's +// failover=on default. When false, transport errors surface directly +// through Batches() / Exec(). +func WithQwpQueryFailover(enabled bool) QwpQueryClientOption { + return func(c *qwpQueryClientConfig) { c.failoverEnabled = enabled } +} + +// WithQwpQueryFailoverMaxAttempts caps the number of executeOnce +// invocations per Query / Exec call. Counts the initial attempt plus +// every reconnect retry. Must be >= 1; the default +// (qwpDefaultFailoverMaxAttempts = 8) matches Java. +func WithQwpQueryFailoverMaxAttempts(n int) QwpQueryClientOption { + return func(c *qwpQueryClientConfig) { c.failoverMaxAttempts = n } +} + +// WithQwpQueryFailoverBackoff sets the exponential backoff between +// reconnect attempts. initial is the first sleep (doubled per retry); +// max is the ceiling. Defaults match Java +// (qwpDefaultFailoverInitialBackoff = 50ms, +// qwpDefaultFailoverMaxBackoff = 1s). 
+func WithQwpQueryFailoverBackoff(initial, max time.Duration) QwpQueryClientOption { + return func(c *qwpQueryClientConfig) { + c.failoverBackoffInitial = initial + c.failoverBackoffMax = max + } +} + +// WithQwpQueryFailoverMaxDuration caps the total wall-clock time the +// per-Query / Exec failover loop spends reconnecting and replaying. +// Whichever of this or WithQwpQueryFailoverMaxAttempts fires first +// ends the loop. 0 disables the time cap (failover then bounded only +// by attempts). Must be >= 0; the default +// (qwpDefaultFailoverMaxDuration = 30s) matches Java's +// DEFAULT_FAILOVER_MAX_DURATION_MS. +func WithQwpQueryFailoverMaxDuration(d time.Duration) QwpQueryClientOption { + return func(c *qwpQueryClientConfig) { c.failoverMaxDuration = d } +} + +// WithQwpQueryServerInfoTimeout overrides the SERVER_INFO read +// deadline applied during each WebSocket upgrade. Default +// qwpDefaultServerInfoTimeout (5s) matches Java's +// DEFAULT_SERVER_INFO_TIMEOUT_MS. Must be > 0: the server always emits +// SERVER_INFO as the first post-upgrade frame, so skipping the +// synchronous drain would leave that frame in the recv buffer where +// the I/O loop would later misread it as a query response. +func WithQwpQueryServerInfoTimeout(d time.Duration) QwpQueryClientOption { + return func(c *qwpQueryClientConfig) { c.serverInfoTimeout = d } +} + +// WithQwpQueryZone sets the client's opaque, case-insensitive +// locality hint (failover.md §1.1). When set and target != primary, +// the connect/reconnect walk prefers endpoints whose server-advertised +// zone (SERVER_INFO.zone_id under CAP_ZONE, or the X-QuestDB-Zone +// header on a 421 reject) matches, via the (state, zone) priority +// lattice. Empty (the default) is zone-blind. Mirrors the ingest +// WithQwpZone / zone= key so a connect string can be shared verbatim. 
+func WithQwpQueryZone(zone string) QwpQueryClientOption { + return func(c *qwpQueryClientConfig) { c.zone = zone } +} + +// WithQwpQueryAuthTimeout overrides the per-host upgrade-response-read +// bound (failover.md §1.1). It bounds only the wait between writing +// the WebSocket upgrade request and reading the response headers — not +// TCP connect, TLS handshake, or the SERVER_INFO read (see +// WithQwpQueryServerInfoTimeout). Must be > 0; the default +// (qwpDefaultAuthTimeoutMs = 15s) matches the ingest client and Java. +// Sub-millisecond durations round down and are rejected by validate(). +func WithQwpQueryAuthTimeout(d time.Duration) QwpQueryClientOption { + return func(c *qwpQueryClientConfig) { + c.authTimeoutMs = int(d.Milliseconds()) + } +} + +// WithQwpQueryReplayExec opts Exec into transparent replay on +// transport-terminal failure. Default false because non-idempotent +// statements (INSERT / UPDATE / DELETE / DDL) might double-execute +// if the server applied the statement before the transport drop was +// detected. Callers that know their statements are idempotent can +// opt in to match Java's transparent replay behaviour. +func WithQwpQueryReplayExec(enabled bool) QwpQueryClientOption { + return func(c *qwpQueryClientConfig) { c.replayExec = enabled } +} + +// WithQwpQueryTlsInsecureSkipVerify enables TLS but skips certificate +// validation. Intended for testing only. +func WithQwpQueryTlsInsecureSkipVerify() QwpQueryClientOption { + return func(c *qwpQueryClientConfig) { c.tlsMode = tlsInsecureSkipVerify } +} + +// NewQwpQueryClient constructs a QwpQueryClient from functional options +// and opens the WebSocket connection. Matches Java +// QwpQueryClient.newPlainText + connect(), but bundled into one call +// since Go does not usually separate construction from connection. 
+func NewQwpQueryClient(ctx context.Context, opts ...QwpQueryClientOption) (*QwpQueryClient, error) { + cfg := qwpQueryDefaultConfig() + for _, opt := range opts { + opt(cfg) + } + return newQwpQueryClient(ctx, cfg) +} + +// QwpQueryClientFromConf constructs a QwpQueryClient from a ws:: / +// wss:: config string and opens the WebSocket connection. See +// parseQwpQueryConf for the full key reference. +func QwpQueryClientFromConf(ctx context.Context, conf string) (*QwpQueryClient, error) { + cfg, err := parseQwpQueryConf(conf) + if err != nil { + return nil, err + } + return newQwpQueryClient(ctx, cfg) +} + +// newQwpQueryClient is the internal factory shared by both public +// entry points. It performs validation, runs the multi-endpoint +// connect walk, and spawns the I/O goroutines for the bound +// generation. The walk applies the target= role filter against the +// SERVER_INFO frame each endpoint emits. +func newQwpQueryClient(ctx context.Context, cfg *qwpQueryClientConfig) (*QwpQueryClient, error) { + if err := cfg.validate(); err != nil { + return nil, err + } + // Early probe: if we told the server we can accept zstd, round- + // trip a transient decoder so any klauspost/compress init failure + // surfaces here on the user goroutine rather than mid-stream on + // the first compressed batch. Matches Java's probeZstdAvailable + // in intent; cheaper in pure Go since there is no JNI library to + // load. Run before the dial so a misbehaving zstd binding does + // not leak a half-open WebSocket. + if cfg.compression != qwpCompressionRaw { + if err := probeZstdAvailable(); err != nil { + return nil, err + } + } + + c := &QwpQueryClient{ + cfg: cfg, + nextRequestId: 1, // match Java's QwpQueryClient.nextRequestId initial value + // Fresh tracker: every host starts Unknown with attempted=false, + // so the first PickNext sweep walks the addr= list in order + // (failover.md §2 selection priority — ties break on the + // user-supplied order). 
zone= and target= shape the (state, + // zone) lattice from here on. Mirrors Java connect()'s + // hostTracker==null branch (no BeginRound on a fresh tracker). + hostTracker: newQwpHostTracker(len(cfg.endpoints), cfg.zone, cfg.target), + } + c.currentEndpointIdx.Store(-1) + + // allowFallthroughReset=false: initial connect probes each endpoint + // exactly once (Java connect() parity), no re-sweep on a uniformly + // rejecting cluster. + result, err := connectWalk(ctx, cfg, c.hostTracker, nil, false) + if err != nil { + return nil, err + } + c.publishGeneration(result) + return c, nil +} + +// errClosedDuringFailover is the typed cause surfaced to the in-flight +// query when Close races a reconnect: the client is shutting down, so +// the failover loop must terminate rather than bind a fresh generation +// nothing will ever tear down. Distinct from the "client is closed" +// string returned by Query/Exec at submit time so logs can tell a +// close-before-submit apart from a close-mid-failover. +var errClosedDuringFailover = errors.New( + "qwp query: client closed during failover") + +// reconnectAndReplay tears down the current generation, demotes the +// just-failed endpoint and walks the host tracker by (state, zone) +// priority (failover.md §2; the demoted host drops to TransportError +// so a healthier or same-zone peer is preferred, but it stays a +// candidate and is retried if nothing better binds — including the +// n=1 case), publishes the new generation, and resubmits the +// in-flight query with a fresh requestId. Returns the new +// generation's QwpServerInfo (nil if none consumed), or a non-nil error if the +// client is closed, the walk fails, or the replay submit fails. Holds c.genMu +// for the duration of the swap so two concurrent transport faults serialise +// and so a concurrent Close cannot interleave with the swap. +// +// Close coordination: Close sets c.closed and snapshots the bound +// generation under c.genMu.
Because this function holds c.genMu for +// its whole body, c.closed cannot change underneath it, so a single +// check before any work decides the outcome: +// +// - closed already set (Close won the lock first): Close has +// already snapshotted and owns teardown of the bound generation. +// Bail before touching it (a second teardown here would race +// Close's unlocked tr.close()) and before standing up a fresh +// generation Close could never reach. +// +// - closed set only after this returns (Close is blocked on +// c.genMu): we publish normally; Close then snapshots and tears +// down the generation we just published. +// +// The post-connectWalk re-check is belt-and-suspenders: with closed +// written under c.genMu it is unreachable, but it keeps this function +// locally correct (no leaked generation) even if a future closed- +// setter forgoes the lock. +// +// Mirrors the high-level shape of Java's reconnectViaTracker + +// executeOnce composition. +func (c *QwpQueryClient) reconnectAndReplay(ctx context.Context, s *qwpQuerySession, failedIdx int) (*QwpServerInfo, error) { + c.genMu.Lock() + defer c.genMu.Unlock() + + if c.closed.Load() { + return nil, errClosedDuringFailover + } + + // Tear down the dying generation. Use the cleanup-bounded ctx + // independent of the user's so the dispatcher's exit waits a + // fixed budget regardless of what the caller's deadline says. + cleanupCtx, cancel := context.WithTimeout( + context.Background(), qwpQueryCleanupDrainTimeout) + defer cancel() + if oldIO := c.io(); oldIO != nil { + _ = oldIO.shutdown(cleanupCtx) + } + if oldTr := c.transport(); oldTr != nil { + _ = oldTr.close() + } + + // Demote the just-failed endpoint, then open a fresh round. Order + // is normative (failover.md §2.3): RecordMidStreamFailure must run + // BEFORE the round reset, else sticky-Healthy would preserve the + // just-failed host as the priority pick and hand it the first + // reconnect attempt again. 
RecordMidStreamFailure only demotes a + // still-Healthy slot and leaves `attempted` untouched; the + // subsequent BeginRound(forgetClassifications=false) clears the + // per-round bits but keeps topology classifications observed in + // prior Executes (wire-egress.md §11.9.2 "lazy forget"). The one + // fall-through BeginRound(true) lives inside connectWalk. n=1 + // degenerates cleanly: the lone host is demoted to TransportError, + // PickNext still returns it, and the walk retries the same host — + // the only candidate — instead of failing for lack of an + // alternative. + c.hostTracker.RecordMidStreamFailure(failedIdx) + c.hostTracker.BeginRound(false) + // Pass s.cancelCh so the walk short-circuits at endpoint + // boundaries when the user calls Cancel mid-failover. + // allowFallthroughReset=true: one BeginRound(true) re-sweep so a + // long-lived client recovers from a topology change (Java + // reconnectViaTracker parity). + result, err := connectWalk(ctx, c.cfg, c.hostTracker, s.cancelCh, true) + if err != nil { + return nil, err + } + if c.closed.Load() { + // Defensive: see the doc comment. connectWalk already spawned + // the new generation's I/O goroutines + WebSocket, so tear them + // down here rather than publish an orphan nothing will shut down. + _ = result.io.shutdown(cleanupCtx) + _ = result.transport.close() + return nil, errClosedDuringFailover + } + c.publishGeneration(result) + + // Allocate a fresh requestId for the replay attempt. Matches + // Java's nextRequestId++ on each executeOnce: the server treats + // each attempt as a distinct query (the prior server's request + // is now orphaned by the dropped connection). + newReqID := c.nextRequestId + c.nextRequestId++ + s.currentRequestId.Store(newReqID) + if err := s.submit(ctx); err != nil { + return nil, fmt.Errorf("qwp query: replay submit failed: %w", err) + } + // Re-issue the cancel if Cancel landed during the reconnect. 
+ // session.requestCancel reads (currentRequestId, c.io()) without + // a lock, so a Cancel racing this function can pick up either + // the OLD request_id paired with the NEW io (the window between + // publishGeneration and currentRequestId.Store above — the new + // dispatcher's top-of-loop CAS then clears the OLD id as a stale + // "prior-query" cancel) or the OLD request_id paired with the OLD + // io (the window before publishGeneration — the cancel atomic is + // set on a torn-down dispatcher that will never read it). In both + // cases the user's Cancel intent is silently dropped and the + // replay runs to completion. Re-issuing here against the now- + // stable (newReqID, c.io()) pair lands one CANCEL frame on the + // wire: the dispatcher either matches newReqID in its CAS loop + // (no clear) or picks it up via drainPendingCancel in + // receiveLoop. + if s.isCancelled() { + c.io().requestCancel(newReqID) + } + return result.serverInfo, nil +} + +// probeZstdAvailable allocates and immediately closes a zstd decoder +// so init-time failures (allocation pressure, bundled-library issues) +// surface synchronously at construction time. The Go port is simpler +// than Java's because klauspost/compress is pure Go — there is no +// native library to be missing. The probe still serves as a small +// sanity gate; newQwpQueryClient runs it before the dial, so a failure +// here never leaves a half-open WebSocket behind. +func probeZstdAvailable() error { + dec, err := zstd.NewReader(nil, zstd.WithDecoderConcurrency(1)) + if err != nil { + return fmt.Errorf("qwp query: zstd decoder init failed: %w", err) + } + dec.Close() + return nil +} + +// effectiveAuthorization computes the Authorization header value +// from the config, resolving the three mutually-exclusive auth modes +// into a single header string.
+func (c *qwpQueryClientConfig) effectiveAuthorization() string { + if c.authorization != "" { + return c.authorization + } + if c.httpUser != "" && c.httpPass != "" { + creds := c.httpUser + ":" + c.httpPass + return "Basic " + base64.StdEncoding.EncodeToString([]byte(creds)) + } + if c.httpToken != "" { + return "Bearer " + c.httpToken + } + return "" +} + +// Close shuts down the I/O goroutines, sends a WebSocket close frame, +// and releases the underlying connection. Safe to call more than +// once; subsequent calls return nil. Safe to call from a goroutine +// other than the one driving Query/Exec, including while a Batches() +// iteration or Exec() is mid transparent-failover reconnect. +// +// Calling Close while a *QwpQuery.Batches() loop body is still using +// the batch's aliased []byte slices is undefined: the transport may +// free buffers the caller is still reading. The right way to unblock +// an in-flight iterator from another goroutine is Cancel (or cancel +// the Query/Exec context); Close then races at most the generation +// teardown, never the buffer aliasing. +func (c *QwpQueryClient) Close(ctx context.Context) error { + var firstErr error + c.closeOnce.Do(func() { + // Set closed and snapshot the bound (io, transport) pair under + // genMu. This is what makes Close safe against a concurrent + // reconnectAndReplay: it holds genMu across its whole destroy- + // old / build-new / publish swap, so under the lock we observe + // exactly one consistent generation — never a torn pair half- + // way through publishGeneration — and reconnectAndReplay + // observes our closed flag and self-tears-down (or skips + // building) any generation we are not the one tearing down. + // See reconnectAndReplay's doc for the full interleaving table. + // The shutdown/close run after Unlock so genMu is never held + // across a user-facing wait. 
+ c.genMu.Lock() + c.closed.Store(true) + io := c.io() + tr := c.transport() + c.genMu.Unlock() + + if io != nil { + if err := io.shutdown(ctx); err != nil { + firstErr = err + } + } + if tr != nil { + if err := tr.close(); err != nil && firstErr == nil { + firstErr = err + } + } + }) + return firstErr +} + +// Query submits a SELECT-style statement and returns a cursor over its +// result batches. The server-side execution begins immediately; the +// cursor drains events lazily as the caller ranges over Batches(). +// +// Per-call options are supplied via the variadic opts list — see +// WithQwpQueryBinds for attaching typed bind parameters. Repeating the +// same SQL text across calls hits the server's SQL-text-keyed factory +// cache. Interpolating values into the SQL string defeats that reuse; +// use WithQwpQueryBinds instead. +// +// Query never returns an error directly: any failure raised at submit +// time (closed client, bind setter error, ctx-cancelled submit) is +// latched on the returned *QwpQuery and yielded as the first element of +// Batches(). Callers MUST iterate Batches() to observe submit failures; +// dropping the cursor without ranging it discards the latched error +// silently. Use Exec for statements where the synchronous error +// signature is more natural. +// +// The error for a wrong statement kind also surfaces through the first +// Batches() yield: if the server sends EXEC_DONE (non-SELECT +// statement), the iterator yields (nil, error) and terminates. Use +// Exec for statements that do not produce a result set. +// +// Breaking out of the range loop early sends a CANCEL frame to the +// server and drains the remaining events until a terminal frame +// arrives. Always defer (*QwpQuery).Close() to guarantee cleanup on +// any path.
+func (c *QwpQueryClient) Query(ctx context.Context, sql string, opts ...QwpQueryOption) *QwpQuery { + q := &QwpQuery{ + client: c, + ctx: ctx, + sql: sql, + } + if c.closed.Load() { + q.pendingErr = errors.New("qwp query: client is closed") + q.state.Store(qwpQueryStateDone) + return q + } + req, err := c.buildRequest(sql, opts) + if err != nil { + q.pendingErr = err + q.state.Store(qwpQueryStateDone) + return q + } + q.requestId = req.requestId + // SELECT is idempotent: transparent reconnect-and-replay on a + // transport drop is always safe, so the session is replayable + // regardless of replay_exec (which only governs Exec). + q.session = newQwpQuerySession(c, req, true) + if err := q.session.submit(ctx); err != nil { + q.pendingErr = err + q.state.Store(qwpQueryStateDone) + } + return q +} + +// Exec runs a non-SELECT statement (DDL / INSERT / UPDATE / ...) and +// blocks until the server returns EXEC_DONE or a terminal error. On +// success returns the ExecResult (op type + rows affected). On a +// QUERY_ERROR frame the returned error is a *QwpQueryError; on a +// transport or decode failure it is a plain error. +// +// Per-call options are supplied via the variadic opts list — see +// WithQwpQueryBinds for attaching typed bind parameters. +// +// Calling Exec on a SELECT statement returns an error — SELECT sends +// RESULT_BATCH + RESULT_END, which Exec does not expect. Use Query +// for SELECTs. +func (c *QwpQueryClient) Exec(ctx context.Context, sql string, opts ...QwpQueryOption) (ExecResult, error) { + if c.closed.Load() { + return ExecResult{}, errors.New("qwp query: client is closed") + } + req, err := c.buildRequest(sql, opts) + if err != nil { + return ExecResult{}, err + } + reqId := req.requestId + + // Exec replays on a transport drop only when the caller opted in + // via replay_exec=on. 
Default off: a non-idempotent statement the + // server may already have applied must not be silently re-executed + // on the reconnect — nextEvent surfaces the raw transport error + // instead (see qwpQuerySession.replayable). + session := newQwpQuerySession(c, req, c.cfg.replayExec) + if err := session.submit(ctx); err != nil { + return ExecResult{}, err + } + + for { + ev, err := session.nextEvent(ctx) + if err != nil { + // ctx expired or I/O terminated before we saw a terminal + // event. Cancel + drain on a cleanup ctx so the dispatcher + // returns to idle; otherwise the next Query/Exec on this + // client blocks on the single-slot requests channel. + // Route through the session so cancel targets the live + // generation's request_id even after a transparent failover + // reconnect (where the session's currentRequestId diverges + // from reqId). + session.requestCancel() + cleanupCtx, cleanupCancel := context.WithTimeout( + context.Background(), qwpQueryCleanupDrainTimeout) + _ = drainUntilTerminal(cleanupCtx, c.io()) + cleanupCancel() + return ExecResult{}, err + } + switch ev.kind { + case qwpEventKindExecDone: + return ev.execResult, nil + case qwpEventKindError: + return ExecResult{}, eventToError(ev, reqId) + case qwpEventKindTransportError: + // The session has already exhausted its replay budget (or + // failover was disabled). Surface the underlying transport + // error so callers can errors.Is / errors.As against the + // cause without picking up *QwpQueryError (which carries + // server-status bytes that are meaningless for client- + // side faults). 
+ return ExecResult{}, transportEventError(ev) + case qwpEventKindFailoverReset: + // Only reachable when this Exec opted into replay + // (replay_exec=on): the session passes c.cfg.replayExec as + // its replayable flag, and nextEvent emits this event only + // for a replayable session — a non-idempotent Exec with + // replay_exec=off is short-circuited to a raw transport + // error before any reconnect, so it never double-executes. + // Here the session already reconnected and resubmitted + // transparently; the reset is informational. Consume the + // new generation's terminal event on the next iteration. + case qwpEventKindBatch: + // Server streamed a result batch for what we asked for as + // an exec. Release the buffer, send a CANCEL so the + // server stops streaming the rest of the result set, and + // drain to a terminal frame on a cleanup-bounded context + // so the dispatcher returns to idle regardless of the + // caller's ctx. Then surface the type-mismatch. Cancel + // routes through the session so it targets the live + // generation's request_id even after a transparent + // failover reconnect. + c.io().releaseBuffer(ev.batch) + session.requestCancel() + cleanupCtx, cancel := context.WithTimeout( + context.Background(), qwpQueryCleanupDrainTimeout) + _ = drainUntilTerminal(cleanupCtx, c.io()) + cancel() + return ExecResult{}, fmt.Errorf( + "qwp query: Exec called on a SELECT-style statement; use Query instead") + case qwpEventKindEnd: + // Bare RESULT_END with no preceding RESULT_BATCH — same + // misuse as above (user ran a SELECT via Exec). + return ExecResult{}, fmt.Errorf( + "qwp query: Exec called on a SELECT-style statement; use Query instead") + default: + return ExecResult{}, fmt.Errorf("qwp query: unexpected event kind %d", ev.kind) + } + } +} + +// buildRequest assembles the qwpRequest for a Query / Exec call. 
The +// bind setter runs on the caller's goroutine against the client's +// reusable QwpBinds scratch; the encoded bytes are then copied into a +// fresh per-request slice so the dispatcher's read of bindPayload is +// always against a request-owned buffer, independent of what the +// caller does with the scratch afterwards. +func (c *QwpQueryClient) buildRequest(sql string, opts []QwpQueryOption) (qwpRequest, error) { + if len(sql) > qwpMaxSqlTextBytes { + return qwpRequest{}, fmt.Errorf( + "qwp query: SQL text length %d exceeds %d-byte limit", + len(sql), qwpMaxSqlTextBytes) + } + var settings qwpQueryOptions + for _, opt := range opts { + opt(&settings) + } + c.binds.reset() + if settings.bindFn != nil { + settings.bindFn(&c.binds) + if err := c.binds.Err(); err != nil { + return qwpRequest{}, err + } + } + var bindPayload []byte + if src := c.binds.bufferBytes(); len(src) > 0 { + bindPayload = append([]byte(nil), src...) + } + reqId := c.nextRequestId + c.nextRequestId++ + return qwpRequest{ + sql: sql, + requestId: reqId, + initialCredit: c.cfg.initialCredit, + bindCount: c.binds.Count(), + bindPayload: bindPayload, + }, nil +} + +// drainUntilTerminal reads and discards events until a terminal one +// (End / ExecDone / Error / TransportError) arrives. Releases any +// batch buffers along the way. Returns a transport/context error if +// takeEvent fails. Includes TransportError because a poisoned +// connection's pending events will be one of these — looping past +// would block forever waiting for an End the I/O goroutine will +// never emit. 
+func drainUntilTerminal(ctx context.Context, io *qwpEgressIO) error { + for { + ev, err := io.takeEvent(ctx) + if err != nil { + return err + } + switch ev.kind { + case qwpEventKindBatch: + io.releaseBuffer(ev.batch) + case qwpEventKindEnd, qwpEventKindExecDone, qwpEventKindError, + qwpEventKindTransportError: + return nil + } + } +} + +// eventToError converts a qwpEventKindError event into the most +// specific Go error type available. Server-sent QUERY_ERROR frames +// (status > 0) become *QwpQueryError; synthesized client-side errors +// (status == 0, set by emitError) stay as plain errors. +func eventToError(ev qwpEvent, reqId int64) error { + if ev.errStatus != 0 { + id := ev.requestId + if id == 0 { + id = reqId + } + return &QwpQueryError{ + RequestId: id, + Status: ev.errStatus, + Message: ev.errMessage, + } + } + if ev.errMessage != "" { + return errors.New(ev.errMessage) + } + return errors.New("qwp query: unspecified error") +} + +// transportEventError converts a qwpEventKindTransportError into a +// caller-facing error. When transportErr is set (failover orchestrator +// path), wraps with %w so errors.As can match the underlying typed +// cause (e.g. *QwpRoleMismatchError from a failed reconnect walk). +// Falls back to a plain string-formatted error for I/O-goroutine +// emissions that only carry errMessage. +func transportEventError(ev qwpEvent) error { + if ev.transportErr != nil { + return fmt.Errorf("qwp query: %w", ev.transportErr) + } + return fmt.Errorf("qwp query: %s", ev.errMessage) +} + +// Query lifecycle states. Transitions are linear: Idle → Iterating → +// Done, or Idle → Done (if Close runs before Batches is entered, or +// submit failed so the query is Done from construction). Coordination +// between Close and Batches is done via CAS on this state — see the +// per-method comments for the exact handshake. 
+const (
+	qwpQueryStateIdle      int32 = iota // neither Batches nor Close has claimed the cursor yet
+	qwpQueryStateIterating              // a Batches iteration holds the cursor and owns cleanup
+	qwpQueryStateDone                   // terminal: stored when Batches returns or when Close claims cleanup
+)
+
+// QwpQuery is a streaming cursor over a SELECT result set returned by
+// (*QwpQueryClient).Query. It is single-use: once the range over
+// Batches() terminates (by End, Error, or break), the cursor is done
+// and must not be iterated again.
+//
+// Thread safety: Batches and the buffers it yields are single-consumer
+// — do not share the cursor across goroutines. Cancel is safe to call
+// from other goroutines at any time. Close is safe to call from other
+// goroutines too, but is a no-op while a Batches iteration is in
+// flight: the iterator runs its own cancel+drain on every exit path,
+// so a concurrent Close would only race it for the dispatcher's
+// single terminal event. To unblock a hung iterator from another
+// goroutine, use Cancel (or cancel the context passed to Query).
+type QwpQuery struct {
+	// client is the owning query client; the cursor reaches the I/O
+	// dispatcher through client.io() to release buffers, request
+	// cancels, and drain.
+	client *QwpQueryClient
+	// ctx is the context Batches passes to session.nextEvent while
+	// waiting for each event. Cleanup drains deliberately do not
+	// use it (see cancelAndDrainOnCleanupCtx).
+	ctx context.Context
+	// sql is presumably the statement text passed to Query; it is
+	// not read by the methods in this part of the file.
+	sql string
+
+	// session orchestrates submission and event consumption,
+	// including transparent reconnect-and-replay on transport-
+	// terminal failure. Owns the in-flight requestId across replays;
+	// the requestId field below is the *initial* attempt's id and is
+	// used only for diagnostics (RequestId accessor).
+	session *qwpQuerySession
+
+	// requestId is the initial (first-attempt) client-assigned id.
+	// Surfaced via RequestId for log correlation; on replay the
+	// session's currentRequestId diverges. Cancel routes through the
+	// session so it always targets the live generation.
+	requestId int64
+
+	// totalRows is set when a RESULT_END frame arrives. Read via
+	// TotalRows(). Default 0 on a query that never reached End
+	// (cancelled, errored, or still running). Atomic because the
+	// iterator goroutine in Batches() writes it while a sibling
+	// goroutine (e.g. cancel/observer) may call TotalRows().
+	totalRows atomic.Int64
+
+	// pendingErr holds an error surfaced at submit time (closed
+	// client, submit blocked on ctx cancel). Yielded on the first
+	// iteration of Batches() so callers discover it naturally.
+	// Not atomic: it is read and cleared only by Batches, which is
+	// single-consumer.
+	pendingErr error
+
+	// state is the lifecycle phase (see qwpQueryState* constants).
+	// Batches() enters via CAS(Idle→Iterating); Close() takes
+	// ownership of cleanup only via CAS(Idle→Done). Either defer
+	// flips to Done on exit. A failed CAS in Close means an iterator
+	// is active (and will clean up itself) or the query is already
+	// done — both cases are no-ops.
+	state atomic.Int32
+
+	// cancelled records whether Cancel() has been invoked. Used to
+	// avoid emitting a synthesized "cancelled by caller" error on top
+	// of the server's QUERY_ERROR(status=CANCELLED) echo. The same
+	// flag guarantees the CANCEL request is issued at most once,
+	// whether it originates from Cancel or from the cleanup drain.
+	cancelled atomic.Bool
+}
+
+// Batches returns a range-over-func iterator that yields each
+// RESULT_BATCH frame as a *QwpColumnBatch, along with an optional
+// error. The iterator terminates on RESULT_END (clean end), a
+// QUERY_ERROR from the server (yielded as the last element's error),
+// a transport/decode failure (same), or the caller breaking out of
+// the range loop (sends CANCEL to the server, drains remaining
+// events).
+//
+// Not every non-nil error is terminal: after a successful transparent
+// reconnect-and-replay the iterator yields (nil, failover-reset error)
+// and then keeps iterating over the replayed result, so callers that
+// accumulate rows should detect that value and discard what they have
+// collected so far. Running a non-SELECT statement through Query
+// yields a single error telling the caller to use Exec.
+//
+// The yielded *QwpColumnBatch is only valid inside the body of the
+// current iteration — its slices alias the pool-owned decode buffer
+// and will be reused for the next batch. Use batch.CopyAll() to
+// retain data across iterations.
+func (q *QwpQuery) Batches() iter.Seq2[*QwpColumnBatch, error] {
+	return func(yield func(*QwpColumnBatch, error) bool) {
+		// CAS Idle→Iterating grabs the iteration slot and also locks
+		// out a concurrent Close from running its own drain. On
+		// failure the query is already Done (either Close won the
+		// race, a prior iteration ran, or submit failed) — surface
+		// pendingErr once and stop.
+		if !q.state.CompareAndSwap(qwpQueryStateIdle, qwpQueryStateIterating) {
+			if q.pendingErr != nil {
+				yield(nil, q.pendingErr)
+				q.pendingErr = nil
+			}
+			return
+		}
+		defer q.state.Store(qwpQueryStateDone)
+
+		for {
+			ev, err := q.session.nextEvent(q.ctx)
+			if err != nil {
+				// nextEvent returned an error before a terminal
+				// event was delivered (most often q.ctx expired
+				// while we were waiting on the server). The
+				// dispatcher is still parked in receiveLoop for
+				// this query, so cancel + drain on a cleanup ctx
+				// before returning — symmetrical to the !keepGoing
+				// break-out below. The caller's deferred Close()
+				// sees state=Done (flipped by the defer on this
+				// function) and becomes a no-op; without this
+				// drain the dispatcher would stay stuck and strand
+				// the client for follow-up Query/Exec.
+				//
+				// NOTE(review): if the caller's loop body panics
+				// inside this yield, the drain on the next line is
+				// skipped; only the batch path below guards against
+				// that. Confirm whether this path needs the same
+				// protection.
+				yield(nil, err)
+				q.cancelAndDrainOnCleanupCtx()
+				return
+			}
+			switch ev.kind {
+			case qwpEventKindBatch:
+				keepGoing := false
+				func() {
+					// Release the buffer even if the caller's yield
+					// body panics. Without this, a single panic with
+					// bufferPoolSize=1 permanently starves the pool,
+					// and the dispatcher — still parked in receiveLoop
+					// for this query — blocks the next Query/Exec.
+					// On panic we also run the cancel+drain before
+					// rethrowing: as the panic unwinds, the outer
+					// `defer q.state.Store(Done)` flips the state
+					// before any deferred q.Close() in the caller's
+					// frame runs, so that Close would otherwise be a
+					// no-op and leave the dispatcher stranded.
+					defer func() {
+						q.client.io().releaseBuffer(ev.batch)
+						if r := recover(); r != nil {
+							q.cancelAndDrainOnCleanupCtx()
+							panic(r)
+						}
+					}()
+					keepGoing = yield(&ev.batch.batch, nil)
+				}()
+				if !keepGoing {
+					// User broke out — request cancel and drain the
+					// remaining events until a terminal frame so the
+					// dispatcher returns to idle and the next query
+					// can submit cleanly. Drain uses a bounded cleanup
+					// ctx independent of q.ctx because a common reason
+					// to break out is exactly that q.ctx has expired.
+					q.cancelAndDrainOnCleanupCtx()
+					return
+				}
+			case qwpEventKindEnd:
+				// Clean end of the result set: record the reported
+				// row count for TotalRows() and stop without
+				// yielding anything further.
+				q.totalRows.Store(ev.totalRows)
+				return
+			case qwpEventKindError:
+				// A server-sent cancellation echo (status=Cancelled)
+				// in response to our own Cancel call is not an error
+				// the caller needs to see — yielding it would make a
+				// clean "I broke out of the loop" look like a
+				// failure. Swallow that one case.
+				if q.cancelled.Load() && ev.errStatus == qwpStatusCancelled {
+					return
+				}
+				yield(nil, eventToError(ev, q.requestId))
+				return
+			case qwpEventKindTransportError:
+				// Synthesized client-side transport-terminal failure
+				// — the connection is poisoned and cannot serve more
+				// frames. Surface as a plain error; the session
+				// orchestrator (qwp_query_failover.go) intercepts
+				// this case before it reaches Batches when failover
+				// is enabled and replay succeeds.
+				yield(nil, transportEventError(ev))
+				return
+			case qwpEventKindFailoverReset:
+				// Emitted by the session orchestrator after a
+				// successful reconnect-and-replay. Yield as a
+				// non-fatal error so the caller can detect via
+				// errors.As and discard accumulated state, then
+				// continue iterating to consume the new generation's
+				// batches. ev.failoverReset is always non-nil for
+				// this kind.
+				//
+				// NOTE(review): as on the nextEvent-error path, a
+				// panic inside this yield skips the cancel+drain.
+				if !yield(nil, ev.failoverReset) {
+					q.cancelAndDrainOnCleanupCtx()
+					return
+				}
+			case qwpEventKindExecDone:
+				// Wrong statement kind: user ran a non-SELECT via
+				// Query. Surface an error so they can switch to
+				// Exec. (The error is a plain fmt.Errorf value, not
+				// a dedicated error type, so callers can only match
+				// it by message.)
+				yield(nil, fmt.Errorf(
+					"qwp query: Query called on a non-SELECT statement; use Exec instead"))
+				return
+			default:
+				yield(nil, fmt.Errorf("qwp query: unexpected event kind %d", ev.kind))
+				return
+			}
+		}
+	}
+}
+
+// TotalRows returns the server-reported total-row count from the
+// RESULT_END frame, or 0 if the query did not reach End (cancelled,
+// errored, or still running). Safe to call from any goroutine.
+func (q *QwpQuery) TotalRows() int64 {
+	return q.totalRows.Load()
+}
+
+// RequestId returns the client-assigned id for this query. Exposed
+// mainly for test instrumentation and cross-correlating logs with
+// server-side request ids. The value is the plain requestId field;
+// it is not updated by anything in this part of the file.
+func (q *QwpQuery) RequestId() int64 {
+	return q.requestId
+}
+
+// Cancel asks the server to abort the current query. Safe to call
+// from any goroutine, including before the first Batches() iteration
+// or while another goroutine is ranging over Batches(). A no-op if
+// the query has already reached a terminal state, and a no-op on
+// every call after the first (the cancelled flag is set exactly once).
+//
+// The cancel is asynchronous: Batches() keeps yielding whatever the
+// server has already buffered before it reacts to the CANCEL. The
+// server eventually responds with QUERY_ERROR(status=CANCELLED),
+// which Batches() swallows silently so a caller-initiated Cancel
+// produces a clean end of iteration.
+func (q *QwpQuery) Cancel() {
+	if q.state.Load() == qwpQueryStateDone {
+		return
+	}
+	// The CAS makes the CANCEL request one-shot even when Cancel
+	// races the iterator's own cleanup, which flips the same flag.
+	if q.cancelled.CompareAndSwap(false, true) {
+		// Route through the session so cancel targets the live
+		// generation's request_id even after a transparent failover
+		// reconnect (where the session's currentRequestId diverges
+		// from q.requestId).
+		if q.session != nil {
+			q.session.requestCancel()
+		} else {
+			// No session attached: fall back to cancelling the
+			// initial request id directly on the I/O dispatcher.
+			q.client.io().requestCancel(q.requestId)
+		}
+	}
+}
+
+// Close finalizes the cursor. Drains any pending events to a
+// terminal frame so the underlying I/O dispatcher returns to idle —
+// required before the next Query or Exec on the same client. Safe
+// to defer even on already-finished queries; the second call is a
+// no-op.
+//
+// Close is also a no-op while a Batches() iteration is in flight on
+// another goroutine: the iterator performs its own cancel+drain on
+// every exit path, and a concurrent Close would only race it for the
+// dispatcher's single terminal event. Use Cancel (or cancel q.ctx)
+// to unblock an in-flight iterator from another goroutine.
+//
+// Does not close the client itself. Call (*QwpQueryClient).Close
+// to release the underlying WebSocket connection.
+func (q *QwpQuery) Close() {
+	// CAS Idle→Done claims exclusive cleanup ownership. Failure means
+	// either a Batches() iteration is running (state=Iterating — it
+	// will clean up on exit) or the cursor is already Done (prior
+	// iteration, Close, or submit failure). Both are no-ops here.
+	if !q.state.CompareAndSwap(qwpQueryStateIdle, qwpQueryStateDone) {
+		return
+	}
+	q.cancelAndDrainOnCleanupCtx()
+}
+
+// cancelAndDrainOnCleanupCtx sends a CANCEL for this query's
+// requestId (unless one is already in flight) and drains pending
+// events until a terminal frame arrives, so the dispatcher returns
+// to idle regardless of q.ctx's state. Uses a fresh bounded context
+// because every caller either runs after q.ctx has already been
+// observed done (iterator break-out, takeEvent-error) or inside a
+// user-driven Close which has no meaningful ctx of its own.
+func (q *QwpQuery) cancelAndDrainOnCleanupCtx() {
+	// One-shot CANCEL: the flag is shared with Cancel(), so whichever
+	// of the two runs first issues the request and the other skips it.
+	if q.cancelled.CompareAndSwap(false, true) {
+		if q.session != nil {
+			q.session.requestCancel()
+		} else {
+			q.client.io().requestCancel(q.requestId)
+		}
+	}
+	// The drain is bounded by qwpQueryCleanupDrainTimeout and rooted
+	// in context.Background(), so it does not depend on q.ctx.
+	cleanupCtx, cancel := context.WithTimeout(
+		context.Background(), qwpQueryCleanupDrainTimeout)
+	defer cancel()
+	// Best-effort: the drain result is deliberately discarded because
+	// this method returns nothing to report it through.
+	_ = drainUntilTerminal(cleanupCtx, q.client.io())
+}
diff --git a/qwp_query_client_test.go b/qwp_query_client_test.go
new file mode 100644
index 00000000..33b3fa10
--- /dev/null
+++ b/qwp_query_client_test.go
@@ -0,0 +1,2172 @@
+/*+*****************************************************************************
+ *     ___                  _   ____  ____
+ *    / _ \ _   _  ___  ___| |_|  _ \| __ )
+ *   | | | | | | |/ _ \/ __| __| | | |  _ \
+ *   | |_| | |_| |  __/\__ \ |_| |_| | |_) |
+ *    \__\_\\__,_|\___||___/\__|____/|____/
+ *
+ *  Copyright (c) 2014-2019 Appsicle
+ *  Copyright (c) 2019-2026 QuestDB
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ ******************************************************************************/
+
+package questdb
+
+import (
+	"bytes"
+	"context"
+	"encoding/base64"
+	"encoding/binary"
+	"errors"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/coder/websocket"
+)
+
+// --- QwpQueryClientFromConf parse tests ---
+
+// TestQwpQueryClientFromConfHappyPath parses a table of valid connect
+// strings with parseQwpQueryConf and hands each resulting config to a
+// per-case chk callback that asserts the fields that case cares about
+// (address, path, TLS mode, auth header, compression header, ...).
+func TestQwpQueryClientFromConfHappyPath(t *testing.T) {
+	cases := []struct {
+		name string
+		conf string
+		chk  func(t *testing.T, c *qwpQueryClientConfig)
+	}{
+		{
+			name: "minimal_ws",
+			conf: "ws::addr=localhost:9000;",
+			chk: func(t *testing.T, c *qwpQueryClientConfig) {
+				if got := c.addressString(); got != "localhost:9000" {
+					t.Errorf("addressString=%q", got)
+				}
+				if c.endpointPath != qwpReadPath {
+					t.Errorf("endpointPath=%q", c.endpointPath)
+				}
+				if c.tlsMode != tlsDisabled {
+					t.Errorf("tlsMode=%v", c.tlsMode)
+				}
+				if c.bufferPoolSize != qwpDefaultEgressBufferPoolSize {
+					t.Errorf("bufferPoolSize=%d", c.bufferPoolSize)
+				}
+				// zone defaults to unset (zone-blind) and
+				// auth_timeout_ms to the shared 15s default so a
+				// connect string omitting them behaves like the
+				// ingest client.
+				if c.zone != "" {
+					t.Errorf("zone=%q, want empty (zone-blind default)", c.zone)
+				}
+				if c.authTimeoutMs != qwpDefaultAuthTimeoutMs {
+					t.Errorf("authTimeoutMs=%d, want %d",
+						c.authTimeoutMs, qwpDefaultAuthTimeoutMs)
+				}
+			},
+		},
+		{
+			// failover.md §1.1 common keys: a connect string shared
+			// verbatim with the ingest client must parse here too
+			// (the ingest side accepts both; the query side is where
+			// zone= is actually effective).
+			name: "zone_and_auth_timeout",
+			conf: "ws::addr=db.example:9000;zone=eu-west-1a;" +
+				"auth_timeout_ms=2500;target=replica;",
+			chk: func(t *testing.T, c *qwpQueryClientConfig) {
+				if c.zone != "eu-west-1a" {
+					t.Errorf("zone=%q, want eu-west-1a", c.zone)
+				}
+				if c.authTimeoutMs != 2500 {
+					t.Errorf("authTimeoutMs=%d, want 2500", c.authTimeoutMs)
+				}
+				if c.target != qwpTargetReplica {
+					t.Errorf("target=%v, want replica", c.target)
+				}
+			},
+		},
+		{
+			name: "wss_enables_tls",
+			conf: "wss::addr=db.example:9000;",
+			chk: func(t *testing.T, c *qwpQueryClientConfig) {
+				if c.tlsMode != tlsEnabled {
+					t.Errorf("tlsMode=%v, want tlsEnabled", c.tlsMode)
+				}
+			},
+		},
+		{
+			name: "all_keys",
+			conf: "wss::addr=db.example:9443;path=/read/v2;" +
+				"username=bob;password=hunter2;" +
+				"client_id=dashboard/1.0;" +
+				"buffer_pool_size=8;max_batch_rows=50000;" +
+				"initial_credit=131072;" +
+				"tls_verify=unsafe_off;",
+			chk: func(t *testing.T, c *qwpQueryClientConfig) {
+				if got := c.addressString(); got != "db.example:9443" {
+					t.Errorf("addressString=%q", got)
+				}
+				if c.endpointPath != "/read/v2" {
+					t.Errorf("endpointPath=%q", c.endpointPath)
+				}
+				if c.httpUser != "bob" || c.httpPass != "hunter2" {
+					t.Errorf("basic auth user/pass = %q/%q", c.httpUser, c.httpPass)
+				}
+				if c.clientID != "dashboard/1.0" {
+					t.Errorf("clientID=%q", c.clientID)
+				}
+				if c.bufferPoolSize != 8 {
+					t.Errorf("bufferPoolSize=%d", c.bufferPoolSize)
+				}
+				if c.maxBatchRows != 50000 {
+					t.Errorf("maxBatchRows=%d", c.maxBatchRows)
+				}
+				if c.initialCredit != 131072 {
+					t.Errorf("initialCredit=%d", c.initialCredit)
+				}
+				if c.tlsMode != tlsInsecureSkipVerify {
+					t.Errorf("tlsMode=%v, want insecureSkipVerify", c.tlsMode)
+				}
+			},
+		},
+		{
+			name: "auth_header",
+			conf: "ws::addr=a:1;auth=Bearer abc;",
+			chk: func(t *testing.T, c *qwpQueryClientConfig) {
+				if c.authorization != "Bearer abc" {
+					t.Errorf("authorization=%q", c.authorization)
+				}
+				if got := c.effectiveAuthorization(); got != "Bearer abc" {
+					t.Errorf("effectiveAuthorization=%q", got)
+				}
+			},
+		},
+		{
+			name: "bearer_token",
+			conf: "ws::addr=a:1;token=xyz;",
+			chk: func(t *testing.T, c *qwpQueryClientConfig) {
+				if c.httpToken != "xyz" {
+					t.Errorf("httpToken=%q", c.httpToken)
+				}
+				if got := c.effectiveAuthorization(); got != "Bearer xyz" {
+					t.Errorf("effectiveAuthorization=%q", got)
+				}
+			},
+		},
+		{
+			name: "basic_auth_encoded",
+			conf: "ws::addr=a:1;username=u;password=p;",
+			chk: func(t *testing.T, c *qwpQueryClientConfig) {
+				want := "Basic " + base64.StdEncoding.EncodeToString([]byte("u:p"))
+				if got := c.effectiveAuthorization(); got != want {
+					t.Errorf("effectiveAuthorization=%q, want %q", got, want)
+				}
+			},
+		},
+		{
+			name: "compression_default_is_raw",
+			conf: "ws::addr=a:1;",
+			chk: func(t *testing.T, c *qwpQueryClientConfig) {
+				if c.compression != qwpCompressionRaw {
+					t.Errorf("compression=%q, want raw", c.compression)
+				}
+				if c.compressionLevel != qwpDefaultCompressionLevel {
+					t.Errorf("compressionLevel=%d, want %d",
+						c.compressionLevel, qwpDefaultCompressionLevel)
+				}
+				if got := c.buildAcceptEncodingHeader(); got != "" {
+					t.Errorf("accept-encoding header=%q, want empty (raw)", got)
+				}
+			},
+		},
+		{
+			name: "compression_zstd_builds_header",
+			conf: "ws::addr=a:1;compression=zstd;compression_level=7;",
+			chk: func(t *testing.T, c *qwpQueryClientConfig) {
+				if c.compression != qwpCompressionZstd {
+					t.Errorf("compression=%q, want zstd", c.compression)
+				}
+				if c.compressionLevel != 7 {
+					t.Errorf("compressionLevel=%d, want 7", c.compressionLevel)
+				}
+				if got := c.buildAcceptEncodingHeader(); got != "zstd;level=7,raw" {
+					t.Errorf("accept-encoding=%q, want %q",
+						got, "zstd;level=7,raw")
+				}
+			},
+		},
+		{
+			name: "compression_auto_also_advertises_zstd",
+			conf: "ws::addr=a:1;compression=auto;",
+			chk: func(t *testing.T, c *qwpQueryClientConfig) {
+				if c.compression != qwpCompressionAuto {
+					t.Errorf("compression=%q, want auto", c.compression)
+				}
+				// "auto" advertises the same header value as "zstd";
+				// the server picks. Level defaults to 1
+				// (qwpDefaultCompressionLevel; Sender.java parity and
+				// connect-string.md §Query client keys).
+				if got := c.buildAcceptEncodingHeader(); got != "zstd;level=1,raw" {
+					t.Errorf("accept-encoding=%q, want %q",
+						got, "zstd;level=1,raw")
+				}
+			},
+		},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			c, err := parseQwpQueryConf(tc.conf)
+			if err != nil {
+				t.Fatalf("parse: %v", err)
+			}
+			tc.chk(t, c)
+		})
+	}
+}
+
+// TestQwpQueryClientFromConfErrors feeds malformed connect strings to
+// parseQwpQueryConf and requires each to fail with an error whose
+// message contains the case's wantSub substring. Each row is
+// {name, conf, wantSub}.
+func TestQwpQueryClientFromConfErrors(t *testing.T) {
+	cases := []struct {
+		name    string
+		conf    string
+		wantSub string
+	}{
+		{"bad_schema", "http::addr=a:1;", "invalid schema"},
+		{"bad_buffer_pool", "ws::addr=a:1;buffer_pool_size=abc;", "invalid buffer_pool_size"},
+		{"buffer_pool_zero", "ws::addr=a:1;buffer_pool_size=0;", "buffer pool size must be >= 1"},
+		{"max_batch_rows_negative", "ws::addr=a:1;max_batch_rows=-1;", "max batch rows must be >= 0"},
+		{"max_batch_rows_too_big", "ws::addr=a:1;max_batch_rows=99999999;", "exceeds client cap"},
+		{"mutually_exclusive_auth", "ws::addr=a:1;auth=X;token=Y;", "mutually exclusive"},
+		{"basic_missing_password", "ws::addr=a:1;username=u;", "both username and password"},
+		{"unknown_key", "ws::addr=a:1;weird=1;", "unsupported option"},
+		{"tls_on_ws", "ws::addr=a:1;tls_verify=on;", "tls_verify requires"},
+		{"tls_bad", "wss::addr=a:1;tls_verify=off;", "invalid tls_verify"},
+		{"tls_roots_rejected", "wss::addr=a:1;tls_roots=/tmp/foo;", "tls_roots is not available"},
+		{"compression_unsupported_value", "ws::addr=a:1;compression=lzma;", "invalid compression"},
+		{"compression_level_non_numeric", "ws::addr=a:1;compression=zstd;compression_level=seven;", "invalid compression_level"},
+		{"compression_level_too_low", "ws::addr=a:1;compression=zstd;compression_level=0;", "compression level must be in [1, 22]"},
+		{"compression_level_too_high", "ws::addr=a:1;compression=zstd;compression_level=23;", "compression level must be in [1, 22]"},
+		{"server_info_timeout_zero", "ws::addr=a:1;server_info_timeout_ms=0;", "server_info_timeout_ms must be > 0"},
+		{"server_info_timeout_negative", "ws::addr=a:1;server_info_timeout_ms=-1;", "server_info_timeout_ms must be > 0"},
+		{"failover_max_duration_negative", "ws::addr=a:1;failover_max_duration_ms=-1;", "failover_max_duration_ms must be >= 0"},
+		{"failover_max_duration_non_numeric", "ws::addr=a:1;failover_max_duration_ms=soon;", "invalid failover_max_duration_ms"},
+		{"auth_timeout_non_numeric", "ws::addr=a:1;auth_timeout_ms=soon;", "invalid auth_timeout_ms"},
+		{"auth_timeout_zero", "ws::addr=a:1;auth_timeout_ms=0;", "auth_timeout_ms must be > 0"},
+		{"auth_timeout_negative", "ws::addr=a:1;auth_timeout_ms=-1;", "auth_timeout_ms must be > 0"},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			_, err := parseQwpQueryConf(tc.conf)
+			if err == nil {
+				t.Fatalf("expected error for %q", tc.conf)
+			}
+			if !strings.Contains(err.Error(), tc.wantSub) {
+				t.Errorf("err=%v, want substring %q", err, tc.wantSub)
+			}
+		})
+	}
+}
+
+// TestQwpQueryClientFromConfPortBoundaries pins the addr= port-range
+// validation: ports outside [1, 65535] and non-numeric ports are
+// rejected at parse time so the user sees an actionable error rather
+// than an opaque dial failure later. Ports of 1 and 65535 are accepted.
+// Mirrors the Java QwpQueryClientFromConfigTest port-boundary tests.
+func TestQwpQueryClientFromConfPortBoundaries(t *testing.T) {
+	t.Run("Reject", func(t *testing.T) {
+		// Each rejected connect string doubles as its subtest name.
+		cases := []struct {
+			conf    string
+			wantSub string
+		}{
+			{"ws::addr=db:0;", "out of range"},
+			{"ws::addr=db:-1;", "out of range"},
+			{"ws::addr=db:65536;", "out of range"},
+			{"ws::addr=db:2147483647;", "out of range"},
+			{"ws::addr=host:abc;", "invalid port"},
+		}
+		for _, tc := range cases {
+			t.Run(tc.conf, func(t *testing.T) {
+				_, err := parseQwpQueryConf(tc.conf)
+				if err == nil {
+					t.Fatalf("expected error for %q", tc.conf)
+				}
+				if !strings.Contains(err.Error(), tc.wantSub) {
+					t.Errorf("err=%v, want substring %q", err, tc.wantSub)
+				}
+			})
+		}
+	})
+	t.Run("AcceptBoundaries", func(t *testing.T) {
+		// 1 and 65535 are the inclusive boundaries of the legal range.
+		// "addr=host" with no port is also legal — the URL scheme
+		// supplies a default port at dial time.
+		for _, conf := range []string{
+			"ws::addr=db:1;",
+			"ws::addr=db:65535;",
+			"ws::addr=db.internal;",
+		} {
+			if _, err := parseQwpQueryConf(conf); err != nil {
+				t.Errorf("unexpected error for %q: %v", conf, err)
+			}
+		}
+	})
+}
+
+// TestQwpQueryClientFromConfIPv6 pins the bracketed-IPv6 and bare-IPv6
+// parsing paths in the addr= validator. The validator accepts:
+//   - bracketed with port:     [::1]:9000
+//   - bracketed without port:  [fe80::1]
+//   - bare IPv6 (>= 2 colons): fe80::1 (no port; brackets required for port)
+//
+// And rejects:
+//   - empty bracketed host:       []:9000
+//   - missing closing ']':        [::1:9000
+//   - trailing garbage after ']': [::1]9000
+//   - out-of-range port after a bracketed host: [::1]:0, [::1]:65536
+//
+// Mirrors the Java QwpQueryClientFromConfigTest IPv6 cases. Only
+// single-endpoint addr= values are exercised here; the comma-separated
+// multi-address form is covered by
+// TestQwpQueryClientFromConfAcceptsMultiAddress.
+func TestQwpQueryClientFromConfIPv6(t *testing.T) {
+	t.Run("Accept", func(t *testing.T) {
+		// Only the absence of a parse error is checked here; the
+		// parsed host/port values are not inspected.
+		for _, conf := range []string{
+			"ws::addr=[::1]:9000;",
+			"ws::addr=[fe80::1];",
+			"ws::addr=[::1];",
+			"ws::addr=fe80::1;", // bare IPv6, default port
+		} {
+			t.Run(conf, func(t *testing.T) {
+				if _, err := parseQwpQueryConf(conf); err != nil {
+					t.Errorf("unexpected error for %q: %v", conf, err)
+				}
+			})
+		}
+	})
+	t.Run("Reject", func(t *testing.T) {
+		cases := []struct {
+			conf    string
+			wantSub string
+		}{
+			{"ws::addr=[]:9000;", "empty host"},
+			{"ws::addr=[::1:9000;", "missing closing"},
+			{"ws::addr=[::1]9000;", "expected ':'"},
+			{"ws::addr=[::1]:0;", "out of range"},
+			{"ws::addr=[::1]:65536;", "out of range"},
+		}
+		for _, tc := range cases {
+			t.Run(tc.conf, func(t *testing.T) {
+				_, err := parseQwpQueryConf(tc.conf)
+				if err == nil {
+					t.Fatalf("expected error for %q", tc.conf)
+				}
+				if !strings.Contains(err.Error(), tc.wantSub) {
+					t.Errorf("err=%v, want substring %q", err, tc.wantSub)
+				}
+			})
+		}
+	})
+}
+
+// TestQwpQueryClientFromConfAcceptsMultiAddress verifies that
+// comma-separated addr= entries become an ordered endpoint list. The
+// connect walk in qwp_query_failover.go consumes them in order; the
+// parser's responsibility is just shape validation here.
+func TestQwpQueryClientFromConfAcceptsMultiAddress(t *testing.T) {
+	// wantHosts and wantPorts are parallel slices: entry i describes
+	// the expected cfg.endpoints[i]. IPv6 hosts are expected without
+	// their surrounding brackets.
+	cases := []struct {
+		conf      string
+		wantHosts []string
+		wantPorts []int
+	}{
+		{
+			conf:      "ws::addr=a:9000,b:9001;",
+			wantHosts: []string{"a", "b"},
+			wantPorts: []int{9000, 9001},
+		},
+		{
+			conf:      "ws::addr=a:9000,b:9000,c:9000;",
+			wantHosts: []string{"a", "b", "c"},
+			wantPorts: []int{9000, 9000, 9000},
+		},
+		{
+			conf:      "ws::addr=[::1]:9000,[fe80::1]:9001;",
+			wantHosts: []string{"::1", "fe80::1"},
+			wantPorts: []int{9000, 9001},
+		},
+	}
+	for _, tc := range cases {
+		t.Run(tc.conf, func(t *testing.T) {
+			cfg, err := parseQwpQueryConf(tc.conf)
+			if err != nil {
+				t.Fatalf("parseQwpQueryConf: %v", err)
+			}
+			// Fatal on a length mismatch so the indexed comparison
+			// below cannot run past the end of the want slices.
+			if len(cfg.endpoints) != len(tc.wantHosts) {
+				t.Fatalf("len(endpoints) = %d, want %d", len(cfg.endpoints), len(tc.wantHosts))
+			}
+			for i, ep := range cfg.endpoints {
+				if ep.host != tc.wantHosts[i] || ep.port != tc.wantPorts[i] {
+					t.Errorf("endpoints[%d] = %s:%d, want %s:%d",
+						i, ep.host, ep.port, tc.wantHosts[i], tc.wantPorts[i])
+				}
+			}
+		})
+	}
+}
+
+// TestQwpQueryClientFromConfV2KeysParse verifies the v2 connection-
+// string keys (target, failover, failover_max_attempts,
+// failover_backoff_initial_ms, failover_backoff_max_ms,
+// failover_max_duration_ms, server_info_timeout_ms, replay_exec)
+// parse into the expected config fields and reject malformed values
+// with actionable errors.
+func TestQwpQueryClientFromConfV2KeysParse(t *testing.T) {
+	t.Run("happy_path", func(t *testing.T) {
+		// Every v2 key is set to a distinctive value so each
+		// assertion below can only pass if that key was parsed.
+		conf := "ws::addr=a:9000;target=primary;failover=off;" +
+			"failover_max_attempts=3;failover_backoff_initial_ms=10;" +
+			"failover_backoff_max_ms=200;failover_max_duration_ms=1500;" +
+			"server_info_timeout_ms=750;replay_exec=on;"
+		cfg, err := parseQwpQueryConf(conf)
+		if err != nil {
+			t.Fatalf("parseQwpQueryConf: %v", err)
+		}
+		if cfg.target != qwpTargetPrimary {
+			t.Errorf("target=%v, want primary", cfg.target)
+		}
+		if cfg.failoverEnabled {
+			t.Errorf("failoverEnabled=true, want false")
+		}
+		if cfg.failoverMaxAttempts != 3 {
+			t.Errorf("failoverMaxAttempts=%d, want 3", cfg.failoverMaxAttempts)
+		}
+		// The *_ms keys are expected to land in time.Duration fields.
+		if cfg.failoverBackoffInitial != 10*time.Millisecond {
+			t.Errorf("failoverBackoffInitial=%v, want 10ms", cfg.failoverBackoffInitial)
+		}
+		if cfg.failoverBackoffMax != 200*time.Millisecond {
+			t.Errorf("failoverBackoffMax=%v, want 200ms", cfg.failoverBackoffMax)
+		}
+		if cfg.failoverMaxDuration != 1500*time.Millisecond {
+			t.Errorf("failoverMaxDuration=%v, want 1500ms", cfg.failoverMaxDuration)
+		}
+		if cfg.serverInfoTimeout != 750*time.Millisecond {
+			t.Errorf("serverInfoTimeout=%v, want 750ms", cfg.serverInfoTimeout)
+		}
+		if !cfg.replayExec {
+			t.Errorf("replayExec=false, want true")
+		}
+	})
+
+	// The rejection subtests below only require that the error
+	// message mentions the offending key.
+	t.Run("invalid_target", func(t *testing.T) {
+		_, err := parseQwpQueryConf("ws::addr=a:9000;target=leader;")
+		if err == nil || !strings.Contains(err.Error(), "target") {
+			t.Errorf("err=%v, want target validation error", err)
+		}
+	})
+
+	t.Run("invalid_failover", func(t *testing.T) {
+		_, err := parseQwpQueryConf("ws::addr=a:9000;failover=maybe;")
+		if err == nil || !strings.Contains(err.Error(), "failover") {
+			t.Errorf("err=%v, want failover validation error", err)
+		}
+	})
+
+	t.Run("backoff_max_lt_initial", func(t *testing.T) {
+		_, err := parseQwpQueryConf(
+			"ws::addr=a:9000;failover_backoff_initial_ms=100;" +
+				"failover_backoff_max_ms=10;")
+		if err == nil || !strings.Contains(err.Error(), "failover_backoff_max") {
+			t.Errorf("err=%v, want max-lt-initial error", err)
+		}
+	})
+
+	t.Run("failover_max_duration_default", func(t *testing.T) {
+		cfg, err := parseQwpQueryConf("ws::addr=a:9000;")
+		if err != nil {
+			t.Fatalf("parseQwpQueryConf: %v", err)
+		}
+		if cfg.failoverMaxDuration != qwpDefaultFailoverMaxDuration {
+			t.Errorf("failoverMaxDuration=%v, want default %v",
+				cfg.failoverMaxDuration, qwpDefaultFailoverMaxDuration)
+		}
+	})
+
+	t.Run("failover_max_duration_unbounded", func(t *testing.T) {
+		// An explicit 0 must be preserved rather than replaced by the
+		// default; the test treats 0 as "unbounded".
+		cfg, err := parseQwpQueryConf(
+			"ws::addr=a:9000;failover_max_duration_ms=0;")
+		if err != nil {
+			t.Fatalf("parseQwpQueryConf: %v", err)
+		}
+		if cfg.failoverMaxDuration != 0 {
+			t.Errorf("failoverMaxDuration=%v, want 0 (unbounded)",
+				cfg.failoverMaxDuration)
+		}
+	})
+}
+
+// TestQwpQueryClientFromConfTlsVariations exercises the tls_verify
+// matrix exhaustively: on/unsafe_off accepted on wss://, both rejected
+// on ws://, invalid values rejected, and the legacy tls_roots /
+// tls_roots_password keys explicitly rejected on both schemas (the Go
+// client uses the system trust store only). Mirrors the Java
+// QwpQueryClientFromConfigTest TLS variations.
+func TestQwpQueryClientFromConfTlsVariations(t *testing.T) {
+	// A case with a non-empty wantErrIn must fail to parse with that
+	// substring in the error; otherwise the parse must succeed and
+	// produce wantTls.
+	type tlsCase struct {
+		name      string
+		conf      string
+		wantTls   tlsMode
+		wantErrIn string
+	}
+	cases := []tlsCase{
+		{
+			name:    "wss_no_tls_verify_defaults_to_enabled",
+			conf:    "wss::addr=db:9000;",
+			wantTls: tlsEnabled,
+		},
+		{
+			name:    "wss_tls_verify_on",
+			conf:    "wss::addr=db:9000;tls_verify=on;",
+			wantTls: tlsEnabled,
+		},
+		{
+			name:    "wss_tls_verify_unsafe_off",
+			conf:    "wss::addr=db:9000;tls_verify=unsafe_off;",
+			wantTls: tlsInsecureSkipVerify,
+		},
+		{
+			name:    "ws_no_tls",
+			conf:    "ws::addr=db:9000;",
+			wantTls: tlsDisabled,
+		},
+		{
+			name:      "ws_tls_verify_on_rejected",
+			conf:      "ws::addr=db:9000;tls_verify=on;",
+			wantErrIn: "tls_verify requires",
+		},
+		{
+			name:      "ws_tls_verify_unsafe_off_rejected",
+			conf:      "ws::addr=db:9000;tls_verify=unsafe_off;",
+			wantErrIn: "tls_verify requires",
+		},
+		{
+			name:      "wss_tls_verify_invalid",
+			conf:      "wss::addr=db:9000;tls_verify=strict;",
+			wantErrIn: "invalid tls_verify",
+		},
+		{
+			name:      "wss_tls_roots_rejected",
+			conf:      "wss::addr=db:9000;tls_roots=/etc/ca.p12;",
+			wantErrIn: "tls_roots is not available",
+		},
+		{
+			name:      "ws_tls_roots_rejected",
+			conf:      "ws::addr=db:9000;tls_roots=/etc/ca.p12;",
+			wantErrIn: "tls_roots is not available",
+		},
+		{
+			name:      "tls_roots_password_rejected",
+			conf:      "wss::addr=db:9000;tls_roots_password=secret;",
+			wantErrIn: "tls_roots_password is not available",
+		},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			cfg, err := parseQwpQueryConf(c.conf)
+			if c.wantErrIn != "" {
+				if err == nil {
+					t.Fatalf("expected error containing %q", c.wantErrIn)
+				}
+				if !strings.Contains(err.Error(), c.wantErrIn) {
+					t.Errorf("err=%v, want %q", err, c.wantErrIn)
+				}
+				return
+			}
+			if err != nil {
+				t.Fatalf("parse: %v", err)
+			}
+			if cfg.tlsMode != c.wantTls {
+				t.Errorf("tlsMode=%v, want %v", cfg.tlsMode, c.wantTls)
+			}
+		})
+	}
+}
+
+// TestQwpQueryClientFromConfCompressionVariations exhaustively covers
+// the compression and compression_level keys: every accepted value
+// (raw / zstd / auto), every boundary on compression_level (1 and 22
+// inclusive), and the rejected values that Java's
+// QwpQueryClientFromConfigTest pins.
+func TestQwpQueryClientFromConfCompressionVariations(t *testing.T) {
+	// wantHeaderHasZst selects the header expectation for successful
+	// parses: true requires both "zstd" and the "raw" fallback in the
+	// accept-encoding value, false requires an empty header.
+	type compCase struct {
+		name             string
+		conf             string
+		wantCompression  string
+		wantLevel        int
+		wantErrIn        string
+		wantHeaderHasZst bool
+	}
+	cases := []compCase{
+		{
+			name:            "default_is_raw",
+			conf:            "ws::addr=db:9000;",
+			wantCompression: qwpCompressionRaw,
+			wantLevel:       qwpDefaultCompressionLevel,
+		},
+		{
+			name:             "zstd_at_lower_bound",
+			conf:             "ws::addr=db:9000;compression=zstd;compression_level=1;",
+			wantCompression:  qwpCompressionZstd,
+			wantLevel:        1,
+			wantHeaderHasZst: true,
+		},
+		{
+			name:             "zstd_at_upper_bound",
+			conf:             "ws::addr=db:9000;compression=zstd;compression_level=22;",
+			wantCompression:  qwpCompressionZstd,
+			wantLevel:        22,
+			wantHeaderHasZst: true,
+		},
+		{
+			name:             "auto_also_advertises_zstd",
+			conf:             "ws::addr=db:9000;compression=auto;",
+			wantCompression:  qwpCompressionAuto,
+			wantLevel:        qwpDefaultCompressionLevel,
+			wantHeaderHasZst: true,
+		},
+		{
+			name:            "raw_explicit",
+			conf:            "ws::addr=db:9000;compression=raw;",
+			wantCompression: qwpCompressionRaw,
+			wantLevel:       qwpDefaultCompressionLevel,
+		},
+		{
+			name:      "level_zero_rejected",
+			conf:      "ws::addr=db:9000;compression_level=0;",
+			wantErrIn: "must be in [1, 22]",
+		},
+		{
+			name:      "level_negative_rejected",
+			conf:      "ws::addr=db:9000;compression_level=-1;",
+			wantErrIn: "must be in [1, 22]",
+		},
+		{
+			name:      "level_too_large_rejected",
+			conf:      "ws::addr=db:9000;compression_level=23;",
+			wantErrIn: "must be in [1, 22]",
+		},
+		{
+			name:      "level_non_numeric_rejected",
+			conf:      "ws::addr=db:9000;compression_level=high;",
+			wantErrIn: "invalid compression_level",
+		},
+		{
+			name:      "compression_invalid_rejected",
+			conf:      "ws::addr=db:9000;compression=gzip;",
+			wantErrIn: "invalid compression",
+		},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			cfg, err := parseQwpQueryConf(c.conf)
+			if c.wantErrIn != "" {
+				if err == nil {
+					t.Fatalf("expected error containing %q", c.wantErrIn)
+				}
+				if !strings.Contains(err.Error(), c.wantErrIn) {
+					t.Errorf("err=%v, want %q", err, c.wantErrIn)
+				}
+				return
+			}
+			if err != nil {
+				t.Fatalf("parse: %v", err)
+			}
+			if cfg.compression != c.wantCompression {
+				t.Errorf("compression=%q, want %q", cfg.compression, c.wantCompression)
+			}
+			if cfg.compressionLevel != c.wantLevel {
+				t.Errorf("compressionLevel=%d, want %d", cfg.compressionLevel, c.wantLevel)
+			}
+			h := cfg.buildAcceptEncodingHeader()
+			if c.wantHeaderHasZst {
+				if !strings.Contains(h, "zstd") {
+					t.Errorf("buildAcceptEncodingHeader=%q, want to contain 'zstd'", h)
+				}
+				if !strings.Contains(h, "raw") {
+					t.Errorf("buildAcceptEncodingHeader=%q, want to contain 'raw' fallback", h)
+				}
+			} else {
+				if h != "" {
+					t.Errorf("buildAcceptEncodingHeader=%q, want empty for raw", h)
+				}
+			}
+		})
+	}
+}
+
+// --- Functional options tests ---
+
+// TestQwpQueryClientOptionsApply applies one of each functional option
+// to a default config and checks that every option wrote the field it
+// is responsible for, including the accept-encoding header derived
+// from the compression settings.
+func TestQwpQueryClientOptionsApply(t *testing.T) {
+	cfg := qwpQueryDefaultConfig()
+	for _, opt := range []QwpQueryClientOption{
+		WithQwpQueryAddress("example:9000"),
+		WithQwpQueryEndpointPath("/read/v2"),
+		WithQwpQueryBasicAuth("u", "p"),
+		WithQwpQueryBufferPoolSize(16),
+		WithQwpQueryMaxBatchRows(1000),
+		WithQwpQueryClientID("unit-test/1.0"),
+		WithQwpQueryInitialCredit(4096),
+		WithQwpQueryTlsInsecureSkipVerify(),
+		WithQwpQueryCompression(qwpCompressionZstd),
+		WithQwpQueryCompressionLevel(9),
+		WithQwpQueryFailoverMaxDuration(7 * time.Second),
+	} {
+		opt(cfg)
+	}
+	if got := cfg.addressString(); got != "example:9000" {
+		t.Errorf("addressString=%q", got)
+	}
+	if cfg.endpointPath != "/read/v2" {
+		t.Errorf("endpointPath=%q", cfg.endpointPath)
+	}
+	if cfg.httpUser != "u" || cfg.httpPass != "p" {
+		t.Errorf("basic=%q/%q", cfg.httpUser, cfg.httpPass)
+	}
+	if cfg.bufferPoolSize != 16 {
+		t.Errorf("bufferPoolSize=%d", cfg.bufferPoolSize)
+	}
+	if cfg.maxBatchRows != 1000 {
+		t.Errorf("maxBatchRows=%d", cfg.maxBatchRows)
+	}
+	if cfg.clientID != "unit-test/1.0" {
+		t.Errorf("clientID=%q", cfg.clientID)
+	}
+	if cfg.initialCredit != 4096 {
+		t.Errorf("initialCredit=%d", cfg.initialCredit)
+	}
+	if cfg.tlsMode != tlsInsecureSkipVerify {
+		t.Errorf("tlsMode=%v", cfg.tlsMode)
+	}
+	if cfg.compression != qwpCompressionZstd {
+		t.Errorf("compression=%q", cfg.compression)
+	}
+	if cfg.compressionLevel != 9 {
+		t.Errorf("compressionLevel=%d", cfg.compressionLevel)
+	}
+	if got := cfg.buildAcceptEncodingHeader(); got != "zstd;level=9,raw" {
+		t.Errorf("accept-encoding=%q", got)
+	}
+	if cfg.failoverMaxDuration != 7*time.Second {
+		t.Errorf("failoverMaxDuration=%v, want 7s", cfg.failoverMaxDuration)
+	}
+}
+
+// --- Mock server integration tests for the public API ---
+
+// newMockQueryClient stands up the egress mock server, dials it with a
+// QwpQueryClient, and returns the client + cleanup. handler drives the
+// test-side choreography.
+//
+// A bufferPoolSize <= 0 leaves the client's default pool size in
+// place. The returned cleanup closes the client (bounded by a 2s
+// context) and then the server; callers are expected to defer it. If
+// dialing fails the server is closed and the test is failed
+// immediately.
+func newMockQueryClient(
+	t *testing.T,
+	bufferPoolSize int,
+	handler func(*qwpMockEgressConn),
+) (*QwpQueryClient, func()) {
+	t.Helper()
+	srv := newQwpMockEgressServer(t, handler)
+	wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") // httptest.NewServer → http://
+	addr := strings.TrimPrefix(wsURL, "ws://")
+	// The 5s context bounds only the dial below.
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	poolOpts := []QwpQueryClientOption{WithQwpQueryAddress(addr)}
+	if bufferPoolSize > 0 {
+		poolOpts = append(poolOpts, WithQwpQueryBufferPoolSize(bufferPoolSize))
+	}
+	c, err := NewQwpQueryClient(ctx, poolOpts...)
+	if err != nil {
+		srv.Close()
+		t.Fatalf("NewQwpQueryClient: %v", err)
+	}
+	cleanup := func() {
+		closeCtx, closeCancel := context.WithTimeout(context.Background(), 2*time.Second)
+		defer closeCancel()
+		// Best-effort: a close error during teardown is not a test failure.
+		_ = c.Close(closeCtx)
+		srv.Close()
+	}
+	return c, cleanup
+}
+
+// TestQwpQueryHappyPath drives two batches + RESULT_END through the
+// public Query cursor and verifies Batches() yields them in order,
+// TotalRows() matches, and no error leaks.
+func TestQwpQueryHappyPath(t *testing.T) {
+	const wantSQL = "SELECT * FROM trades"
+	c, cleanup := newMockQueryClient(t, 4, func(m *qwpMockEgressConn) {
+		// Server side: read the query request, check the SQL, then
+		// reply with two one-row batches (values 10 and 20) followed
+		// by RESULT_END.
+		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		defer cancel()
+		req := m.readBinary(ctx)
+		reqID, sql, _ := parseQueryRequest(t, req)
+		if sql != wantSQL {
+			t.Errorf("server sql=%q, want %q", sql, wantSQL)
+		}
+		m.sendBinary(ctx, buildOneRowInt64Batch(t, reqID, 0, "v", 10))
+		m.sendBinary(ctx, buildOneRowInt64Batch(t, reqID, 1, "v", 20))
+		m.sendBinary(ctx, writeQwpFrame(0, buildResultEndBody(reqID, 1, 2)))
+	})
+	defer cleanup()
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	q := c.Query(ctx, wantSQL)
+	defer q.Close()
+
+	var got []int64
+	for batch, err := range q.Batches() {
+		if err != nil {
+			t.Fatalf("iterator error: %v", err)
+		}
+		// The value is read inside the loop body because the batch
+		// is only valid for the current iteration.
+		got = append(got, batch.Int64(0, 0))
+	}
+	if len(got) != 2 || got[0] != 10 || got[1] != 20 {
+		t.Fatalf("rows=%v, want [10 20]", got)
+	}
+	if q.TotalRows() != 2 {
+		t.Errorf("TotalRows=%d, want 2", q.TotalRows())
+	}
+}
+
+// TestQwpQueryRequestIdsAreMonotonic runs two queries in sequence on
+// the same client and verifies the client-assigned requestIds tick up
+// by one, starting at 1 (matches Java nextRequestId initialization).
+func TestQwpQueryRequestIdsAreMonotonic(t *testing.T) { + seenIDs := make(chan int64, 4) + c, cleanup := newMockQueryClient(t, 2, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + for i := 0; i < 2; i++ { + req := m.readBinary(ctx) + reqID, _, _ := parseQueryRequest(t, req) + seenIDs <- reqID + m.sendBinary(ctx, writeQwpFrame(0, buildResultEndBody(reqID, 0, 0))) + } + }) + defer cleanup() + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + for i := 0; i < 2; i++ { + q := c.Query(ctx, "SELECT 1") + for _, err := range q.Batches() { + if err != nil { + t.Fatalf("batch err: %v", err) + } + } + q.Close() + } + close(seenIDs) + var ids []int64 + for id := range seenIDs { + ids = append(ids, id) + } + if len(ids) != 2 || ids[0] != 1 || ids[1] != 2 { + t.Errorf("requestIds=%v, want [1 2]", ids) + } +} + +// TestQwpQueryServerErrorSurfacesAsQwpQueryError verifies the +// iterator yields a *QwpQueryError with the server's status and +// message on a QUERY_ERROR frame. 
+func TestQwpQueryServerErrorSurfacesAsQwpQueryError(t *testing.T) { + c, cleanup := newMockQueryClient(t, 2, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + req := m.readBinary(ctx) + reqID, _, _ := parseQueryRequest(t, req) + m.sendBinary(ctx, writeQwpFrame(0, buildQueryErrorBody( + reqID, byte(QwpStatusParseError), "bad sql", -1))) + }) + defer cleanup() + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + q := c.Query(ctx, "NONSENSE") + defer q.Close() + + var lastErr error + var batches int + for _, err := range q.Batches() { + if err != nil { + lastErr = err + continue + } + batches++ + } + if batches != 0 { + t.Errorf("batches=%d, want 0", batches) + } + if lastErr == nil { + t.Fatal("expected iterator error, got nil") + } + var qe *QwpQueryError + if !errors.As(lastErr, &qe) { + t.Fatalf("err type=%T, want *QwpQueryError: %v", lastErr, lastErr) + } + if qe.Status != QwpStatusParseError { + t.Errorf("Status=0x%02X, want 0x%02X", byte(qe.Status), byte(QwpStatusParseError)) + } + if qe.Message != "bad sql" { + t.Errorf("Message=%q", qe.Message) + } +} + +// TestQwpQueryOnNonSelectSurfacesError verifies that running Query on +// a non-SELECT statement surfaces the misuse as an error on the +// iterator (server sent EXEC_DONE where we expected RESULT_BATCHes). 
+func TestQwpQueryOnNonSelectSurfacesError(t *testing.T) { + c, cleanup := newMockQueryClient(t, 2, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + req := m.readBinary(ctx) + reqID, _, _ := parseQueryRequest(t, req) + m.sendBinary(ctx, writeQwpFrame(0, buildExecDoneBody(reqID, 0x04, 99))) + }) + defer cleanup() + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + q := c.Query(ctx, "INSERT INTO x VALUES (1)") + defer q.Close() + + var lastErr error + for _, err := range q.Batches() { + if err != nil { + lastErr = err + } + } + if lastErr == nil { + t.Fatal("expected iterator error for Query-on-non-SELECT") + } + if !strings.Contains(lastErr.Error(), "non-SELECT") { + t.Errorf("error = %v, want contains 'non-SELECT'", lastErr) + } +} + +// TestQwpQueryBreakOutSendsCancel verifies that breaking out of the +// range loop early sends a CANCEL frame to the server and drains to +// the server's CANCELLED echo cleanly. 
+func TestQwpQueryBreakOutSendsCancel(t *testing.T) { + cancelSeen := make(chan int64, 1) + c, cleanup := newMockQueryClient(t, 2, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + req := m.readBinary(ctx) + reqID, _, _ := parseQueryRequest(t, req) + m.sendBinary(ctx, buildOneRowInt64Batch(t, reqID, 0, "v", 42)) + for { + frame := m.readBinary(ctx) + if frame[0] == byte(qwpMsgKindCancel) { + cancelSeen <- int64(binary.LittleEndian.Uint64(frame[1:])) + break + } + } + m.sendBinary(ctx, writeQwpFrame(0, buildQueryErrorBody( + reqID, byte(qwpStatusCancelled), "cancelled", -1))) + }) + defer cleanup() + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + q := c.Query(ctx, "SELECT 1") + defer q.Close() + + var saw int64 + for batch, err := range q.Batches() { + if err != nil { + t.Fatalf("unexpected iterator error: %v", err) + } + saw = batch.Int64(0, 0) + break // trigger cancel + } + if saw != 42 { + t.Errorf("saw=%d, want 42", saw) + } + select { + case gotID := <-cancelSeen: + if gotID != q.RequestId() { + t.Errorf("cancel id=%d, want %d", gotID, q.RequestId()) + } + case <-time.After(2 * time.Second): + t.Fatal("server never saw CANCEL") + } +} + +// TestQwpQueryCancelBeforeIterate verifies that calling Cancel before +// iterating sends a CANCEL frame and the iterator exits cleanly on +// the server's CANCELLED echo (no error yielded). +func TestQwpQueryCancelBeforeIterate(t *testing.T) { + c, cleanup := newMockQueryClient(t, 2, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + req := m.readBinary(ctx) + reqID, _, _ := parseQueryRequest(t, req) + // Wait for CANCEL. 
+ for { + frame := m.readBinary(ctx) + if frame[0] == byte(qwpMsgKindCancel) { + break + } + } + m.sendBinary(ctx, writeQwpFrame(0, buildQueryErrorBody( + reqID, byte(qwpStatusCancelled), "cancelled", -1))) + }) + defer cleanup() + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + q := c.Query(ctx, "SELECT 1") + defer q.Close() + + q.Cancel() + + var sawErr error + var batches int + for _, err := range q.Batches() { + if err != nil { + sawErr = err + } else { + batches++ + } + } + if sawErr != nil { + t.Errorf("iterator err=%v, want clean end", sawErr) + } + if batches != 0 { + t.Errorf("got %d batches, want 0", batches) + } +} + +// TestQwpExecHappyPath runs an Exec and expects the ExecResult parsed +// from an EXEC_DONE frame. +func TestQwpExecHappyPath(t *testing.T) { + c, cleanup := newMockQueryClient(t, 2, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + req := m.readBinary(ctx) + reqID, _, _ := parseQueryRequest(t, req) + m.sendBinary(ctx, writeQwpFrame(0, buildExecDoneBody(reqID, 0x07, 42))) + }) + defer cleanup() + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + res, err := c.Exec(ctx, "INSERT INTO x VALUES (1)") + if err != nil { + t.Fatalf("Exec: %v", err) + } + if res.OpType != 0x07 { + t.Errorf("OpType=0x%02X, want 0x07", res.OpType) + } + if res.RowsAffected != 42 { + t.Errorf("RowsAffected=%d, want 42", res.RowsAffected) + } +} + +// TestQwpExecServerErrorReturnsQwpQueryError verifies that a +// QUERY_ERROR on Exec surfaces as *QwpQueryError. 
+func TestQwpExecServerErrorReturnsQwpQueryError(t *testing.T) { + c, cleanup := newMockQueryClient(t, 2, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + req := m.readBinary(ctx) + reqID, _, _ := parseQueryRequest(t, req) + m.sendBinary(ctx, writeQwpFrame(0, buildQueryErrorBody( + reqID, byte(QwpStatusInternalError), "boom", -1))) + }) + defer cleanup() + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _, err := c.Exec(ctx, "DROP TABLE nonexistent") + if err == nil { + t.Fatal("expected error") + } + var qe *QwpQueryError + if !errors.As(err, &qe) { + t.Fatalf("err type=%T, want *QwpQueryError", err) + } + if qe.Status != QwpStatusInternalError || qe.Message != "boom" { + t.Errorf("err=%+v", qe) + } +} + +// TestQwpExecOnSelectSurfacesMisuse verifies that running Exec on a +// SELECT (which returns RESULT_BATCH / RESULT_END) surfaces as an +// error explaining the caller should use Query instead. We also +// verify the buffer gets released (exec returned once terminal). 
+func TestQwpExecOnSelectSurfacesMisuse(t *testing.T) { + c, cleanup := newMockQueryClient(t, 2, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + req := m.readBinary(ctx) + reqID, _, _ := parseQueryRequest(t, req) + m.sendBinary(ctx, buildOneRowInt64Batch(t, reqID, 0, "v", 1)) + m.sendBinary(ctx, writeQwpFrame(0, buildResultEndBody(reqID, 0, 1))) + }) + defer cleanup() + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _, err := c.Exec(ctx, "SELECT 1") + if err == nil { + t.Fatal("expected misuse error") + } + if !strings.Contains(err.Error(), "SELECT-style") { + t.Errorf("err=%v, want contains 'SELECT-style'", err) + } +} + +// TestQwpQueryRejectsOversizedSql verifies that buildRequest's +// preflight blocks SQL text exceeding the spec §16 1 MiB limit +// before any bytes leave the process. Both Query (iterator-yielded +// error) and Exec (sync error) surface a typed length-limit message. +func TestQwpQueryRejectsOversizedSql(t *testing.T) { + c, cleanup := newMockQueryClient(t, 2, func(m *qwpMockEgressConn) { + // Hold the connection open until the client tears it down; + // preflight rejects the SQL before any frame leaves the + // client, so this read must never return data. 
+ ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + _, _, _ = m.conn.Read(ctx) + }) + defer cleanup() + + oversized := strings.Repeat("a", qwpMaxSqlTextBytes+1) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + q := c.Query(ctx, oversized) + defer q.Close() + var queryErr error + for _, err := range q.Batches() { + if err != nil { + queryErr = err + } + } + if queryErr == nil { + t.Fatal("expected oversized-SQL error from Query") + } + if !strings.Contains(queryErr.Error(), "exceeds") || + !strings.Contains(queryErr.Error(), "1048576") { + t.Errorf("Query err=%v, want size-limit message", queryErr) + } + + _, execErr := c.Exec(ctx, oversized) + if execErr == nil { + t.Fatal("expected oversized-SQL error from Exec") + } + if !strings.Contains(execErr.Error(), "exceeds") || + !strings.Contains(execErr.Error(), "1048576") { + t.Errorf("Exec err=%v, want size-limit message", execErr) + } +} + +// TestQwpQueryPoolBackpressureAcrossIterator wires a pool=1 client to +// a server that emits 3 batches + End. Public Batches() iterator must +// still surface all batches in order — auto-release per iteration +// keeps the pool alive. 
+func TestQwpQueryPoolBackpressureAcrossIterator(t *testing.T) { + c, cleanup := newMockQueryClient(t, 1, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + req := m.readBinary(ctx) + reqID, _, _ := parseQueryRequest(t, req) + m.sendBinary(ctx, buildOneRowInt64Batch(t, reqID, 0, "v", 100)) + m.sendBinary(ctx, buildOneRowInt64Batch(t, reqID, 1, "v", 200)) + m.sendBinary(ctx, buildOneRowInt64Batch(t, reqID, 2, "v", 300)) + m.sendBinary(ctx, writeQwpFrame(0, buildResultEndBody(reqID, 2, 3))) + }) + defer cleanup() + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + q := c.Query(ctx, "SELECT v FROM t") + defer q.Close() + + var got []int64 + for batch, err := range q.Batches() { + if err != nil { + t.Fatalf("iter err: %v", err) + } + got = append(got, batch.Int64(0, 0)) + } + if len(got) != 3 || got[0] != 100 || got[1] != 200 || got[2] != 300 { + t.Fatalf("got=%v, want [100 200 300]", got) + } + if q.TotalRows() != 3 { + t.Errorf("TotalRows=%d, want 3", q.TotalRows()) + } +} + +// TestQwpQueryClientCloseTwiceOK verifies Close is idempotent. +func TestQwpQueryClientCloseTwiceOK(t *testing.T) { + srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) { + // Keep the connection alive until the client tears it down. + // An immediate return triggers the server-side CloseNow + // before the client even submits, and races the client's + // own close into an EOF. 
+ ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + _, _, _ = m.conn.Read(ctx) + }) + defer srv.Close() + addr := strings.TrimPrefix(srv.URL, "http://") + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + c, err := NewQwpQueryClient(ctx, WithQwpQueryAddress(addr)) + if err != nil { + t.Fatalf("ctor: %v", err) + } + if err := c.Close(ctx); err != nil { + t.Fatalf("close 1: %v", err) + } + if err := c.Close(ctx); err != nil { + t.Fatalf("close 2: %v", err) + } +} + +// TestQwpQueryClientCloseShortCtxNoReaderRace guards M2: Close(ctx) with +// an already-cancelled ctx must not race the reader goroutine over the +// transport's conn. shutdown(ctx) returns via ctx.Done() before doneCh +// fires (the reader has not joined), so the transport teardown that +// follows runs while the reader is still live inside readerRun. The +// reader re-reads io.transport.conn every loop iteration; the teardown +// must not mutate that field out from under it. Run under -race (CI uses +// `go test -race`): before the fix this trips the detector on +// io.transport.conn — readerRun's per-iteration field read vs close()'s +// t.conn=nil write — and can nil-deref the unsupervised reader goroutine. +func TestQwpQueryClientCloseShortCtxNoReaderRace(t *testing.T) { + // Server streams stray text frames as fast as it can and drains its + // own reads concurrently so the client's close handshake completes + // promptly. readerRun reads io.transport.conn every iteration, skips + // non-binary frames, and loops — so the reader goroutine spins on + // that field read while the close lands. 
+ srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) { + go func() { + for { + if _, _, err := m.conn.Read(context.Background()); err != nil { + return + } + } + }() + for { + if err := m.conn.Write(context.Background(), websocket.MessageText, []byte("x")); err != nil { + return + } + } + }) + defer srv.Close() + addr := strings.TrimPrefix(srv.URL, "http://") + + // Repeat: each round stands up a fresh generation whose reader spins + // on io.transport.conn, then closes it with an already-cancelled ctx. + // shutdown(ctx) returns via ctx.Done() before doneCh fires (the reader + // has not joined), so the pre-fix unconditional tr.close() nils + // io.transport.conn concurrently with the still-spinning reader — a + // data race the detector flags within a few rounds. + for i := 0; i < 40; i++ { + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + c, err := NewQwpQueryClient(ctx, WithQwpQueryAddress(addr)) + cancel() + if err != nil { + t.Fatalf("round %d ctor: %v", i, err) + } + // Let the reader reach its read-skip loop and spin on the conn + // field before the close writes it. + time.Sleep(2 * time.Millisecond) + closeCtx, closeCancel := context.WithCancel(context.Background()) + closeCancel() + _ = c.Close(closeCtx) + } +} + +// TestQwpQueryOnClosedClient verifies that Query/Exec on a closed +// client surface an error instead of dialing a stale transport. +func TestQwpQueryOnClosedClient(t *testing.T) { + srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) { + // Keep the connection alive until the client tears it down. + // An immediate return triggers the server-side CloseNow + // before the client even submits, and races the client's + // own close into an EOF. 
+ ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + _, _, _ = m.conn.Read(ctx) + }) + defer srv.Close() + addr := strings.TrimPrefix(srv.URL, "http://") + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + c, err := NewQwpQueryClient(ctx, WithQwpQueryAddress(addr)) + if err != nil { + t.Fatalf("ctor: %v", err) + } + _ = c.Close(ctx) + + // Query: error should surface on first iteration. + q := c.Query(ctx, "SELECT 1") + var gotErr error + for _, err := range q.Batches() { + if err != nil { + gotErr = err + } + } + if gotErr == nil || !strings.Contains(gotErr.Error(), "closed") { + t.Errorf("Query on closed client err=%v, want 'closed' substring", gotErr) + } + + // Exec: sync error. + if _, err := c.Exec(ctx, "DROP TABLE X"); err == nil || + !strings.Contains(err.Error(), "closed") { + t.Errorf("Exec on closed client err=%v", err) + } +} + +// TestQwpQueryClientSendsEgressHeaders verifies that max_batch_rows +// and the X-QWP-Accept-Encoding header omission (step-9 deferral) +// propagate through the public client to the upgrade request. +func TestQwpQueryClientSendsEgressHeaders(t *testing.T) { + var sawMaxBatchRows string + var sawAcceptEnc string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + sawMaxBatchRows = r.Header.Get(qwpHeaderMaxBatchRows) + sawAcceptEnc = r.Header.Get(qwpHeaderAcceptEncoding) + w.Header().Set(qwpHeaderVersion, "1") + conn, err := websocket.Accept(w, r, nil) + if err != nil { + return + } + defer conn.CloseNow() + // The egress client reads SERVER_INFO during connect; emit one + // so the upgrade-header assertions below are reached. 
+ info := buildServerInfoFrame(qwpVersion, 0, qwpRolePrimary, 1, 0, + 1_700_000_000_000_000_000, "test-cluster", "mock-node") + _ = conn.Write(r.Context(), websocket.MessageBinary, info) + })) + defer srv.Close() + addr := strings.TrimPrefix(srv.URL, "http://") + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + c, err := NewQwpQueryClient(ctx, + WithQwpQueryAddress(addr), + WithQwpQueryMaxBatchRows(1234), + ) + if err != nil { + t.Fatalf("ctor: %v", err) + } + defer c.Close(ctx) + + if sawMaxBatchRows != "1234" { + t.Errorf("X-QWP-Max-Batch-Rows=%q, want 1234", sawMaxBatchRows) + } + if sawAcceptEnc != "" { + t.Errorf("X-QWP-Accept-Encoding=%q, want empty (default compression=raw omits the header)", sawAcceptEnc) + } +} + +// TestQwpQueryClientSendsAcceptEncodingWhenCompressed covers the +// compression opt-in path. When the user sets compression to "zstd" +// or "auto", the client advertises zstd in the upgrade handshake; +// "raw" (default, covered above) omits the header entirely. +func TestQwpQueryClientSendsAcceptEncodingWhenCompressed(t *testing.T) { + cases := []struct { + name string + opts []QwpQueryClientOption + wantAE string + }{ + { + // qwpDefaultCompressionLevel = 1 per Sender.java and + // connect-string.md §Query client keys ("Default `1`"). 
+ name: "zstd_default_level", + opts: []QwpQueryClientOption{ + WithQwpQueryCompression(qwpCompressionZstd), + }, + wantAE: "zstd;level=1,raw", + }, + { + name: "zstd_explicit_level", + opts: []QwpQueryClientOption{ + WithQwpQueryCompression(qwpCompressionZstd), + WithQwpQueryCompressionLevel(7), + }, + wantAE: "zstd;level=7,raw", + }, + { + name: "auto_also_advertises_zstd", + opts: []QwpQueryClientOption{ + WithQwpQueryCompression(qwpCompressionAuto), + }, + wantAE: "zstd;level=1,raw", + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + var sawAE string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + sawAE = r.Header.Get(qwpHeaderAcceptEncoding) + w.Header().Set(qwpHeaderVersion, "1") + conn, err := websocket.Accept(w, r, nil) + if err != nil { + return + } + defer conn.CloseNow() + // The egress client reads SERVER_INFO during connect; + // emit one so the header assertion below is reached. + info := buildServerInfoFrame(qwpVersion, 0, qwpRolePrimary, 1, 0, + 1_700_000_000_000_000_000, "test-cluster", "mock-node") + _ = conn.Write(r.Context(), websocket.MessageBinary, info) + })) + defer srv.Close() + addr := strings.TrimPrefix(srv.URL, "http://") + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + opts := append([]QwpQueryClientOption{WithQwpQueryAddress(addr)}, tc.opts...) + c, err := NewQwpQueryClient(ctx, opts...) + if err != nil { + t.Fatalf("ctor: %v", err) + } + defer c.Close(ctx) + if sawAE != tc.wantAE { + t.Errorf("X-QWP-Accept-Encoding=%q, want %q", sawAE, tc.wantAE) + } + }) + } +} + +// TestQwpQueryCloseAfterCtxCancel exercises the close-path drain +// fix: a break-out from the iterator after the query's ctx has been +// cancelled must still drain the dispatcher to idle so a follow-up +// Query on the same client works. 
With the pre-fix behavior the +// iterator's break-out drain would return ctx.Err() immediately, +// strand the server's CANCELLED echo in the events channel, and the +// next query's takeEvent would pick up that stale error. +func TestQwpQueryCloseAfterCtxCancel(t *testing.T) { + c, cleanup := newMockQueryClient(t, 2, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + // Query 1: one batch, wait for CANCEL, respond with CANCELLED echo. + req1 := m.readBinary(ctx) + reqID1, _, _ := parseQueryRequest(t, req1) + m.sendBinary(ctx, buildOneRowInt64Batch(t, reqID1, 0, "v", 1)) + for { + frame := m.readBinary(ctx) + if frame[0] == byte(qwpMsgKindCancel) { + break + } + } + m.sendBinary(ctx, writeQwpFrame(0, buildQueryErrorBody( + reqID1, byte(qwpStatusCancelled), "cancelled", -1))) + // Query 2: one batch + RESULT_END. Proves the dispatcher + // returned to idle after query 1's drain. + req2 := m.readBinary(ctx) + reqID2, _, _ := parseQueryRequest(t, req2) + m.sendBinary(ctx, buildOneRowInt64Batch(t, reqID2, 0, "v", 2)) + m.sendBinary(ctx, writeQwpFrame(0, buildResultEndBody(reqID2, 0, 1))) + }) + defer cleanup() + + // Query 1: iterate one batch, cancel ctx, break out. + ctx1, cancel1 := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel1() // belt-and-braces so vet sees every return path cancel + q1 := c.Query(ctx1, "SELECT 1") + var saw1 int64 + for b, err := range q1.Batches() { + if err != nil { + t.Fatalf("iter1 err: %v", err) + } + saw1 = b.Int64(0, 0) + cancel1() // kill q1.ctx while iterating — exercises the drain path + break + } + if saw1 != 1 { + t.Fatalf("saw1=%d, want 1", saw1) + } + q1.Close() // no-op: break-out already set done=true via the deferred Store + + // Query 2 must succeed — dispatcher is idle iff the break-out + // drain on query 1 used a cleanup ctx (not the dead q.ctx). 
+ ctx2, cancel2 := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel2() + q2 := c.Query(ctx2, "SELECT 2") + defer q2.Close() + var saw2 int64 + for b, err := range q2.Batches() { + if err != nil { + t.Fatalf("iter2 err: %v", err) + } + saw2 = b.Int64(0, 0) + } + if saw2 != 2 { + t.Errorf("saw2=%d, want 2 (stale query-1 error leaked into query 2?)", saw2) + } + if q2.TotalRows() != 1 { + t.Errorf("q2.TotalRows=%d, want 1", q2.TotalRows()) + } +} + +// TestQwpQueryInitialCreditReachesWire verifies that +// WithQwpQueryInitialCredit actually sets the initial_credit varint +// on the outgoing QUERY_REQUEST frame. The option is exercised by +// other unit tests only at the config level; this is the end-to-end +// wire probe. +func TestQwpQueryInitialCreditReachesWire(t *testing.T) { + gotCredit := make(chan int64, 1) + srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + req := m.readBinary(ctx) + reqID, _, credit := parseQueryRequest(t, req) + gotCredit <- credit + m.sendBinary(ctx, writeQwpFrame(0, buildResultEndBody(reqID, 0, 0))) + }) + defer srv.Close() + addr := strings.TrimPrefix(srv.URL, "http://") + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + c, err := NewQwpQueryClient(ctx, + WithQwpQueryAddress(addr), + WithQwpQueryInitialCredit(65536), + ) + if err != nil { + t.Fatalf("ctor: %v", err) + } + defer c.Close(ctx) + + q := c.Query(ctx, "SELECT 1") + defer q.Close() + for _, err := range q.Batches() { + if err != nil { + t.Fatalf("iter err: %v", err) + } + } + + select { + case got := <-gotCredit: + if got != 65536 { + t.Errorf("initial_credit on wire = %d, want 65536", got) + } + case <-time.After(2 * time.Second): + t.Fatal("server never saw QUERY_REQUEST") + } +} + +// TestQwpQueryCloseIdempotentAfterFinish locks in the documented +// contract that Close on an already-finished cursor is a safe 
no-op. +// Exercised via the CAS guard on q.state. +func TestQwpQueryCloseIdempotentAfterFinish(t *testing.T) { + c, cleanup := newMockQueryClient(t, 2, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + req := m.readBinary(ctx) + reqID, _, _ := parseQueryRequest(t, req) + m.sendBinary(ctx, writeQwpFrame(0, buildResultEndBody(reqID, 0, 0))) + }) + defer cleanup() + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + q := c.Query(ctx, "SELECT 1") + for _, err := range q.Batches() { + if err != nil { + t.Fatalf("iter err: %v", err) + } + } + // First Close after a normal iteration-to-End: no-op because the + // iterator's deferred state→Done already fired. Second Close: + // same, the CAS from Idle fails on Done state. Neither call + // should panic or block. + q.Close() + q.Close() +} + +// TestQwpQueryDrainAfterIteratorCtxExpiry reproduces the bug where +// Batches() yields (nil, ctx.Err()) without sending CANCEL or +// draining, leaving the dispatcher stuck in receiveLoop for the +// abandoned query. The iterator's deferred state→Done then poisons +// the q.Close() CAS so Close early-returns too, and the next +// c.Query() deadlocks on the single-slot requests channel (or, with a +// bounded ctx, returns a stale error instead of running cleanly). +// +// Exercises the takeEvent-error path specifically: the caller's ctx +// expires mid-wait before the server has sent anything. With the fix +// the iterator must CANCEL + drain on a cleanup ctx so the dispatcher +// returns to idle. +func TestQwpQueryDrainAfterIteratorCtxExpiry(t *testing.T) { + c, cleanup := newMockQueryClient(t, 2, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + // Query 1: send nothing; just wait for CANCEL, then echo + // CANCELLED. Mirrors a slow-server / timeout scenario. 
+ req1 := m.readBinary(ctx) + reqID1, _, _ := parseQueryRequest(t, req1) + for { + frame := m.readBinary(ctx) + if frame[0] == byte(qwpMsgKindCancel) { + break + } + } + m.sendBinary(ctx, writeQwpFrame(0, buildQueryErrorBody( + reqID1, byte(qwpStatusCancelled), "cancelled", -1))) + // Query 2: one batch + RESULT_END, proving the dispatcher + // returned to idle after query 1 was drained. + req2 := m.readBinary(ctx) + reqID2, _, _ := parseQueryRequest(t, req2) + m.sendBinary(ctx, buildOneRowInt64Batch(t, reqID2, 0, "v", 99)) + m.sendBinary(ctx, writeQwpFrame(0, buildResultEndBody(reqID2, 0, 1))) + }) + defer cleanup() + + // Query 1: short-deadline ctx. The iterator's first takeEvent + // returns ctx.Err() because the server sends nothing. The body + // accepts the (nil, err) without breaking so we exit via the + // takeEvent-error return path (the branch that lacked the drain). + ctx1, cancel1 := context.WithTimeout(context.Background(), 200*time.Millisecond) + defer cancel1() + q1 := c.Query(ctx1, "SELECT 1") + var iter1Err error + var iter1Batches int + for _, err := range q1.Batches() { + if err != nil { + iter1Err = err + continue + } + iter1Batches++ + } + if iter1Err == nil { + t.Fatalf("expected ctx-cancel error from iter1, got nil") + } + if iter1Batches != 0 { + t.Fatalf("iter1 batches=%d, want 0", iter1Batches) + } + // No-op with the current bug; with the fix, already drained by + // the iterator's exit path. + q1.Close() + + // Query 2 must reach RESULT_END within a reasonable timeout. With + // the bug, the dispatcher is still stuck in receiveLoop for query + // 1 so this never produces a batch. 
+ ctx2, cancel2 := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel2() + q2 := c.Query(ctx2, "SELECT 2") + defer q2.Close() + var saw2 int64 + for b, err := range q2.Batches() { + if err != nil { + t.Fatalf("iter2 err: %v", err) + } + saw2 = b.Int64(0, 0) + } + if saw2 != 99 { + t.Errorf("saw2=%d, want 99 (dispatcher stranded on query 1?)", saw2) + } + if q2.TotalRows() != 1 { + t.Errorf("q2.TotalRows=%d, want 1", q2.TotalRows()) + } +} + +// TestQwpExecDrainAfterCtxExpiry is the Exec-side counterpart of the +// Batches drain test. Exec's takeEvent loop returns on ctx.Err +// without CANCEL + drain, leaving the dispatcher stuck on the +// unfinished server-side query. A subsequent Exec must still work +// once the first Exec has returned. +func TestQwpExecDrainAfterCtxExpiry(t *testing.T) { + c, cleanup := newMockQueryClient(t, 2, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + // Exec 1: wait for CANCEL, echo CANCELLED. + req1 := m.readBinary(ctx) + reqID1, _, _ := parseQueryRequest(t, req1) + for { + frame := m.readBinary(ctx) + if frame[0] == byte(qwpMsgKindCancel) { + break + } + } + m.sendBinary(ctx, writeQwpFrame(0, buildQueryErrorBody( + reqID1, byte(qwpStatusCancelled), "cancelled", -1))) + // Exec 2: EXEC_DONE to prove the dispatcher returned to idle. + req2 := m.readBinary(ctx) + reqID2, _, _ := parseQueryRequest(t, req2) + m.sendBinary(ctx, writeQwpFrame(0, buildExecDoneBody(reqID2, 0x07, 5))) + }) + defer cleanup() + + // Exec 1: short-deadline ctx → takeEvent returns ctx.Err(); Exec + // currently returns without cancelling/draining. + ctx1, cancel1 := context.WithTimeout(context.Background(), 200*time.Millisecond) + defer cancel1() + if _, err := c.Exec(ctx1, "INSERT INTO x VALUES (1)"); err == nil { + t.Fatalf("expected ctx error from Exec 1") + } + + // Exec 2 must complete. 
With the bug the dispatcher is still stuck + // on Exec 1, so Exec 2's takeEvent times out on ctx2. + ctx2, cancel2 := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel2() + res, err := c.Exec(ctx2, "INSERT INTO x VALUES (2)") + if err != nil { + t.Fatalf("Exec 2 err (dispatcher stranded?): %v", err) + } + if res.OpType != 0x07 || res.RowsAffected != 5 { + t.Errorf("Exec 2 result=%+v, want OpType=0x07 RowsAffected=5", res) + } +} + +// TestQwpQueryYieldPanicReleasesBufferAndDrains verifies that a panic +// raised inside the Batches() yield body does not permanently leak the +// current batch buffer or strand the dispatcher on the in-flight query. +// Without the panic-safe release + drain, bufferPoolSize=1 starves on +// the first panic and a follow-up Query deadlocks on the dispatcher +// still parked in receiveLoop for query 1. +func TestQwpQueryYieldPanicReleasesBufferAndDrains(t *testing.T) { + c, cleanup := newMockQueryClient(t, 1, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + // Query 1: one batch, wait for CANCEL, echo CANCELLED. + req1 := m.readBinary(ctx) + reqID1, _, _ := parseQueryRequest(t, req1) + m.sendBinary(ctx, buildOneRowInt64Batch(t, reqID1, 0, "v", 42)) + for { + frame := m.readBinary(ctx) + if frame[0] == byte(qwpMsgKindCancel) { + break + } + } + m.sendBinary(ctx, writeQwpFrame(0, buildQueryErrorBody( + reqID1, byte(qwpStatusCancelled), "cancelled", -1))) + // Query 2: one batch + RESULT_END. Proves the pool has buffers + // available and the dispatcher is idle. + req2 := m.readBinary(ctx) + reqID2, _, _ := parseQueryRequest(t, req2) + m.sendBinary(ctx, buildOneRowInt64Batch(t, reqID2, 0, "v", 99)) + m.sendBinary(ctx, writeQwpFrame(0, buildResultEndBody(reqID2, 0, 1))) + }) + defer cleanup() + + // Query 1: panic from inside the yield body. Recover the panic so + // the test survives, and let defer q1.Close() run on the way out. 
+ func() { + ctx1, cancel1 := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel1() + q1 := c.Query(ctx1, "SELECT 1") + defer q1.Close() + defer func() { + if r := recover(); r == nil { + t.Fatalf("expected panic from yield body") + } + }() + for _, err := range q1.Batches() { + if err != nil { + t.Fatalf("iter err: %v", err) + } + panic("boom") + } + }() + + // Query 2 must complete. With the bug: + // - bufferPoolSize=1 and the batch buffer from query 1 is never + // returned to the pool, so the dispatcher's handleResultBatch + // blocks forever waiting for a free buffer on the next batch. + // - even before that point, the dispatcher is still parked in + // receiveLoop for query 1 (no CANCEL was ever sent, no drain + // happened), so query 2's takeEvent never wakes. + ctx2, cancel2 := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel2() + q2 := c.Query(ctx2, "SELECT 2") + defer q2.Close() + var saw int64 + for b, err := range q2.Batches() { + if err != nil { + t.Fatalf("q2 err (dispatcher stranded?): %v", err) + } + saw = b.Int64(0, 0) + } + if saw != 99 { + t.Errorf("saw=%d, want 99", saw) + } + if q2.TotalRows() != 1 { + t.Errorf("q2.TotalRows=%d, want 1", q2.TotalRows()) + } +} + +// TestQwpQueryCloseIsNoOpWhileIterating verifies Close called from +// another goroutine while Batches() is in flight returns immediately +// and does not compete with the iterator for the dispatcher's single +// terminal event. Before the fix, Close's CAS guard only prevented +// double-close by the same caller; a concurrent Close and Batches +// both entered drainUntilTerminal, and whichever lost the race on the +// one terminal frame blocked until its cleanup ctx expired (5 s). 
+func TestQwpQueryCloseIsNoOpWhileIterating(t *testing.T) { + c, cleanup := newMockQueryClient(t, 2, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + // Query 1: send one batch, then block until CANCEL arrives so + // the iterator stays parked in takeEvent while the test + // invokes Close concurrently. + req1 := m.readBinary(ctx) + reqID1, _, _ := parseQueryRequest(t, req1) + m.sendBinary(ctx, buildOneRowInt64Batch(t, reqID1, 0, "v", 7)) + for { + frame := m.readBinary(ctx) + if frame[0] == byte(qwpMsgKindCancel) { + break + } + } + m.sendBinary(ctx, writeQwpFrame(0, buildQueryErrorBody( + reqID1, byte(qwpStatusCancelled), "cancelled", -1))) + // Query 2: RESULT_END only — proves the dispatcher returned + // to idle via the iterator's own drain path. + req2 := m.readBinary(ctx) + reqID2, _, _ := parseQueryRequest(t, req2) + m.sendBinary(ctx, writeQwpFrame(0, buildResultEndBody(reqID2, 0, 0))) + }) + defer cleanup() + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + q := c.Query(ctx, "SELECT 1") + + seen := make(chan int64, 1) + iterDone := make(chan struct{}) + go func() { + defer close(iterDone) + for b, err := range q.Batches() { + if err != nil { + return + } + seen <- b.Int64(0, 0) + } + }() + + // Wait until the iterator has yielded the first batch — it is + // now parked in takeEvent waiting on the next event. + select { + case v := <-seen: + if v != 7 { + t.Fatalf("seen=%d, want 7", v) + } + case <-time.After(2 * time.Second): + t.Fatal("iterator never yielded a batch") + } + + // Close must return quickly. With the bug it would race the + // iterator for the terminal event and block up to the 5 s + // cleanup timeout. 
+ closeReturned := make(chan struct{}) + go func() { + q.Close() + close(closeReturned) + }() + select { + case <-closeReturned: + case <-time.After(500 * time.Millisecond): + t.Fatal("Close blocked while Batches iteration in flight") + } + + // The iterator is still parked. Cancel() triggers the server's + // CANCELLED echo, which the iterator swallows and exits cleanly. + q.Cancel() + select { + case <-iterDone: + case <-time.After(2 * time.Second): + t.Fatal("iterator did not end after Cancel") + } + + // Follow-up Query must complete — the dispatcher is idle because + // the iterator (not the racing Close) drained to the terminal + // frame. + q2 := c.Query(ctx, "SELECT 2") + defer q2.Close() + for _, err := range q2.Batches() { + if err != nil { + t.Fatalf("q2 err (dispatcher stranded?): %v", err) + } + } +} + +// --- Bind parameter tests --- + +// parseQueryRequestWithBinds parses a client-sent QUERY_REQUEST and +// returns the bind count plus the raw bind payload bytes, in addition +// to the usual tuple. Tests that exercise WithQwpQueryBinds assert against +// this richer view. 
+func parseQueryRequestWithBinds(t *testing.T, frame []byte) (int64, string, int64, int, []byte) { + t.Helper() + if len(frame) < 1+8 { + t.Fatalf("QUERY_REQUEST frame too short: %d", len(frame)) + } + if kind := frame[0]; kind != byte(qwpMsgKindQueryRequest) { + t.Fatalf("expected msg_kind 0x10, got 0x%02X", kind) + } + p := 1 + requestId := int64(binary.LittleEndian.Uint64(frame[p:])) + p += 8 + sqlLen, n, err := qwpReadVarint(frame[p:]) + if err != nil { + t.Fatalf("bad sql_len varint: %v", err) + } + p += n + sql := string(frame[p : p+int(sqlLen)]) + p += int(sqlLen) + credit, n, err := qwpReadVarint(frame[p:]) + if err != nil { + t.Fatalf("bad credit varint: %v", err) + } + p += n + bindCount, n, err := qwpReadVarint(frame[p:]) + if err != nil { + t.Fatalf("bad bind_count varint: %v", err) + } + p += n + return requestId, sql, int64(credit), int(bindCount), frame[p:] +} + +// TestQwpQueryWithBindsWiresBindPayload sends a query with mixed-type +// binds and asserts the server sees the pre-encoded bind bytes along +// with a matching bind_count. +func TestQwpQueryWithBindsWiresBindPayload(t *testing.T) { + const wantSQL = "SELECT * FROM trades WHERE sym = $1 AND price >= $2 AND ts >= $3 LIMIT 1000" + var gotFrame []byte + c, cleanup := newMockQueryClient(t, 2, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + gotFrame = append(gotFrame, m.readBinary(ctx)...) + reqID, _, _, _, _ := parseQueryRequestWithBinds(t, gotFrame) + m.sendBinary(ctx, writeQwpFrame(0, buildResultEndBody(reqID, 0, 0))) + }) + defer cleanup() + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + q := c.Query(ctx, wantSQL, WithQwpQueryBinds(func(b *QwpBinds) { + b.VarcharBind(0, "AAPL"). + DoubleBind(1, 100.0). 
+ TimestampMicrosBind(2, 1_700_000_000_000_000) + })) + defer q.Close() + for _, err := range q.Batches() { + if err != nil { + t.Fatalf("iterator error: %v", err) + } + } + + _, sql, _, bindCount, bindPayload := parseQueryRequestWithBinds(t, gotFrame) + if sql != wantSQL { + t.Errorf("sql=%q, want %q", sql, wantSQL) + } + if bindCount != 3 { + t.Fatalf("bind_count=%d, want 3", bindCount) + } + + // Build the expected bind payload by running the same encoder + // against a fresh QwpBinds instance. This way the expected bytes + // live in exactly one place (the production encoder) and the test + // asserts only the wiring, not the encoding. + var expected QwpBinds + expected.VarcharBind(0, "AAPL"). + DoubleBind(1, 100.0). + TimestampMicrosBind(2, 1_700_000_000_000_000) + if !bytes.Equal(bindPayload, expected.bufferBytes()) { + t.Fatalf("bind payload mismatch:\n got: % x\nwant: % x", + bindPayload, expected.bufferBytes()) + } +} + +// TestQwpQueryWithBindsEmpty verifies a query with zero-argument binds +// (user passed WithQwpQueryBinds with no setter calls) sends bind_count=0 +// and an empty bind payload — equivalent to not using WithQwpQueryBinds +// at all. +func TestQwpQueryWithBindsEmpty(t *testing.T) { + var gotFrame []byte + c, cleanup := newMockQueryClient(t, 2, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + gotFrame = append(gotFrame, m.readBinary(ctx)...) 
+ reqID, _, _, _, _ := parseQueryRequestWithBinds(t, gotFrame) + m.sendBinary(ctx, writeQwpFrame(0, buildResultEndBody(reqID, 0, 0))) + }) + defer cleanup() + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + q := c.Query(ctx, "SELECT 1", WithQwpQueryBinds(func(b *QwpBinds) {})) + defer q.Close() + for _, err := range q.Batches() { + if err != nil { + t.Fatalf("iterator error: %v", err) + } + } + + _, _, _, bindCount, bindPayload := parseQueryRequestWithBinds(t, gotFrame) + if bindCount != 0 { + t.Errorf("bind_count=%d, want 0", bindCount) + } + if len(bindPayload) != 0 { + t.Errorf("bind payload should be empty, got % x", bindPayload) + } +} + +// TestQwpQueryWithBindsSurfacesEncodingError verifies a bind setter +// that produces a latched QwpBinds error (e.g. out-of-order index) +// fails the query through the iterator's first yield, without sending +// a QUERY_REQUEST to the server. +func TestQwpQueryWithBindsSurfacesEncodingError(t *testing.T) { + done := make(chan struct{}) + c, cleanup := newMockQueryClient(t, 2, func(m *qwpMockEgressConn) { + // Server should never see a frame for a bind-failing query. 
+ ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) + defer cancel() + _, _, err := m.conn.Read(ctx) + if err == nil { + t.Errorf("server received a frame despite bind error") + } + close(done) + }) + defer cleanup() + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + q := c.Query(ctx, "SELECT 1", WithQwpQueryBinds(func(b *QwpBinds) { + b.LongBind(0, 1) + b.LongBind(5, 2) // out-of-order + })) + defer q.Close() + + var sawErr error + for _, err := range q.Batches() { + if err != nil { + sawErr = err + break + } + } + if sawErr == nil { + t.Fatal("expected bind error to surface via Batches") + } + if !strings.Contains(sawErr.Error(), "out of order") { + t.Fatalf("unexpected error: %v", sawErr) + } + <-done +} + +// TestQwpExecWithBinds verifies WithQwpQueryBinds is plumbed through Exec, +// not just Query. Drives an EXEC_DONE against a bind-bearing UPDATE- +// style request. +func TestQwpExecWithBinds(t *testing.T) { + var gotFrame []byte + c, cleanup := newMockQueryClient(t, 2, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + gotFrame = append(gotFrame, m.readBinary(ctx)...) 
+ reqID, _, _, _, _ := parseQueryRequestWithBinds(t, gotFrame) + m.sendBinary(ctx, writeQwpFrame(0, buildExecDoneBody(reqID, 0x01, 42))) + }) + defer cleanup() + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + res, err := c.Exec(ctx, "UPDATE trades SET price = $1 WHERE sym = $2", + WithQwpQueryBinds(func(b *QwpBinds) { + b.DoubleBind(0, 200.5).VarcharBind(1, "MSFT") + })) + if err != nil { + t.Fatalf("Exec: %v", err) + } + if res.RowsAffected != 42 { + t.Errorf("RowsAffected=%d, want 42", res.RowsAffected) + } + + _, _, _, bindCount, bindPayload := parseQueryRequestWithBinds(t, gotFrame) + if bindCount != 2 { + t.Fatalf("bind_count=%d, want 2", bindCount) + } + var expected QwpBinds + expected.DoubleBind(0, 200.5).VarcharBind(1, "MSFT") + if !bytes.Equal(bindPayload, expected.bufferBytes()) { + t.Fatalf("bind payload mismatch:\n got: % x\nwant: % x", + bindPayload, expected.bufferBytes()) + } +} + +// TestQwpQueryBindsResetAcrossCalls verifies the per-client bind +// scratch is reset between calls — a second query with fewer binds +// must not accidentally include the prior call's trailing bytes. +func TestQwpQueryBindsResetAcrossCalls(t *testing.T) { + frames := make(chan []byte, 2) + c, cleanup := newMockQueryClient(t, 2, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + for i := 0; i < 2; i++ { + f := m.readBinary(ctx) + frames <- f + reqID, _, _, _, _ := parseQueryRequestWithBinds(t, f) + m.sendBinary(ctx, writeQwpFrame(0, buildResultEndBody(reqID, 0, 0))) + } + }) + defer cleanup() + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + // First query has 3 binds. 
+ q1 := c.Query(ctx, "SELECT 1", WithQwpQueryBinds(func(b *QwpBinds) { + b.LongBind(0, 1).LongBind(1, 2).LongBind(2, 3) + })) + for _, err := range q1.Batches() { + if err != nil { + t.Fatalf("q1 err: %v", err) + } + } + q1.Close() + + // Second query has 1 bind — must not carry over the first two longs. + q2 := c.Query(ctx, "SELECT 2", WithQwpQueryBinds(func(b *QwpBinds) { + b.IntBind(0, 99) + })) + for _, err := range q2.Batches() { + if err != nil { + t.Fatalf("q2 err: %v", err) + } + } + q2.Close() + + close(frames) + got := make([][]byte, 0, 2) + for f := range frames { + got = append(got, f) + } + if len(got) != 2 { + t.Fatalf("expected 2 frames, got %d", len(got)) + } + _, _, _, count1, payload1 := parseQueryRequestWithBinds(t, got[0]) + _, _, _, count2, payload2 := parseQueryRequestWithBinds(t, got[1]) + if count1 != 3 { + t.Errorf("q1 bind_count=%d, want 3", count1) + } + if count2 != 1 { + t.Errorf("q2 bind_count=%d, want 1", count2) + } + + var wantPayload2 QwpBinds + wantPayload2.IntBind(0, 99) + if !bytes.Equal(payload2, wantPayload2.bufferBytes()) { + t.Fatalf("q2 payload mismatch (possible carry-over from q1):\n got: % x\nwant: % x", + payload2, wantPayload2.bufferBytes()) + } + // Sanity: payload1 must not be a prefix/subset of payload2 (i.e., they + // encode different things). 
+ if bytes.Contains(payload2, payload1) { + t.Fatalf("q2 payload contains q1 payload — scratch not reset") + } +} diff --git a/qwp_query_conf.go b/qwp_query_conf.go new file mode 100644 index 00000000..ab45e164 --- /dev/null +++ b/qwp_query_conf.go @@ -0,0 +1,628 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "fmt" + "strconv" + "strings" + "time" +) + +// qwpQueryClientConfig is the internal configuration of QwpQueryClient. +// Populated either by functional options (NewQwpQueryClient) or by the +// ws:: / wss:: config-string parser (QwpQueryClientFromConf). The +// options surface is deliberately smaller than the ingest LineSender's +// — QWP egress has its own concerns (buffer pool depth, max batch +// rows) and does not inherit ILP-era knobs. +type qwpQueryClientConfig struct { + // endpoints is the ordered list of WebSocket endpoints the connect + // walk attempts. The first matching the target= filter wins; + // transient failures during the walk skip to the next entry. 
The + // failover orchestrator reuses the same list for reconnect. + // Default is one entry pointing at defaultHttpAddress. + endpoints []qwpEndpoint + // endpointPath is the HTTP path used for the WebSocket upgrade. + // Default "/read/v1". + endpointPath string + // authorization, when non-empty, is sent verbatim as the + // Authorization HTTP header. Mutually exclusive with user/pass and + // token. + authorization string + // httpUser / httpPass populate an HTTP Basic Authorization header + // at connect time. Mutually exclusive with authorization and token. + httpUser string + httpPass string + // httpToken populates a Bearer Authorization header at connect + // time. Mutually exclusive with authorization and user/pass. + httpToken string + // clientID overrides the default X-QWP-Client-Id header. Empty + // uses the module default (qwpClientId). + clientID string + // bufferPoolSize is the depth of the decode buffer pool. Default + // qwpDefaultEgressBufferPoolSize. Must be >= 1. + bufferPoolSize int + // maxBatchRows caps per-batch row count the server emits. 0 omits + // the X-QWP-Max-Batch-Rows header and lets the server use its cap. + maxBatchRows int + // initialCredit is the egress flow-control budget. 0 = unbounded + // (no CREDIT bookkeeping). A positive value streams at most N + // bytes before the server parks; the client auto-replenishes as + // the consumer releases each batch. + initialCredit int64 + // compression is the preference the client advertises on the + // upgrade handshake. One of "raw", "zstd", "auto". Default "raw" + // matches Java's library default — no compression, no handshake + // header, no server-side encode cost. "zstd" asks for zstd first + // and falls back to raw; "auto" advertises both and lets the + // server pick. + compression string + // compressionLevel is the zstd level hint sent via the accept- + // encoding header. Ignored when compression == "raw". 
Clamped + // server-side to [1, 9]; client accepts [1, 22] matching Java. + compressionLevel int + // tlsMode mirrors lineSenderConfig's three-valued TLS state. + tlsMode tlsMode + + // target constrains the connect walk by SERVER_INFO.role. Default + // is qwpTargetAny, which accepts any role and so needs no role + // byte. qwpTargetPrimary and qwpTargetReplica do: if the client + // does not consume SERVER_INFO (serverInfoTimeout disabled) or the + // server sends no parseable frame, the role is unknown and the + // filter cannot be evaluated. + target QwpTargetFilter + // zone is the client's opaque, case-insensitive locality hint + // (failover.md §1.1). When set and target != primary, the host + // tracker prefers endpoints whose server-advertised zone_id + // (SERVER_INFO.zone_id under CAP_ZONE, or the X-QuestDB-Zone + // header on a 421 reject) matches, via the (state, zone) priority + // lattice. Empty (the default) collapses every host to the Same + // tier, i.e. zone-blind selection. Shared verbatim with the + // ingest connect string, where it is accepted-but-inert (the + // ingestion path does not route by zone). + zone string + // authTimeoutMs is the failover.md §1.1 per-host upper bound on + // the HTTP upgrade response read (the wait between writing the + // upgrade request and reading the response headers). It does NOT + // cover TCP connect, TLS handshake, or the post-upgrade + // SERVER_INFO frame read (that uses serverInfoTimeout). Default + // qwpDefaultAuthTimeoutMs (15_000); must be > 0. + authTimeoutMs int + // failoverEnabled toggles transparent reconnect-and-replay on + // transport-terminal failure mid-query. Default true; matches + // Java's failover=on default. When false, transport errors + // surface directly through Batches() / Exec(). + failoverEnabled bool + // failoverMaxAttempts caps the number of executeOnce invocations + // per Query / Exec. Counts the initial attempt plus every + // reconnect retry. 
Default qwpDefaultFailoverMaxAttempts. + failoverMaxAttempts int + // failoverBackoffInitial is the initial sleep between reconnect + // attempts. Doubled on each subsequent attempt up to + // failoverBackoffMax. Default qwpDefaultFailoverInitialBackoff. + failoverBackoffInitial time.Duration + // failoverBackoffMax caps the exponential backoff. Default + // qwpDefaultFailoverMaxBackoff. + failoverBackoffMax time.Duration + // failoverMaxDuration is the total wall-clock cap on the per- + // Query/Exec failover loop. Whichever of this or + // failoverMaxAttempts fires first ends the loop. 0 disables the + // time cap (failover then bounded only by attempts). Default + // qwpDefaultFailoverMaxDuration; matches Java's + // DEFAULT_FAILOVER_MAX_DURATION_MS. + failoverMaxDuration time.Duration + // serverInfoTimeout bounds the synchronous read of SERVER_INFO + // after each upgrade. The server always emits SERVER_INFO as the + // first post-upgrade frame, so the drain is mandatory on egress; + // must be > 0. Default qwpDefaultServerInfoTimeout. + serverInfoTimeout time.Duration + // replayExec opts Exec into transparent replay on transport- + // terminal failures. Default false — non-idempotent statements + // (INSERT/UPDATE/DELETE/DDL) might double-execute on a transport + // drop after the server applied the statement. Callers that know + // their statements are idempotent can opt in via + // WithQwpQueryReplayExec(true). + replayExec bool +} + +// qwpCompressionRaw / qwpCompressionZstd / qwpCompressionAuto are the +// three valid values for qwpQueryClientConfig.compression. "raw" is +// the library default: omits the accept-encoding header entirely so +// servers that do not know about compression see an unchanged +// handshake. 
+const ( + qwpCompressionRaw = "raw" + qwpCompressionZstd = "zstd" + qwpCompressionAuto = "auto" +) + +// qwpDefaultCompressionLevel matches Java QwpQueryClient's compression +// level default (Sender.java compressionLevel field = 1; see also +// connect-string.md §Query client keys: "Default `1` — the cheapest +// server-side CPU"). Only relevant when compression != "raw". +const qwpDefaultCompressionLevel = 1 + +// qwpDefaultEgressBufferPoolSize is the I/O decode pool depth when the +// caller hasn't overridden it. Matches the Java client default +// (DEFAULT_IO_BUFFER_POOL_SIZE = 4): four slots let the dispatcher keep +// decoding ~4 batches ahead of a slow consumer before the buffer pool +// drains and back-pressures the WebSocket via the TCP window. +const qwpDefaultEgressBufferPoolSize = 4 + +// Failover defaults — match Java QwpQueryClient.DEFAULT_FAILOVER_*. +const ( + // qwpDefaultFailoverMaxAttempts is the cap on executeOnce + // invocations per Query/Exec call. Counts the initial attempt + // plus every reconnect retry. Java's DEFAULT_FAILOVER_MAX_ATTEMPTS + // = 8. + qwpDefaultFailoverMaxAttempts = 8 + // qwpDefaultFailoverInitialBackoff is the initial sleep between + // reconnect attempts; doubled per retry up to + // qwpDefaultFailoverMaxBackoff. Java's + // DEFAULT_FAILOVER_INITIAL_BACKOFF_MS = 50. + qwpDefaultFailoverInitialBackoff = 50 * time.Millisecond + // qwpDefaultFailoverMaxBackoff caps the exponential backoff. + // Java's DEFAULT_FAILOVER_MAX_BACKOFF_MS = 1000. + qwpDefaultFailoverMaxBackoff = 1 * time.Second + // qwpDefaultFailoverMaxDuration is the total wall-clock cap on the + // per-Query/Exec failover loop; 0 would disable the cap. Java's + // DEFAULT_FAILOVER_MAX_DURATION_MS = 30_000. + qwpDefaultFailoverMaxDuration = 30 * time.Second + // qwpDefaultServerInfoTimeout bounds the synchronous SERVER_INFO + // read after the upgrade. Java's DEFAULT_SERVER_INFO_TIMEOUT_MS = + // 5000. 
+ qwpDefaultServerInfoTimeout = 5 * time.Second + // qwpDefaultAuthTimeoutMs is the per-host upgrade-response-read + // bound when the caller hasn't overridden it. failover.md §1.1 + // default (15_000); matches the ingest sender default and Java's + // DEFAULT_AUTH_TIMEOUT_MS so a shared connect string behaves + // identically on both clients. + qwpDefaultAuthTimeoutMs = 15_000 +) + +// qwpQueryDefaultConfig returns the zero-arg default config. Used as +// the seed for both the functional-options path and the config-string +// path. Seeds endpoints with a single entry pointing at the local +// QuestDB default; functional options or addr= override it. +func qwpQueryDefaultConfig() *qwpQueryClientConfig { + return &qwpQueryClientConfig{ + endpoints: []qwpEndpoint{{host: "127.0.0.1", port: qwpDefaultPort}}, + endpointPath: qwpReadPath, // "/read/v1" + bufferPoolSize: qwpDefaultEgressBufferPoolSize, + compression: qwpCompressionRaw, + compressionLevel: qwpDefaultCompressionLevel, + target: qwpTargetAny, + failoverEnabled: true, + failoverMaxAttempts: qwpDefaultFailoverMaxAttempts, + failoverBackoffInitial: qwpDefaultFailoverInitialBackoff, + failoverBackoffMax: qwpDefaultFailoverMaxBackoff, + failoverMaxDuration: qwpDefaultFailoverMaxDuration, + serverInfoTimeout: qwpDefaultServerInfoTimeout, + authTimeoutMs: qwpDefaultAuthTimeoutMs, + } +} + +// buildAcceptEncodingHeader translates the user's compression +// preference into the X-QWP-Accept-Encoding header value. "raw" +// returns an empty string so the transport omits the header entirely +// (Java parity — servers that pre-date egress compression see an +// unchanged handshake). "zstd" and "auto" both advertise +// "zstd;level=N,raw"; the server picks one. Mirrors Java's +// QwpQueryClient.buildAcceptEncodingHeader. 
+func (c *qwpQueryClientConfig) buildAcceptEncodingHeader() string { + if c.compression == qwpCompressionRaw { + return "" + } + return fmt.Sprintf("zstd;level=%d,raw", c.compressionLevel) +} + +// validate is the single-source sanity gate shared by both config entry +// points. Runs after options/conf-string parsing and before any network +// I/O. Mirrors Java QwpQueryClient.fromConfig's final cross-field +// checks (mutually-exclusive auth modes, TLS-only roots keys, bufferPool +// >= 1) plus the host-required check pushed into the Go parser. +func (c *qwpQueryClientConfig) validate() error { + if len(c.endpoints) == 0 { + return fmt.Errorf("qwp query: no endpoints configured") + } + for i, ep := range c.endpoints { + if ep.host == "" { + return fmt.Errorf("qwp query: endpoint %d has empty host", i) + } + if ep.port < 1 || ep.port > 65535 { + return fmt.Errorf("qwp query: endpoint %d port %d out of range [1, 65535]", + i, ep.port) + } + } + if c.endpointPath == "" { + return fmt.Errorf("qwp query: endpoint path is empty") + } + if c.bufferPoolSize < 1 { + return fmt.Errorf("qwp query: buffer pool size must be >= 1, got %d", c.bufferPoolSize) + } + if c.maxBatchRows < 0 { + return fmt.Errorf("qwp query: max batch rows must be >= 0, got %d", c.maxBatchRows) + } + if c.maxBatchRows > qwpMaxRowsPerBatch { + return fmt.Errorf("qwp query: max batch rows %d exceeds client cap %d", + c.maxBatchRows, qwpMaxRowsPerBatch) + } + if c.initialCredit < 0 { + return fmt.Errorf("qwp query: initial credit must be >= 0, got %d", c.initialCredit) + } + switch c.compression { + case qwpCompressionRaw, qwpCompressionZstd, qwpCompressionAuto: + // ok + default: + return fmt.Errorf( + "qwp query: unsupported compression %q (expected raw, zstd, or auto)", + c.compression) + } + if c.compressionLevel < 1 || c.compressionLevel > 22 { + return fmt.Errorf( + "qwp query: compression level must be in [1, 22], got %d", + c.compressionLevel) + } + basicSet := c.httpUser != "" || c.httpPass != "" 
+ authModes := 0 + if c.authorization != "" { + authModes++ + } + if basicSet { + authModes++ + } + if c.httpToken != "" { + authModes++ + } + if authModes > 1 { + return fmt.Errorf("qwp query: auth, username/password, and token are mutually exclusive") + } + if basicSet && (c.httpUser == "" || c.httpPass == "") { + return fmt.Errorf("qwp query: both username and password must be provided together") + } + if c.failoverMaxAttempts < 1 { + return fmt.Errorf( + "qwp query: failover_max_attempts must be >= 1, got %d", c.failoverMaxAttempts) + } + if c.failoverBackoffInitial < 0 { + return fmt.Errorf( + "qwp query: failover_backoff_initial must be >= 0, got %v", + c.failoverBackoffInitial) + } + if c.failoverBackoffMax < 0 { + return fmt.Errorf( + "qwp query: failover_backoff_max must be >= 0, got %v", + c.failoverBackoffMax) + } + if c.failoverBackoffMax < c.failoverBackoffInitial { + return fmt.Errorf( + "qwp query: failover_backoff_max (%v) must be >= failover_backoff_initial (%v)", + c.failoverBackoffMax, c.failoverBackoffInitial) + } + if c.failoverMaxDuration < 0 { + return fmt.Errorf( + "qwp query: failover_max_duration must be >= 0, got %v", + c.failoverMaxDuration) + } + if c.serverInfoTimeout <= 0 { + return fmt.Errorf( + "qwp query: server_info_timeout must be > 0, got %v", c.serverInfoTimeout) + } + if c.authTimeoutMs <= 0 { + return fmt.Errorf( + "qwp query: auth_timeout_ms must be > 0, got %d", c.authTimeoutMs) + } + if c.target > qwpTargetReplica { + return fmt.Errorf("qwp query: invalid target %d (expected any, primary, or replica)", + byte(c.target)) + } + return nil +} + +// addressString returns a comma-joined "host:port,..." form of the +// configured endpoints. Used by error messages and tests; not part of +// the public API. 
+func (c *qwpQueryClientConfig) addressString() string { + parts := make([]string, 0, len(c.endpoints)) + for _, ep := range c.endpoints { + parts = append(parts, ep.String()) + } + return strings.Join(parts, ",") +} + +// splitQwpHostPort splits a single host[:port] entry. Returns the host +// (with surrounding brackets stripped, if any), the port string (empty +// when no port was supplied), and a structural error for malformed +// bracketed forms. The port string is returned untrimmed so the caller +// can produce a useful error message; numeric validation happens in +// parseEndpointList. +// +// Forms accepted: +// - "host" — bare host, port defaults to qwpDefaultPort +// - "host:port" — explicit port; validated against [1, 65535] +// - "[ipv6]:port" — bracketed IPv6 with port +// - "[ipv6]" — bracketed IPv6 without port +// - "ipv6::with::colons" — bare IPv6 (>=2 colons unbracketed) +// +// Rejected (by parseEndpointList using these errors): +// - empty string +// - empty bracketed host: "[]:port" +// - missing closing ']': "[::1:9000" +// - trailing garbage after ']': "[::1]9000" +// - port out of [1, 65535] +// - non-numeric port +func splitQwpHostPort(s string) (host, port string, err error) { + if strings.HasPrefix(s, "[") { + end := strings.IndexByte(s, ']') + if end < 0 { + return "", "", fmt.Errorf("missing closing ']' in IPv6 address") + } + host = s[1:end] + rest := s[end+1:] + switch { + case rest == "": + return host, "", nil + case rest[0] == ':': + return host, rest[1:], nil + default: + return "", "", fmt.Errorf("expected ':' after ']' in IPv6 address") + } + } + // No brackets: count colons. + colons := strings.Count(s, ":") + switch colons { + case 0: + return s, "", nil + case 1: + i := strings.IndexByte(s, ':') + return s[:i], s[i+1:], nil + default: + // Multi-colon, unbracketed → bare IPv6 host without port. + // A custom port on IPv6 requires brackets. 
+ return s, "", nil + } +} + +// parseQwpQueryConf parses a ws:: / wss:: config string into a +// qwpQueryClientConfig. The supported key set mirrors Java +// QwpQueryClient.fromConfig, except tls_roots / tls_roots_password, +// which aren't supported. +func parseQwpQueryConf(conf string) (*qwpQueryClientConfig, error) { + data, err := parseConfigStr(conf) + if err != nil { + return nil, err + } + cfg := qwpQueryDefaultConfig() + switch data.Schema { + case "ws", "qwpws": + // connect-string.md §Protocols and transports: qwpws / + // qwpwss are long-form aliases for ws / wss. + cfg.tlsMode = tlsDisabled + case "wss", "qwpwss": + cfg.tlsMode = tlsEnabled + default: + return nil, NewInvalidConfigStrError( + "invalid schema %q, expected ws, wss, qwpws, or qwpwss", data.Schema) + } + tlsVerifySet := false + + for k, v := range data.KeyValuePairs { + switch k { + case "addr": + eps, err := parseEndpointList(v, qwpDefaultPort) + if err != nil { + return nil, NewInvalidConfigStrError("%v", err) + } + cfg.endpoints = eps + case "path": + cfg.endpointPath = v + case "auth": + cfg.authorization = v + case "username": + cfg.httpUser = v + case "password": + cfg.httpPass = v + case "token": + cfg.httpToken = v + case "client_id": + cfg.clientID = v + case "buffer_pool_size": + n, err := strconv.Atoi(v) + if err != nil { + return nil, NewInvalidConfigStrError("invalid buffer_pool_size %q: %v", v, err) + } + cfg.bufferPoolSize = n + case "max_batch_rows": + n, err := strconv.Atoi(v) + if err != nil { + return nil, NewInvalidConfigStrError("invalid max_batch_rows %q: %v", v, err) + } + cfg.maxBatchRows = n + case "initial_credit": + n, err := strconv.ParseInt(v, 10, 64) + if err != nil { + return nil, NewInvalidConfigStrError("invalid initial_credit %q: %v", v, err) + } + cfg.initialCredit = n + case "compression": + switch v { + case qwpCompressionRaw, qwpCompressionZstd, qwpCompressionAuto: + cfg.compression = v + default: + return nil, NewInvalidConfigStrError( + "invalid 
compression %q, expected raw, zstd, or auto", v) + } + case "compression_level": + n, err := strconv.Atoi(v) + if err != nil { + return nil, NewInvalidConfigStrError( + "invalid compression_level %q: %v", v, err) + } + cfg.compressionLevel = n + case "tls_verify": + switch v { + case "on": + cfg.tlsMode = tlsEnabled + case "unsafe_off": + cfg.tlsMode = tlsInsecureSkipVerify + default: + return nil, NewInvalidConfigStrError( + "invalid tls_verify %q, expected on or unsafe_off", v) + } + tlsVerifySet = true + case "tls_roots": + return nil, NewInvalidConfigStrError("tls_roots is not available in the go client") + case "tls_roots_password": + return nil, NewInvalidConfigStrError("tls_roots_password is not available in the go client") + case "target": + t, err := parseTargetFilter(v) + if err != nil { + return nil, NewInvalidConfigStrError("%v", err) + } + cfg.target = t + case "zone": + // Opaque locality hint (failover.md §1.1). Stored verbatim; + // the host tracker lowercases for case-insensitive compare. + // Accepted here so a single connect string can be shared + // with the ingest client (where the same key is + // accepted-but-inert). 
+ cfg.zone = v + case "auth_timeout_ms": + n, err := strconv.Atoi(v) + if err != nil { + return nil, NewInvalidConfigStrError( + "invalid auth_timeout_ms %q: %v", v, err) + } + if n <= 0 { + return nil, NewInvalidConfigStrError( + "auth_timeout_ms must be > 0, got %d", n) + } + cfg.authTimeoutMs = n + case "failover": + switch v { + case "on": + cfg.failoverEnabled = true + case "off": + cfg.failoverEnabled = false + default: + return nil, NewInvalidConfigStrError( + "invalid failover %q, expected on or off", v) + } + case "failover_max_attempts": + n, err := strconv.Atoi(v) + if err != nil { + return nil, NewInvalidConfigStrError( + "invalid failover_max_attempts %q: %v", v, err) + } + if n < 1 { + return nil, NewInvalidConfigStrError( + "failover_max_attempts must be >= 1, got %d", n) + } + cfg.failoverMaxAttempts = n + case "failover_backoff_initial_ms": + n, err := strconv.Atoi(v) + if err != nil { + return nil, NewInvalidConfigStrError( + "invalid failover_backoff_initial_ms %q: %v", v, err) + } + if n < 0 { + return nil, NewInvalidConfigStrError( + "failover_backoff_initial_ms must be >= 0, got %d", n) + } + cfg.failoverBackoffInitial = time.Duration(n) * time.Millisecond + case "failover_backoff_max_ms": + n, err := strconv.Atoi(v) + if err != nil { + return nil, NewInvalidConfigStrError( + "invalid failover_backoff_max_ms %q: %v", v, err) + } + if n < 0 { + return nil, NewInvalidConfigStrError( + "failover_backoff_max_ms must be >= 0, got %d", n) + } + cfg.failoverBackoffMax = time.Duration(n) * time.Millisecond + case "failover_max_duration_ms": + n, err := strconv.Atoi(v) + if err != nil { + return nil, NewInvalidConfigStrError( + "invalid failover_max_duration_ms %q: %v", v, err) + } + if n < 0 { + return nil, NewInvalidConfigStrError( + "failover_max_duration_ms must be >= 0, got %d", n) + } + cfg.failoverMaxDuration = time.Duration(n) * time.Millisecond + case "server_info_timeout_ms": + n, err := strconv.Atoi(v) + if err != nil { + return nil, 
NewInvalidConfigStrError( + "invalid server_info_timeout_ms %q: %v", v, err) + } + if n <= 0 { + return nil, NewInvalidConfigStrError( + "server_info_timeout_ms must be > 0, got %d", n) + } + cfg.serverInfoTimeout = time.Duration(n) * time.Millisecond + case "replay_exec": + switch v { + case "on": + cfg.replayExec = true + case "off": + cfg.replayExec = false + default: + return nil, NewInvalidConfigStrError( + "invalid replay_exec %q, expected on or off", v) + } + default: + if ingressOnlyKeys[k] { + // Silently accepted on egress so a single ws:: / wss:: + // connect string can drive both Sender and + // QwpQueryClient. The QwpQueryClient does not + // interpret the value — range/enum/type checks run on + // the ingress side (conf_parse.go). + // connect-string.md §16-20 is the load-bearing spec + // text. + continue + } + return nil, NewInvalidConfigStrError("unsupported option %q", k) + } + } + + // tls_verify gates the TLS handshake; only meaningful on wss/qwpwss. + if tlsVerifySet && (data.Schema == "ws" || data.Schema == "qwpws") { + return nil, NewInvalidConfigStrError("tls_verify requires the wss:: schema") + } + + // Wrap validate's plain errors as *InvalidConfigStrError so a caller + // that came in via the conf-string path sees one consistent error + // type — both the per-key parse errors above and the cross-field + // validation errors below. The functional-options path + // (NewQwpQueryClient) calls validate() directly and keeps the plain + // fmt.Errorf form, where "config string" framing would be wrong. 
+ if err := cfg.validate(); err != nil { + return nil, NewInvalidConfigStrError("%v", err) + } + return cfg, nil +} diff --git a/qwp_query_decoder.go b/qwp_query_decoder.go new file mode 100644 index 00000000..b7bff432 --- /dev/null +++ b/qwp_query_decoder.go @@ -0,0 +1,1299 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "encoding/binary" + "fmt" + "unsafe" + + // Pure-Go zstd via klauspost/compress. + // Future option for higher throughput: github.com/valyala/gozstd (cgo + // wrapper around libzstd; ~1.5-2x faster decompression at the cost of + // requiring a C toolchain and making cross-compilation harder). + "github.com/klauspost/compress/zstd" +) + +// qwpZstdMaxDecompressedSize caps the decompressed payload of a single +// RESULT_BATCH frame. Mirrors Java QwpResultBatchDecoder.MAX_SCRATCH +// (64 MiB). The decoder reads the zstd frame header's content-size +// field up front and rejects anything larger — this both short-circuits +// obvious bombs and lets us size the scratch in one allocation. 
const qwpZstdMaxDecompressedSize = 64 << 20 // 64 MiB

// qwpZstdMinScratchGrow is the smallest step by which the per-batch
// zstd scratch buffer grows (1 MiB). It matches Java's MIN_SCRATCH and
// keeps a burst of small batches from reallocating on every frame.
const qwpZstdMinScratchGrow = 1 << 20

// Op-type codes that ExecResult.OpType can be compared against. They
// mirror the server's CompiledQuery.TYPE_* discriminators for the
// statements an EXEC_DONE frame commonly reports. OpType carries the
// raw server byte, so a rarer statement may report a value that is not
// listed here. There is intentionally no SELECT entry: a SELECT
// streams RESULT_BATCH frames and never produces an EXEC_DONE.
const (
	QwpOpTypeInsert              byte = 2
	QwpOpTypeTruncate            byte = 3
	QwpOpTypeAlter               byte = 4
	QwpOpTypeDrop                byte = 7
	QwpOpTypeCreateTable         byte = 9
	QwpOpTypeInsertAsSelect      byte = 10
	QwpOpTypeRenameTable         byte = 12
	QwpOpTypeUpdate              byte = 14
	QwpOpTypeCreateTableAsSelect byte = 21
)

// ExecResult reports how a non-SELECT statement (DDL / INSERT /
// UPDATE / ...) sent over the QWP egress protocol turned out. Its
// fields correspond to the body of an EXEC_DONE frame.
type ExecResult struct {
	// OpType is the server's CompiledQuery.TYPE_* discriminator for
	// the statement that ran, which lets a caller tell INSERT from
	// UPDATE from pure DDL. Compare it with the QwpOpType* constants;
	// a value outside that list is still a valid raw server byte.
	OpType byte

	// RowsAffected counts the rows the statement modified. Pure DDL
	// reports 0.
	RowsAffected int64
}

// qwpConnDict is the connection-scoped symbol dictionary. The server
// sends a delta section at the head of every RESULT_BATCH listing
// symbols assigned since the previous batch; the decoder appends them
// to the heap + entries arrays here. Subsequent batches refer to
// prior dictionary ids without retransmitting the strings.
//
// Within a single dict generation the heap is append-only, which
// keeps a qwpSymbolDictView snapshot taken during decode valid even
// if the user's handler is still iterating a previous batch. A
// CACHE_RESET crosses the generation boundary by swapping to a fresh
// backing array (see clear); pre-reset snapshots keep the old array
// alive via their own slice headers, so snapshot validity is
// preserved across the reset. Growth within a generation is amortised
// by Go's append; no explicit capacity tuning needed.
type qwpConnDict struct {
	// heap holds the bytes of every symbol back to back, in the order
	// appendDelta received them.
	heap []byte
	// entries records, for each symbol appended by appendDelta, where
	// its bytes sit in heap as an (offset, length) pair. One entry is
	// added per symbol, in arrival order.
	entries []qwpSymbolEntry
}

// size returns the current number of entries.
func (d *qwpConnDict) size() int { return len(d.entries) }

// appendDelta consumes the delta-dictionary section at the current
// position of br: (deltaStart, deltaCount, per-entry len+bytes). The
// server is required to send deltaStart == d.size() (otherwise the two
// ends are out of sync); any other value is a decoder-side rejection.
//
// On success br.pos is left just past the last entry and every entry
// has been appended to heap/entries. On failure an error is returned
// (from the reader, or a decode error built here); entries appended
// before the failing one are not rolled back.
func (d *qwpConnDict) appendDelta(br *qwpByteReader) error {
	deltaStart, err := br.readVarintInt63()
	if err != nil {
		return err
	}
	deltaCount, err := br.readVarintInt63()
	if err != nil {
		return err
	}
	// Reject hostile (deltaStart, deltaCount) before any allocation.
	// The entry-count cap also guards the per-entry uint32 offset
	// path below: with size capped at qwpMaxConnDictSize and heap
	// capped at qwpMaxConnDictHeapBytes (both well below 1<<32),
	// uint32(len(d.heap)) cannot overflow.
	//
	// The last clause is written as a subtraction so that
	// deltaStart+deltaCount is never computed and cannot overflow.
	if deltaStart < 0 || deltaCount < 0 ||
		deltaStart > qwpMaxConnDictSize ||
		deltaCount > int64(qwpMaxConnDictSize)-deltaStart {
		return newQwpDecodeError(fmt.Sprintf(
			"delta symbol section out of range: start=%d count=%d",
			deltaStart, deltaCount))
	}
	if int(deltaStart) != d.size() {
		return newQwpDecodeError(fmt.Sprintf(
			"delta symbol dict out of sync: expected start=%d, got=%d",
			d.size(), deltaStart))
	}
	// Hoist buf+pos as locals so the per-entry varint read can stay a
	// one-byte load+branch. The function-call boundary of
	// readVarintInt63 / qwpReadVarint blocks inlining; symbol entries
	// are typically short strings whose length encodes in a single byte.
	//
	// While the loop runs, pos is the authoritative cursor. It is
	// written back to br.pos before every reader call and before
	// every return.
	buf := br.buf
	bufLen := len(buf)
	pos := br.pos
	for i := int64(0); i < deltaCount; i++ {
		var entryLen uint64
		if pos < bufLen && buf[pos] < 0x80 {
			// Fast path: a length below 0x80 is a single-byte varint.
			entryLen = uint64(buf[pos])
			pos++
		} else {
			// Slow path (multi-byte varint or end of buffer): hand
			// the cursor to the reader and take it back afterwards.
			br.pos = pos
			v, err := br.readVarintInt63()
			if err != nil {
				return err
			}
			pos = br.pos
			entryLen = uint64(v)
		}
		// Heap-byte cap. Check before the body-fits-in-buffer test so
		// a hostile advertised entryLen near uint64-max is rejected at
		// the cap rather than misinterpreted by the bufLen-pos
		// subtraction. uint64 arithmetic keeps len(d.heap)+entryLen
		// from wrapping past int max. The cap is also what keeps the
		// uint32 offset stored below from wrapping.
		if uint64(len(d.heap))+entryLen > qwpMaxConnDictHeapBytes {
			br.pos = pos
			return newQwpDecodeError(fmt.Sprintf(
				"connection SYMBOL dict heap exceeds cap (%d bytes); server must emit CACHE_RESET",
				qwpMaxConnDictHeapBytes))
		}
		if entryLen > uint64(bufLen-pos) {
			br.pos = pos
			return newQwpDecodeError("unexpected end of buffer while slicing")
		}
		end := pos + int(entryLen)
		// Record the offset before the append grows the heap.
		offset := uint32(len(d.heap))
		d.heap = append(d.heap, buf[pos:end]...)
		d.entries = append(d.entries, qwpSymbolEntry{
			offset: offset,
			length: uint32(entryLen),
		})
		pos = end
	}
	br.pos = pos
	return nil
}

// snapshot returns a qwpSymbolDictView bound to the current heap +
// entries state. The slice headers freeze at call time, so even if
// d.entries is later grown via append, the returned view keeps
// the old length (and either the old backing array on reallocation,
// or the old length into the same array). Because the heap is
// append-only, bytes addressed by the frozen entries stay valid.
func (d *qwpConnDict) snapshot() qwpSymbolDictView {
	return qwpSymbolDictView{
		heap:    d.heap,
		entries: d.entries,
	}
}

// clear resets the dict so the next delta section restarts at id 0.
// Fresh backing arrays are allocated with the old capacities so a
// workload that churns just above the server's soft cap settles back
// to a stable size in one allocation instead of paying log N append
// grow-copies. Critically, swapping in new arrays (rather than
// truncating via [:0]) detaches any live qwpSymbolDictView snapshots
// a user handler is still iterating on a prior batch: those snapshots
// keep the old backing store alive via their own slice headers, and
// subsequent appendDelta writes into the fresh array cannot corrupt
// the bytes those snapshots address.
func (d *qwpConnDict) clear() {
	d.heap = make([]byte, 0, cap(d.heap))
	d.entries = make([]qwpSymbolEntry, 0, cap(d.entries))
}

// qwpQueryDecoder is a stateful, reusable decoder for RESULT_BATCH
// frames. One instance per connection: it accumulates the symbol
// dictionary across the connection and holds the current query's
// schema between that query's batches. Decoding is zero-copy where
// possible — column-layout slices alias into the payload []byte the
// caller hands to decode().
+// +// The decoder owns connection-scoped state (dict) and per-query state +// (the schema parsed from the first batch of the current query) but +// NOT the per-batch layout pool. Each caller's out.layouts slice is +// grown/reused in place by decode(), so two batches whose buffers +// the I/O goroutine alternates between never share layout storage. +// That in turn lets the I/O goroutine emit batch N and immediately +// decode batch N+1 without corrupting batch N's view. +// +// The decoder is not safe for concurrent use. +type qwpQueryDecoder struct { + // negotiatedVersion is the QWP wire-protocol version the transport + // settled on during the HTTP upgrade. Every server-to-client frame's + // header version byte must equal this value — the spec (§3) requires + // strict equality with the negotiated version. With a single + // protocol version the negotiated value is always qwpVersion. Set + // once before the first decode call (via qwpEgressIO.start) and + // never mutated afterwards. + negotiatedVersion byte + + dict qwpConnDict + gorilla qwpGorillaDecoder + br qwpByteReader + deltaOn bool // current frame has FLAG_DELTA_SYMBOL_DICT set + gorillaOn bool // current frame has FLAG_GORILLA set + zstdOn bool // current frame has FLAG_ZSTD set + + // querySchema holds the column schema parsed from the first batch + // (batch_seq == 0) of the current query. Continuation batches + // (batch_seq > 0) omit the schema on the wire and reuse it. The + // I/O dispatcher calls resetQuerySchema at the start of every query + // (qwpEgressIO.dispatcherRun) so a schema from a prior query is + // never read across query boundaries. querySchemaValid separates + // "schema parsed" from "no batch seen yet" — a continuation batch + // arriving before its schema batch is a protocol error. 
decode() + // aliases querySchema into qwpColumnBatch.columns rather than + // copying, so a QwpColumnBatch the user still holds keeps its own + // reference even after the next query resets the slot. + querySchema []qwpColumnSchemaInfo + querySchemaValid bool + + // zstdDec is lazy-initialised on the first FLAG_ZSTD frame the + // decoder sees. One decoder per connection; reused across every + // compressed batch. klauspost/compress/zstd is designed to be + // reused — DecodeAll is stateless above the decoder goroutines. + // Concurrency is pinned to 1 because the dispatcher only ever + // calls decode on one frame at a time; the default (GOMAXPROCS) + // spawns more workers than we have frames. + zstdDec *zstd.Decoder +} + +// close releases decoder-owned resources. Idempotent. Called from the +// dispatcher's exit defer so the zstd library's internal goroutines do +// not outlive the I/O goroutines. Must be called after the last decode +// on this instance. +func (d *qwpQueryDecoder) close() { + if d.zstdDec != nil { + d.zstdDec.Close() + d.zstdDec = nil + } +} + +// resetQuerySchema drops the schema held for the previous query so the +// next query's first batch (batch_seq == 0) re-parses it from the +// wire. The dispatcher calls this at the start of every query, before +// any of that query's batches are decoded. Dropping the slice releases +// the decoder's reference; a QwpColumnBatch the user still holds keeps +// the prior schema alive through its own alias. +func (d *qwpQueryDecoder) resetQuerySchema() { + d.querySchema = nil + d.querySchemaValid = false +} + +// decode parses the payload of a RESULT_BATCH frame into out. The +// caller must have already accepted the outer WebSocket frame; payload +// is the full frame bytes (12-byte header + message kind byte + +// per-kind body). On success, `out` is populated with slice views into +// payload and is valid until the caller reuses payload. 
+// +// Caller contract: the returned batch's slices alias payload. Do not +// reuse payload (or close the WebSocket buffer that backs it) until +// the caller is done reading out. +func (d *qwpQueryDecoder) decode(payload []byte, out *QwpColumnBatch) error { + // Spec §14 caps a RESULT_BATCH at 16 MiB on the wire. Reject up + // front before parsing any header or body fields — a conformant + // server stays under the cap, and the per-section bounds below + // (row count, dict heap, zstd content size) only act as + // defense-in-depth once we are inside the frame. + if len(payload) > qwpMaxBatchSize { + return newQwpDecodeError(fmt.Sprintf( + "RESULT_BATCH wire size %d exceeds protocol cap %d", + len(payload), qwpMaxBatchSize)) + } + msgKind, err := d.parseFrameHeader(payload) + if err != nil { + return err + } + if msgKind != qwpMsgKindResultBatch { + return newQwpDecodeError(fmt.Sprintf( + "expected RESULT_BATCH (0x11), got 0x%02X", byte(msgKind))) + } + requestId, err := d.br.readInt64LE() + if err != nil { + return err + } + batchSeq, err := d.br.readVarintInt63() + if err != nil { + return err + } + + // FLAG_ZSTD covers the region AFTER the batch prelude — i.e. the + // delta symbol section + table block + column data. The 12-byte + // header and (msg_kind + request_id + batch_seq) prelude stay + // uncompressed. Decompress into the per-batch scratch now, then + // rebind d.br to the plain bytes so the rest of the decoder sees + // exactly the layout it always has. + if d.zstdOn { + if err := d.decompressIntoBatch(out); err != nil { + return err + } + } + + if d.deltaOn { + if err := d.dict.appendDelta(&d.br); err != nil { + return err + } + } + + // Table block header: name_length varint, name bytes, row_count. + // col_count and the inline schema follow only on the first batch + // of a query (handled below); see the schema section. 
+ nameLen, err := d.br.readVarintInt63() + if err != nil { + return err + } + if nameLen > qwpMaxTableNameLen { + return newQwpDecodeError(fmt.Sprintf( + "table name length out of range: %d", nameLen)) + } + if err := d.br.advance(int(nameLen)); err != nil { + return err + } + + rowCount64, err := d.br.readVarintInt63() + if err != nil { + return err + } + if rowCount64 > qwpMaxRowsPerBatch { + return newQwpDecodeError(fmt.Sprintf( + "row_count out of range: %d", rowCount64)) + } + rowCount := int(rowCount64) + + // Schema section. The first batch of a query (batch_seq == 0) + // carries col_count followed by the inline column definitions; + // the decoder parses them once and holds them in querySchema. + // Continuation batches (batch_seq > 0) drop both col_count and the + // columns from the wire and reuse the held schema. The dispatcher + // resets querySchema at the start of every query, so a continuation + // batch can only legitimately follow a batch_seq == 0 schema batch + // on the same query. + var columnCount int + var cols []qwpColumnSchemaInfo + if batchSeq == 0 { + var colCount64 int64 + colCount64, err = d.br.readVarintInt63() + if err != nil { + return err + } + if colCount64 > qwpMaxColumnsPerTable { + return newQwpDecodeError(fmt.Sprintf( + "column_count out of range: %d", colCount64)) + } + columnCount = int(colCount64) + cols, err = d.parseFullSchema(columnCount) + if err != nil { + return err + } + d.querySchema = cols + d.querySchemaValid = true + } else { + if !d.querySchemaValid { + return newQwpDecodeError( + "continuation RESULT_BATCH (batch_seq > 0) arrived before its schema batch") + } + cols = d.querySchema + columnCount = len(cols) + } + + // Bound the declared cell count (row_count × column_count) before the + // per-column loop sizes any scratch. 
The decoder materialises a + // row-indexed array — rowCount entries wide — for every column that + // carries nulls (nonNullIdx) and for every SYMBOL (symbolRowIds) and + // ARRAY (arrayRowStart + arrayElems) column, 4..12 bytes of heap per + // row. An all-null column is nearly free on the wire (a rowCount/8 + // null bitmap, zstd-compressible to almost nothing) yet forces that + // full rowCount-sized allocation; a frame packed with such columns up + // to the decompressed cap would drive multi-GiB transient make()s. + // row_count and column_count are each individually capped above, but + // their product is not — guard it here so amplified frames are + // rejected before any index array is allocated. The int64 product + // cannot overflow: both factors are non-negative and within caps that + // keep the product well under int64 max. + if int64(rowCount)*int64(columnCount) > qwpMaxCellsPerBatch { + return newQwpDecodeError(fmt.Sprintf( + "RESULT_BATCH cell count out of range: row_count %d × column_count %d exceeds cap %d", + rowCount, columnCount, int64(qwpMaxCellsPerBatch))) + } + + // Grow the batch's own layout pool to columnCount. Pool-owned + // slices are preserved so subsequent decodes into the SAME batch + // with the same column width don't reallocate — the I/O goroutine + // amortises across batches that reuse the same qwpBatchBuffer. + // + // Crucially, `out.layouts` lives on the batch, not on the decoder. + // Two batches whose buffers the I/O goroutine alternates between + // never share layout storage, so emitting batch N while decoding + // batch N+1 does not corrupt batch N's view. + if cap(out.layouts) < columnCount { + out.layouts = make([]qwpColumnLayout, columnCount) + } else { + out.layouts = out.layouts[:columnCount] + } + + // When FLAG_ZSTD was set, the per-column parse reads from the + // decompressed scratch (d.br was rebound above), so out.payload + // must point at the scratch — that is what the layout byte-slices + // alias. 
The non-zstd path keeps the original payload so the + // lifetime contract is unchanged. + if d.zstdOn { + out.payload = out.zstdScratch + } else { + out.payload = payload + } + out.requestId = requestId + out.batchSeq = batchSeq + out.rowCount = rowCount + out.columnCount = columnCount + out.columns = cols + + // Per-column parse + for i := 0; i < columnCount; i++ { + l := &out.layouts[i] + l.clear() + l.info = &cols[i] + if err := d.parseColumn(l, rowCount); err != nil { + return err + } + } + return nil +} + +// parseFullSchema reads full schema entries: per column, (colNameLen +// varint, name bytes, wireType byte). Decimal scale and geohash +// precision are NOT in the schema section — they are per-column and +// live in the data section. +func (d *qwpQueryDecoder) parseFullSchema(columnCount int) ([]qwpColumnSchemaInfo, error) { + // Use a fresh slice per call (rather than pooling). The slice is + // held in querySchema and reused across the query's continuation + // batches, and may also be aliased by a QwpColumnBatch the user + // still holds, so it must outlive this decode — reusing buffer + // pools here would corrupt those readers on the next batch. + cols := make([]qwpColumnSchemaInfo, columnCount) + for i := 0; i < columnCount; i++ { + nameLen64, err := d.br.readVarintInt63() + if err != nil { + return nil, err + } + if nameLen64 > qwpMaxColumnNameLen { + return nil, newQwpDecodeError(fmt.Sprintf( + "column name length out of range: %d", nameLen64)) + } + nameBytes, err := d.br.slice(int(nameLen64)) + if err != nil { + return nil, err + } + wireType, err := d.br.readByte() + if err != nil { + return nil, err + } + // Copy name: nameBytes aliases the payload, which becomes stale + // once the frame is recycled. Schema info is held in querySchema + // across the query's batches, so we need an owned string. 
+ cols[i] = qwpColumnSchemaInfo{ + name: string(nameBytes), + wireType: qwpTypeCode(wireType), + } + } + return cols, nil +} + +// parseColumn dispatches per-column decoding by wire type. +func (d *qwpQueryDecoder) parseColumn(l *qwpColumnLayout, rowCount int) error { + if err := d.parseNullSection(l, rowCount); err != nil { + return err + } + wt := l.info.wireType + switch wt { + case qwpTypeBoolean: + bits := (l.nonNullCount + 7) >> 3 + s, err := d.br.slice(bits) + if err != nil { + return err + } + l.values = s + return nil + case qwpTypeByte: + return d.readFixed(l, 1) + case qwpTypeShort, qwpTypeChar: + return d.readFixed(l, 2) + case qwpTypeInt, qwpTypeFloat, qwpTypeIPv4: + return d.readFixed(l, 4) + case qwpTypeLong, qwpTypeDouble: + return d.readFixed(l, 8) + case qwpTypeTimestamp, qwpTypeTimestampNano, qwpTypeDate: + // DATE is asymmetric on the wire. The server's *egress* + // encoder (QwpResultBatchBuffer) frames DATE exactly like + // TIMESTAMP — a 1-byte encoding discriminator (0x00 raw + // int64 / 0x01 Gorilla) then the payload — even though the + // *ingestion* encoder (Java QwpColumnWriter, and our + // qwpEncoder) writes DATE as a plain int64. We decode + // egress frames here, so DATE must go through parseTimestamp; + // readFixed(8) would skip the discriminator and shift every + // value left by 8 bits. Do NOT "align" the ingestion encoder + // to this — it breaks DATE ingestion. The asymmetry is by + // protocol design; TestQwpIntegrationQwpOnlyTypes guards the + // ingestion side, the egress fuzz guards this side. 
+ return d.parseTimestamp(l) + case qwpTypeUuid: + return d.readFixed(l, 16) + case qwpTypeLong256: + return d.readFixed(l, 32) + case qwpTypeDecimal64: + return d.parseDecimal(l, 8) + case qwpTypeDecimal128: + return d.parseDecimal(l, 16) + case qwpTypeDecimal256: + return d.parseDecimal(l, 32) + case qwpTypeVarchar, qwpTypeBinary: + return d.parseString(l) + case qwpTypeSymbol: + return d.parseSymbol(l, rowCount) + case qwpTypeGeohash: + return d.parseGeohash(l) + case qwpTypeDoubleArray, qwpTypeLongArray: + return d.parseArray(l, rowCount) + default: + return newQwpDecodeError(fmt.Sprintf( + "unsupported wire type 0x%02X", byte(wt))) + } +} + +// parseNullSection reads the null flag + optional bitmap. Non-zero +// flag means a bitmap follows; zero flag means no nulls (nonNullCount +// == rowCount, no per-row index materialisation needed). +func (d *qwpQueryDecoder) parseNullSection(l *qwpColumnLayout, rowCount int) error { + flag, err := d.br.readByte() + if err != nil { + return err + } + if flag == 0 { + l.nullBitmap = nil + l.nonNullIdx = l.nonNullIdx[:0] + l.nonNullCount = rowCount + return nil + } + bitmapLen := (rowCount + 7) >> 3 + bitmap, err := d.br.slice(bitmapLen) + if err != nil { + return err + } + l.nullBitmap = bitmap + // Grow nonNullIdx to rowCount (preserve backing array across + // batches — pool semantics from qwpColumnLayout.clear). + if cap(l.nonNullIdx) < rowCount { + l.nonNullIdx = make([]int32, rowCount) + } else { + l.nonNullIdx = l.nonNullIdx[:rowCount] + } + // Iterate one bitmap byte at a time (8 rows) so each byte is + // loaded once and the per-row `bitmap[i>>3]` bounds check is + // folded away. Fast paths for the common all-non-null and + // all-null bytes avoid the inner bit loop entirely. 
+ idx := l.nonNullIdx + dense := int32(0) + fullBytes := rowCount >> 3 + for bi := 0; bi < fullBytes; bi++ { + bits := bitmap[bi] + base := bi << 3 + switch bits { + case 0x00: + idx[base] = dense + idx[base+1] = dense + 1 + idx[base+2] = dense + 2 + idx[base+3] = dense + 3 + idx[base+4] = dense + 4 + idx[base+5] = dense + 5 + idx[base+6] = dense + 6 + idx[base+7] = dense + 7 + dense += 8 + case 0xFF: + idx[base] = -1 + idx[base+1] = -1 + idx[base+2] = -1 + idx[base+3] = -1 + idx[base+4] = -1 + idx[base+5] = -1 + idx[base+6] = -1 + idx[base+7] = -1 + default: + for j := 0; j < 8; j++ { + if bits&(1< 0 { + if first := binary.LittleEndian.Uint32(offsets); first != 0 { + return newQwpDecodeError(fmt.Sprintf( + "invalid string column first offset: %d (expected 0)", first)) + } + total := uint32(totalBytes) + prev := uint32(0) + for i := 1; i <= l.nonNullCount; i++ { + off := binary.LittleEndian.Uint32(offsets[i*4:]) + if off < prev || off > total { + return newQwpDecodeError(fmt.Sprintf( + "invalid string column offset at index %d: %d (prev=%d, total=%d)", + i, off, prev, total)) + } + prev = off + } + } + stringBytes, err := d.br.slice(int(totalBytes)) + if err != nil { + return err + } + l.values = offsets + l.stringBytes = stringBytes + return nil +} + +// parseSymbol reads one varint dictionary id per non-null row and +// snapshots the connection-scoped dict so the resulting column layout +// resolves ids against the dict state at decode time (not read time — +// subsequent batches may grow the dict). +func (d *qwpQueryDecoder) parseSymbol(l *qwpColumnLayout, rowCount int) error { + if !d.deltaOn { + // Phase 1 server always sets FLAG_DELTA_SYMBOL_DICT. A frame + // without it would require a per-column dictionary path we + // haven't implemented — refuse cleanly rather than mis-parse. 
+ return newQwpDecodeError( + "SYMBOL column without FLAG_DELTA_SYMBOL_DICT is not supported") + } + l.symbolDict = d.dict.snapshot() + + // Size symbolRowIds to rowCount; NULL rows hold undefined values + // (accessors null-check first). + if cap(l.symbolRowIds) < rowCount { + l.symbolRowIds = make([]int32, rowCount) + } else { + l.symbolRowIds = l.symbolRowIds[:rowCount] + } + dictSize := uint64(len(l.symbolDict.entries)) + noNulls := l.nullBitmap == nil + // Hoist the byte buffer + position into locals: symbol-heavy result + // sets visit this loop once per non-null row, and going through the + // readVarintInt63 / qwpReadVarint call boundary on every iteration + // blocks inlining of what's otherwise a one-byte fast path. + buf := d.br.buf + bufLen := len(buf) + pos := d.br.pos + for i := 0; i < rowCount; i++ { + if !noNulls && l.nonNullIdx[i] < 0 { + continue + } + var id uint64 + if pos < bufLen && buf[pos] < 0x80 { + // Fast path: single-byte varint (id < 128). Covers typical + // categorical columns where the dictionary is small. + id = uint64(buf[pos]) + pos++ + } else { + // Cold path: multi-byte varint, EOF, or overflow. Sync pos + // back to the reader and let it produce the wrapped error. + d.br.pos = pos + v, err := d.br.readVarintInt63() + if err != nil { + return err + } + pos = d.br.pos + id = uint64(v) + } + if id >= dictSize { + d.br.pos = pos + return newQwpDecodeError(fmt.Sprintf( + "symbol index out of range: %d", id)) + } + l.symbolRowIds[i] = int32(id) + } + d.br.pos = pos + return nil +} + +// parseGeohash reads the precision varint and per-row packed bits. +func (d *qwpQueryDecoder) parseGeohash(l *qwpColumnLayout) error { + precBits64, err := d.br.readVarintInt63() + if err != nil { + return err + } + // The server enforces [1, 60] on GEOLONG precision; mirror the check + // here so a varint that decodes out of range fails fast rather than + // driving a nonsense bytesPerValue into the length calculation below. 
+ // Matches QwpResultBatchDecoder.java. + if precBits64 < 1 || precBits64 > 60 { + return newQwpDecodeError(fmt.Sprintf( + "geohash precision out of range [1, 60]: %d", precBits64)) + } + l.precisionBits = uint16(precBits64) + bytesPerValue := int((precBits64 + 7) / 8) + return d.readFixed(l, bytesPerValue) +} + +// parseArray reads per-row array entries (skipping NULL rows flagged +// in the null bitmap) and bookkeeps (start, length) into layout.values +// for each row. The values slice is set to alias the entire array-data +// region of the payload so accessors can address elements by +// (row-start + offset). +// +// The server encodes a NULL array via the null bitmap, never inline, +// so a non-null row must carry nDims >= 1. An inline nDims of 0 is +// rejected as a malformed frame. +func (d *qwpQueryDecoder) parseArray(l *qwpColumnLayout, rowCount int) error { + base := d.br.pos + if cap(l.arrayRowStart) < rowCount { + l.arrayRowStart = make([]int32, rowCount) + } else { + l.arrayRowStart = l.arrayRowStart[:rowCount] + } + if cap(l.arrayElems) < rowCount { + l.arrayElems = make([]int32, rowCount) + } else { + l.arrayElems = l.arrayElems[:rowCount] + } + noNulls := l.nullBitmap == nil + for i := 0; i < rowCount; i++ { + if !noNulls && l.nonNullIdx[i] < 0 { + l.arrayRowStart[i] = 0 + l.arrayElems[i] = 0 + continue + } + rowStart := d.br.pos + nDimsByte, err := d.br.readByte() + if err != nil { + return err + } + nDims := int(nDimsByte) + if nDims < 1 || nDims > qwpMaxArrayNDims { + return newQwpDecodeError(fmt.Sprintf( + "ARRAY nDims out of range [1, %d]: %d", qwpMaxArrayNDims, nDims)) + } + shapeBytes, err := d.br.slice(4 * nDims) + if err != nil { + return err + } + elements := int64(1) + for dim := 0; dim < nDims; dim++ { + dl := int32(binary.LittleEndian.Uint32(shapeBytes[dim*4:])) + // Require dl >= 1 in every dimension. 
A dl of 0 would zero out + // elements and short-circuit the qwpMaxArrayElements cap for + // the remaining dimensions, letting them hold arbitrary values + // unchecked; the encoder never emits dl == 0. Matches + // QwpResultBatchDecoder.java. + if dl < 1 { + return newQwpDecodeError(fmt.Sprintf( + "ARRAY dim %d must be >= 1: %d", dim, dl)) + } + elements *= int64(dl) + if elements > qwpMaxArrayElements { + return newQwpDecodeError(fmt.Sprintf( + "ARRAY element count exceeds limit (%d > %d)", + elements, qwpMaxArrayElements)) + } + } + if err := d.br.advance(int(elements) * 8); err != nil { + return err + } + l.arrayRowStart[i] = int32(rowStart - base) + l.arrayElems[i] = int32(elements) + } + // values slice covers the entire array region read above. + l.values = d.br.buf[base:d.br.pos] + return nil +} + +// qwpPeekMsgKind returns the msg_kind byte at offset qwpHeaderSize of +// payload without validating magic, version, or flags. Used by the I/O +// goroutine's dispatch loop to pick the right per-kind decoder method; +// the chosen method re-runs parseFrameHeader for the full validation. +// +// Cheaper than reparsing the whole header twice — but still bounds-checks +// the payload so a truncated frame cannot panic the dispatch site. +func qwpPeekMsgKind(payload []byte) (qwpMsgKind, error) { + if len(payload) < qwpHeaderSize+1 { + return 0, newQwpDecodeError(fmt.Sprintf( + "frame payload too short for msg_kind peek: %d", len(payload))) + } + return qwpMsgKind(payload[qwpHeaderSize]), nil +} + +// parseFrameHeader validates the 12-byte QWP header, primes d.br to the +// frame body, reads the msg_kind byte, and returns it. Sets d.deltaOn / +// d.gorillaOn / d.zstdOn from the flags byte. +// +// FLAG_ZSTD is only meaningful on RESULT_BATCH — the other per-kind +// decoders reject d.zstdOn themselves. The flag has to be tracked here +// (not in decode) so the rejection can share the validated-header +// path. 
+// +// Shared by every per-kind decoder (decode / decodeResultEnd / +// decodeQueryError / decodeExecDone) so header validation stays uniform. +func (d *qwpQueryDecoder) parseFrameHeader(payload []byte) (qwpMsgKind, error) { + if len(payload) < qwpHeaderSize+1 { + return 0, newQwpDecodeError(fmt.Sprintf( + "frame payload too short: %d", len(payload))) + } + magic := binary.LittleEndian.Uint32(payload[0:4]) + if magic != qwpMagic { + return 0, newQwpDecodeError(fmt.Sprintf("bad magic 0x%08X", magic)) + } + if payload[4] != d.negotiatedVersion { + return 0, newQwpDecodeError(fmt.Sprintf( + "frame version %d does not match negotiated version %d", + payload[4], d.negotiatedVersion)) + } + // payload_length is the header's own count of the bytes that follow + // it; the encoder patches it to (frame size − header size). A + // mismatch against the actual body we received means the framing is + // desynced — reject it here rather than decode a frame whose length + // the server and we disagree on. + declaredPayloadLen := binary.LittleEndian.Uint32( + payload[qwpHeaderOffsetPayloadLen : qwpHeaderOffsetPayloadLen+4]) + if int64(declaredPayloadLen) != int64(len(payload)-qwpHeaderSize) { + return 0, newQwpDecodeError(fmt.Sprintf( + "frame payload_length %d does not match body size %d", + declaredPayloadLen, len(payload)-qwpHeaderSize)) + } + flags := payload[qwpHeaderOffsetFlags] + d.deltaOn = flags&qwpFlagDeltaSymbolDict != 0 + d.gorillaOn = flags&qwpFlagGorilla != 0 + d.zstdOn = flags&qwpFlagZstd != 0 + tableCount := binary.LittleEndian.Uint16( + payload[qwpHeaderOffsetTableCount : qwpHeaderOffsetTableCount+2]) + d.br.reset(payload[qwpHeaderSize:]) + kindByte, err := d.br.readByte() + if err != nil { + return 0, err + } + msgKind := qwpMsgKind(kindByte) + // Spec §4: table_count is 1 for RESULT_BATCH and 0 for every other + // kind. Reject mismatches up front so a malformed server cannot + // smuggle ambiguous framing past the per-kind decoders. 
+ expectedTableCount := uint16(0) + if msgKind == qwpMsgKindResultBatch { + expectedTableCount = 1 + } + if tableCount != expectedTableCount { + return 0, newQwpDecodeError(fmt.Sprintf( + "frame table_count = %d, expected %d for msg_kind 0x%02X", + tableCount, expectedTableCount, byte(msgKind))) + } + return msgKind, nil +} + +// decodeResultEnd parses a RESULT_END (0x12) frame. The frame announces +// the end of a streaming query and carries the server-reported total +// row count. +// +// Wire layout (after the 12-byte header): +// +// msg_kind(1) + request_id(int64 LE) + final_seq(varint) + total_rows(varint) +// +// final_seq is currently unused by this client — it matches the last +// batch's seq and is already tracked by the I/O layer. It is still +// consumed so the cursor is aligned when reading total_rows. +func (d *qwpQueryDecoder) decodeResultEnd(payload []byte) (requestId int64, totalRows int64, err error) { + msgKind, err := d.parseFrameHeader(payload) + if err != nil { + return 0, 0, err + } + if msgKind != qwpMsgKindResultEnd { + return 0, 0, newQwpDecodeError(fmt.Sprintf( + "expected RESULT_END (0x12), got 0x%02X", byte(msgKind))) + } + if d.zstdOn { + return 0, 0, newQwpDecodeError( + "FLAG_ZSTD set on non-RESULT_BATCH frame (RESULT_END)") + } + requestId, err = d.br.readInt64LE() + if err != nil { + return 0, 0, err + } + // final_seq: read and discard. readVarint already rejects + // overflowing 10-byte sequences, matching the Java guard. + if _, err = d.br.readVarint(); err != nil { + return 0, 0, err + } + totalRows, err = d.br.readVarintInt63() + if err != nil { + return 0, 0, err + } + return requestId, totalRows, nil +} + +// decodeQueryError parses a QUERY_ERROR (0x13) frame. The returned +// QwpQueryError carries the server's status byte and UTF-8 message. 
+// +// Wire layout (after the 12-byte header): +// +// msg_kind(1) + request_id(int64 LE) + status(1) + msg_len(uint16 LE) + message(msg_len UTF-8 bytes) +// +// msg_len is treated as unsigned (range 0..65535); the qwpByteReader.slice +// call below rejects a msg_len that overruns the frame — this is the +// port of Java's "msg_len ... exceeds frame remainder" hardening guard. +func (d *qwpQueryDecoder) decodeQueryError(payload []byte) (*QwpQueryError, error) { + msgKind, err := d.parseFrameHeader(payload) + if err != nil { + return nil, err + } + if msgKind != qwpMsgKindQueryError { + return nil, newQwpDecodeError(fmt.Sprintf( + "expected QUERY_ERROR (0x13), got 0x%02X", byte(msgKind))) + } + if d.zstdOn { + return nil, newQwpDecodeError( + "FLAG_ZSTD set on non-RESULT_BATCH frame (QUERY_ERROR)") + } + requestId, err := d.br.readInt64LE() + if err != nil { + return nil, err + } + status, err := d.br.readByte() + if err != nil { + return nil, err + } + msgLen, err := d.br.readUint16LE() + if err != nil { + return nil, err + } + msgBytes, err := d.br.slice(int(msgLen)) + if err != nil { + return nil, wrapQwpDecodeError(fmt.Sprintf( + "QUERY_ERROR msg_len %d exceeds frame remainder", msgLen), err) + } + return &QwpQueryError{ + RequestId: requestId, + Status: QwpStatusCode(status), + // Copy: msgBytes aliases the payload, which is reclaimed once + // the I/O goroutine advances past the frame. QwpQueryError is + // surfaced to the user and outlives the frame. + Message: string(msgBytes), + }, nil +} + +// decodeExecDone parses an EXEC_DONE (0x16) frame — the terminal ack +// for a non-SELECT statement. 
+// +// Wire layout (after the 12-byte header): +// +// msg_kind(1) + request_id(int64 LE) + op_type(1) + rows_affected(varint) +func (d *qwpQueryDecoder) decodeExecDone(payload []byte) (requestId int64, result ExecResult, err error) { + msgKind, err := d.parseFrameHeader(payload) + if err != nil { + return 0, ExecResult{}, err + } + if msgKind != qwpMsgKindExecDone { + return 0, ExecResult{}, newQwpDecodeError(fmt.Sprintf( + "expected EXEC_DONE (0x16), got 0x%02X", byte(msgKind))) + } + if d.zstdOn { + return 0, ExecResult{}, newQwpDecodeError( + "FLAG_ZSTD set on non-RESULT_BATCH frame (EXEC_DONE)") + } + requestId, err = d.br.readInt64LE() + if err != nil { + return 0, ExecResult{}, err + } + opType, err := d.br.readByte() + if err != nil { + return 0, ExecResult{}, err + } + rowsAffected, err := d.br.readVarintInt63() + if err != nil { + return 0, ExecResult{}, err + } + return requestId, ExecResult{ + OpType: opType, + RowsAffected: rowsAffected, + }, nil +} + +// decodeCacheReset parses a CACHE_RESET (0x17) frame and returns its +// reset_mask byte. The frame has no request_id — it is a connection- +// scoped notification, not a per-query reply. Invalid zstd flag is +// rejected with the same policy as the other non-RESULT_BATCH +// decoders so a server that sets FLAG_ZSTD on a control frame is +// caught before any downstream work. 
+// +// Wire layout (after the 12-byte header): +// +// msg_kind(1) + reset_mask(1) +func (d *qwpQueryDecoder) decodeCacheReset(payload []byte) (byte, error) { + msgKind, err := d.parseFrameHeader(payload) + if err != nil { + return 0, err + } + if msgKind != qwpMsgKindCacheReset { + return 0, newQwpDecodeError(fmt.Sprintf( + "expected CACHE_RESET (0x17), got 0x%02X", byte(msgKind))) + } + if d.zstdOn { + return 0, newQwpDecodeError( + "FLAG_ZSTD set on non-RESULT_BATCH frame (CACHE_RESET)") + } + mask, err := d.br.readByte() + if err != nil { + return 0, wrapQwpDecodeError("CACHE_RESET truncated before reset_mask", err) + } + return mask, nil +} + +// applyCacheReset drops the connection-scoped caches indicated by +// mask. Currently only qwpResetMaskDict is defined: it discards the +// SYMBOL dict so the next RESULT_BATCH's deltaStart lines up with the +// server's fresh counter. Bits the server does not set are preserved. +func (d *qwpQueryDecoder) applyCacheReset(mask byte) { + if mask&qwpResetMaskDict != 0 { + d.dict.clear() + } +} + +// decompressIntoBatch decompresses the remaining d.br bytes (the zstd +// frame covering the delta section + table block) into out.zstdScratch +// and rebinds d.br onto the decompressed bytes. The caller must have +// already validated d.zstdOn and consumed the uncompressed prelude +// (msg_kind + request_id + batch_seq) — only the region from there to +// the end of the payload is a single zstd frame, per Java +// QwpResultBatchDecoder.decodeBatch. +// +// The scratch is pre-sized from the zstd frame header's content-size +// field. Unknown content size is treated as a protocol violation — +// the server calls the one-shot Zstd.compress API, which leaves +// ZSTD_c_contentSizeFlag at its default (on), so every server-emitted +// frame declares its content size (see Java QwpResultBatchDecoder +// line 302-307 for the same contract). 
A content size that exceeds +// qwpZstdMaxDecompressedSize is rejected up front rather than driving +// unbounded scratch growth. +func (d *qwpQueryDecoder) decompressIntoBatch(out *QwpColumnBatch) error { + compressed, err := d.br.slice(d.br.remaining()) + if err != nil { + return err + } + if len(compressed) == 0 { + return newQwpDecodeError( + "FLAG_ZSTD set but no compressed payload follows the prelude") + } + var hdr zstd.Header + if err := hdr.Decode(compressed); err != nil { + return wrapQwpDecodeError("invalid zstd frame header", err) + } + if !hdr.HasFCS { + return newQwpDecodeError( + "zstd frame missing content size (protocol violation)") + } + if hdr.FrameContentSize > uint64(qwpZstdMaxDecompressedSize) { + return newQwpDecodeError(fmt.Sprintf( + "zstd frame content size %d exceeds client cap %d", + hdr.FrameContentSize, qwpZstdMaxDecompressedSize)) + } + expected := int(hdr.FrameContentSize) + + // Grow the per-batch scratch in one shot. Start at qwpZstdMinScratchGrow + // so a burst of small batches does not re-alloc on every frame; doubling + // when we exceed the current capacity follows the Java MIN/MAX_SCRATCH + // shape. Clamp to qwpZstdMaxDecompressedSize so doubling from a current + // cap > 32 MiB cannot allocate past the cap — expected is already known + // to fit under it from the check above. 
+ if cap(out.zstdScratch) < expected { + newCap := cap(out.zstdScratch) * 2 + if newCap < expected { + newCap = expected + } + if newCap > qwpZstdMaxDecompressedSize { + newCap = qwpZstdMaxDecompressedSize + } + if newCap < qwpZstdMinScratchGrow { + newCap = qwpZstdMinScratchGrow + } + out.zstdScratch = make([]byte, 0, newCap) + } else { + out.zstdScratch = out.zstdScratch[:0] + } + + if d.zstdDec == nil { + dec, err := zstd.NewReader(nil, + zstd.WithDecoderConcurrency(1), + zstd.WithDecoderMaxMemory(uint64(qwpZstdMaxDecompressedSize)), + ) + if err != nil { + return wrapQwpDecodeError("zstd decoder init failed", err) + } + d.zstdDec = dec + } + decoded, err := d.zstdDec.DecodeAll(compressed, out.zstdScratch) + if err != nil { + return wrapQwpDecodeError("zstd decompression failed", err) + } + out.zstdScratch = decoded + d.br.reset(decoded) + return nil +} + +// int64sAsBytes reinterprets an []int64 as []byte (len*8, cap*8) +// without copying. Used by parseTimestamp to make the Gorilla-decoded +// values region look identical to a raw int64 LE region, so the +// QwpColumnBatch.Int64 accessor path stays uniform. +// +// Safety: int64 is 8 bytes on every supported architecture and Go +// stores them little-endian on all targets questdb-client supports. +// unsafe.Slice is the canonical way to do this reinterpretation since +// Go 1.17. 
+func int64sAsBytes(s []int64) []byte { + if len(s) == 0 { + return nil + } + return unsafe.Slice((*byte)(unsafe.Pointer(&s[0])), len(s)*8) +} diff --git a/qwp_query_decoder_test.go b/qwp_query_decoder_test.go new file mode 100644 index 00000000..9730c628 --- /dev/null +++ b/qwp_query_decoder_test.go @@ -0,0 +1,2881 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "bytes" + "encoding/binary" + "math" + "strings" + "testing" + + "github.com/klauspost/compress/zstd" +) + +// --- Test helpers --- + +// wrapAsResultBatch takes an ingress-style frame (header + delta-dict +// + table block, as produced by qwpEncoder.encodeTable) and splices in +// the egress-style prelude (msg_kind + request_id + batch_seq) between +// the header and the delta dict. The header's payloadLength field and +// tableCount are preserved; only the bytes between are rewritten. 
+// +// Ingress layout: +// +// [12 header][deltaDict][tableBlock] +// +// Egress RESULT_BATCH layout: +// +// [12 header][msg_kind:1][requestId:8][batchSeq:varint][deltaDict][tableBlock] +// +// payload length must be patched to the new body length. +func wrapAsResultBatch(ingress []byte, requestId int64, batchSeq uint64) []byte { + if len(ingress) < qwpHeaderSize { + panic("ingress frame too short to wrap") + } + header := ingress[:qwpHeaderSize] + body := ingress[qwpHeaderSize:] + + // A continuation RESULT_BATCH (batch_seq > 0) carries no col_count + // and no inline column schema — the decoder reuses the schema parsed + // from batch 0. The ingress encoder always writes them, so strip the + // schema section here to mirror a real continuation frame. + if batchSeq > 0 { + body = stripContinuationSchema(body) + } + + var prelude bytes.Buffer + prelude.WriteByte(byte(qwpMsgKindResultBatch)) + var reqBuf [8]byte + binary.LittleEndian.PutUint64(reqBuf[:], uint64(requestId)) + prelude.Write(reqBuf[:]) + varBuf := make([]byte, qwpMaxVarintLen) + n := qwpPutVarint(varBuf, batchSeq) + prelude.Write(varBuf[:n]) + + out := make([]byte, 0, qwpHeaderSize+prelude.Len()+len(body)) + out = append(out, header...) + out = append(out, prelude.Bytes()...) + out = append(out, body...) + // Patch payload length (offset 8..12). + binary.LittleEndian.PutUint32(out[qwpHeaderOffsetPayloadLen:], uint32(len(out)-qwpHeaderSize)) + return out +} + +// stripContinuationSchema removes the col_count + inline column schema +// from an ingress-encoded body (deltaDict followed by a table block), +// leaving deltaDict + table_name + row_count + column data — the shape +// a real continuation RESULT_BATCH (batch_seq > 0) carries on the wire. +// The ingress encoder always emits the schema; this drops it so a +// wrapped continuation frame matches what the server would actually +// send for batch_seq > 0. 
+func stripContinuationSchema(body []byte) []byte { + var r qwpByteReader + r.reset(body) + mustVarint := func(what string) int64 { + v, err := r.readVarintInt63() + if err != nil { + panic("stripContinuationSchema: " + what + ": " + err.Error()) + } + return v + } + mustAdvance := func(n int, what string) { + if err := r.advance(n); err != nil { + panic("stripContinuationSchema: " + what + ": " + err.Error()) + } + } + // Delta dict: deltaStart, deltaCount, then deltaCount strings. + mustVarint("deltaStart") + deltaCount := mustVarint("deltaCount") + for i := int64(0); i < deltaCount; i++ { + mustAdvance(int(mustVarint("dict string len")), "dict string") + } + // Table block prefix kept on every batch: table_name, row_count. + mustAdvance(int(mustVarint("table name len")), "table name") + mustVarint("row_count") + schemaStart := r.pos + // Schema section dropped on continuation batches: col_count, then + // per column a name (varint len + bytes) and a 1-byte type code. + colCount := mustVarint("col_count") + for i := int64(0); i < colCount; i++ { + mustAdvance(int(mustVarint("col name len")), "col name") + if _, err := r.readByte(); err != nil { + panic("stripContinuationSchema: type code: " + err.Error()) + } + } + colDataStart := r.pos + out := make([]byte, 0, schemaStart+(len(body)-colDataStart)) + out = append(out, body[:schemaStart]...) + out = append(out, body[colDataStart:]...) + return out +} + +// newTestQueryDecoder returns a zero-valued decoder seeded with the +// negotiated version every test fixture stamps into its frames +// (qwpVersion = 1). Production code sets this field via +// qwpEgressIO.start; tests construct decoders directly. +func newTestQueryDecoder() qwpQueryDecoder { + return qwpQueryDecoder{negotiatedVersion: qwpVersion} +} + +// encodeSingleColumnBatch is a convenience that builds a one-column +// table, populates it via the supplied per-row callbacks, and wraps +// the output as a RESULT_BATCH frame. 
Each entry in `rows` is called +// for one row; the helper calls tb.commitRow() after each. +func encodeSingleColumnBatch( + t *testing.T, + name string, + typeCode qwpTypeCode, + nullable bool, + rows []func(col *qwpColumnBuffer), +) []byte { + t.Helper() + tb := newQwpTableBuffer("t") + for _, populate := range rows { + col, err := tb.getOrCreateColumn(name, typeCode, nullable) + if err != nil { + t.Fatalf("getOrCreateColumn: %v", err) + } + populate(col) + tb.commitRow() + } + var enc qwpEncoder + ingress := enc.encodeTable(tb) + return wrapAsResultBatch(ingress, 1, 0) +} + +// --- Positive-path round trips (driven by the real encoder) --- + +func TestQwpDecoderRoundTripFixedWidth(t *testing.T) { + type testCase struct { + name string + wt qwpTypeCode + rows []func(col *qwpColumnBuffer) + check func(t *testing.T, b *QwpColumnBatch) + } + cases := []testCase{ + { + name: "LONG", wt: qwpTypeLong, + rows: []func(col *qwpColumnBuffer){ + func(c *qwpColumnBuffer) { c.addLong(1) }, + func(c *qwpColumnBuffer) { c.addLong(-2) }, + func(c *qwpColumnBuffer) { c.addLong(math.MaxInt64) }, + }, + check: func(t *testing.T, b *QwpColumnBatch) { + for i, w := range []int64{1, -2, math.MaxInt64} { + if got := b.Int64(0, i); got != w { + t.Fatalf("Int64[%d] = %d, want %d", i, got, w) + } + } + }, + }, + { + name: "DOUBLE", wt: qwpTypeDouble, + rows: []func(col *qwpColumnBuffer){ + func(c *qwpColumnBuffer) { c.addDouble(1.3) }, + func(c *qwpColumnBuffer) { c.addDouble(-2.5) }, + }, + check: func(t *testing.T, b *QwpColumnBatch) { + for i, w := range []float64{1.3, -2.5} { + if got := b.Float64(0, i); got != w { + t.Fatalf("Float64[%d] = %v, want %v", i, got, w) + } + } + }, + }, + { + name: "INT", wt: qwpTypeInt, + rows: []func(col *qwpColumnBuffer){ + func(c *qwpColumnBuffer) { c.addInt32(-7) }, + func(c *qwpColumnBuffer) { c.addInt32(100_000) }, + }, + check: func(t *testing.T, b *QwpColumnBatch) { + for i, w := range []int32{-7, 100_000} { + if got := b.Int32(0, i); got != w { 
+ t.Fatalf("Int32[%d] = %d, want %d", i, got, w) + } + } + }, + }, + { + name: "BOOLEAN", wt: qwpTypeBoolean, + rows: []func(col *qwpColumnBuffer){ + func(c *qwpColumnBuffer) { c.addBool(true) }, + func(c *qwpColumnBuffer) { c.addBool(false) }, + func(c *qwpColumnBuffer) { c.addBool(true) }, + func(c *qwpColumnBuffer) { c.addBool(false) }, + func(c *qwpColumnBuffer) { c.addBool(false) }, + func(c *qwpColumnBuffer) { c.addBool(true) }, + func(c *qwpColumnBuffer) { c.addBool(true) }, + func(c *qwpColumnBuffer) { c.addBool(false) }, + func(c *qwpColumnBuffer) { c.addBool(true) }, + func(c *qwpColumnBuffer) { c.addBool(false) }, + }, + check: func(t *testing.T, b *QwpColumnBatch) { + // 10 booleans cross a byte boundary in the bit-packed + // wire payload (8 bits/byte). The decoder must walk + // across both bytes. + want := []bool{true, false, true, false, false, true, true, false, true, false} + for i, w := range want { + if got := b.Bool(0, i); got != w { + t.Fatalf("Bool[%d] = %v, want %v", i, got, w) + } + } + }, + }, + { + name: "BYTE", wt: qwpTypeByte, + rows: []func(col *qwpColumnBuffer){ + func(c *qwpColumnBuffer) { c.addByte(math.MinInt8) }, + func(c *qwpColumnBuffer) { c.addByte(-1) }, + func(c *qwpColumnBuffer) { c.addByte(0) }, + func(c *qwpColumnBuffer) { c.addByte(7) }, + func(c *qwpColumnBuffer) { c.addByte(math.MaxInt8) }, + }, + check: func(t *testing.T, b *QwpColumnBatch) { + for i, w := range []int8{math.MinInt8, -1, 0, 7, math.MaxInt8} { + if got := b.Int8(0, i); got != w { + t.Fatalf("Int8[%d] = %d, want %d", i, got, w) + } + } + }, + }, + { + name: "SHORT", wt: qwpTypeShort, + rows: []func(col *qwpColumnBuffer){ + func(c *qwpColumnBuffer) { c.addShort(math.MinInt16) }, + func(c *qwpColumnBuffer) { c.addShort(-1) }, + func(c *qwpColumnBuffer) { c.addShort(0) }, + func(c *qwpColumnBuffer) { c.addShort(42) }, + func(c *qwpColumnBuffer) { c.addShort(math.MaxInt16) }, + }, + check: func(t *testing.T, b *QwpColumnBatch) { + for i, w := range 
[]int16{math.MinInt16, -1, 0, 42, math.MaxInt16} { + if got := b.Int16(0, i); got != w { + t.Fatalf("Int16[%d] = %d, want %d", i, got, w) + } + } + }, + }, + { + name: "CHAR", wt: qwpTypeChar, + rows: []func(col *qwpColumnBuffer){ + func(c *qwpColumnBuffer) { c.addChar('a') }, + func(c *qwpColumnBuffer) { c.addChar('Z') }, + func(c *qwpColumnBuffer) { c.addChar('0') }, + func(c *qwpColumnBuffer) { c.addChar(' ') }, + // Highest BMP code point — pins the LE 2-byte + // reassembly path against off-by-one shifts in the + // decoder. + func(c *qwpColumnBuffer) { c.addChar(0xFFFE) }, + }, + check: func(t *testing.T, b *QwpColumnBatch) { + for i, w := range []rune{'a', 'Z', '0', ' ', 0xFFFE} { + if got := b.Char(0, i); got != w { + t.Fatalf("Char[%d] = %U, want %U", i, got, w) + } + } + }, + }, + { + name: "FLOAT", wt: qwpTypeFloat, + rows: []func(col *qwpColumnBuffer){ + func(c *qwpColumnBuffer) { c.addFloat32(float32(math.Inf(-1))) }, + func(c *qwpColumnBuffer) { c.addFloat32(-1.5) }, + func(c *qwpColumnBuffer) { c.addFloat32(0) }, + func(c *qwpColumnBuffer) { c.addFloat32(1.5) }, + func(c *qwpColumnBuffer) { c.addFloat32(float32(math.Inf(1))) }, + }, + check: func(t *testing.T, b *QwpColumnBatch) { + want := []float32{ + float32(math.Inf(-1)), -1.5, 0, 1.5, float32(math.Inf(1)), + } + for i, w := range want { + if got := b.Float32(0, i); got != w { + t.Fatalf("Float32[%d] = %v, want %v", i, got, w) + } + } + }, + }, + // DATE has no Go-encode <-> Go-decode round trip: ingestion + // frames DATE as plain int64 but egress frames it timestamp-ish + // (protocol asymmetry). Egress DATE decode is covered by + // TestQwpDecoderEgressDate; ingestion by TestQwpIntegrationQwpOnlyTypes. 
+ { + name: "TIMESTAMP_NANO", wt: qwpTypeTimestampNano, + rows: []func(col *qwpColumnBuffer){ + func(c *qwpColumnBuffer) { c.addTimestamp(1_700_000_000_000_000_000) }, + func(c *qwpColumnBuffer) { c.addTimestamp(1_700_000_000_000_000_001) }, + }, + check: func(t *testing.T, b *QwpColumnBatch) { + want := []int64{1_700_000_000_000_000_000, 1_700_000_000_000_000_001} + for i, w := range want { + if got := b.Int64(0, i); got != w { + t.Fatalf("TsNano Int64[%d] = %d, want %d", i, got, w) + } + } + }, + }, + { + name: "UUID", wt: qwpTypeUuid, + rows: []func(col *qwpColumnBuffer){ + func(c *qwpColumnBuffer) { + c.addUuid(0x99AABBCCDDEEFF00, 0x1122334455667788) + }, + func(c *qwpColumnBuffer) { c.addUuid(0, 0) }, + func(c *qwpColumnBuffer) { + c.addUuid(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF) + }, + }, + check: func(t *testing.T, b *QwpColumnBatch) { + type uuidPair struct{ hi, lo uint64 } + want := []uuidPair{ + {0x99AABBCCDDEEFF00, 0x1122334455667788}, + {0, 0}, + {0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF}, + } + for i, w := range want { + if got := uint64(b.UuidLo(0, i)); got != w.lo { + t.Fatalf("UuidLo[%d] = %#x, want %#x", i, got, w.lo) + } + if got := uint64(b.UuidHi(0, i)); got != w.hi { + t.Fatalf("UuidHi[%d] = %#x, want %#x", i, got, w.hi) + } + } + }, + }, + { + name: "LONG256", wt: qwpTypeLong256, + rows: []func(col *qwpColumnBuffer){ + func(c *qwpColumnBuffer) { + c.addLong256(0x1111111111111111, 0x2222222222222222, + 0x3333333333333333, 0x4444444444444444) + }, + func(c *qwpColumnBuffer) { c.addLong256(0, 0, 0, 0) }, + }, + check: func(t *testing.T, b *QwpColumnBatch) { + want := [][4]uint64{ + {0x1111111111111111, 0x2222222222222222, + 0x3333333333333333, 0x4444444444444444}, + {0, 0, 0, 0}, + } + for i, row := range want { + for w := 0; w < 4; w++ { + if got := uint64(b.Long256Word(0, i, w)); got != row[w] { + t.Fatalf("Long256[%d].word[%d] = %#x, want %#x", + i, w, got, row[w]) + } + } + } + }, + }, + } + for _, c := range cases { + t.Run(c.name, func(t 
*testing.T) { + frame := encodeSingleColumnBatch(t, "c", c.wt, false, c.rows) + dec := newTestQueryDecoder() + var batch QwpColumnBatch + if err := dec.decode(frame, &batch); err != nil { + t.Fatalf("decode: %v", err) + } + if batch.RowCount() != len(c.rows) { + t.Fatalf("RowCount = %d, want %d", batch.RowCount(), len(c.rows)) + } + c.check(t, &batch) + }) + } +} + +func TestQwpDecoderRoundTripNullable(t *testing.T) { + // Long column with pattern V N V N V (3 non-null, 2 null). + frame := encodeSingleColumnBatch(t, "l", qwpTypeLong, true, []func(*qwpColumnBuffer){ + func(c *qwpColumnBuffer) { c.addLong(10) }, + func(c *qwpColumnBuffer) { c.addNull() }, + func(c *qwpColumnBuffer) { c.addLong(20) }, + func(c *qwpColumnBuffer) { c.addNull() }, + func(c *qwpColumnBuffer) { c.addLong(30) }, + }) + dec := newTestQueryDecoder() + var batch QwpColumnBatch + if err := dec.decode(frame, &batch); err != nil { + t.Fatalf("decode: %v", err) + } + if batch.RowCount() != 5 { + t.Fatalf("RowCount = %d", batch.RowCount()) + } + for i, want := range []int64{10, 0, 20, 0, 30} { + if got := batch.Int64(0, i); got != want { + t.Fatalf("Int64[%d] = %d, want %d (null=%v)", i, got, want, batch.IsNull(0, i)) + } + } + for _, i := range []int{1, 3} { + if !batch.IsNull(0, i) { + t.Fatalf("row %d should be NULL", i) + } + } + if batch.NonNullCount(0) != 3 { + t.Fatalf("NonNullCount = %d", batch.NonNullCount(0)) + } +} + +func TestQwpDecoderRoundTripVarcharAndBinary(t *testing.T) { + // Go encoder supports VARCHAR via addString; BINARY is read-only + // from the server side and has no encoder path in this client, so + // the VARCHAR test exercises the shared offsets + bytes layout + // used by both types. 
+ for _, wt := range []qwpTypeCode{qwpTypeVarchar} { + t.Run(typeCodeName(wt), func(t *testing.T) { + frame := encodeSingleColumnBatch(t, "v", wt, false, []func(*qwpColumnBuffer){ + func(c *qwpColumnBuffer) { c.addString("") }, + func(c *qwpColumnBuffer) { c.addString("hello") }, + func(c *qwpColumnBuffer) { c.addString("日本語") }, + func(c *qwpColumnBuffer) { c.addString("x") }, + }) + dec := newTestQueryDecoder() + var batch QwpColumnBatch + if err := dec.decode(frame, &batch); err != nil { + t.Fatalf("decode: %v", err) + } + want := []string{"", "hello", "日本語", "x"} + for i, w := range want { + if got := batch.String(0, i); got != w { + t.Fatalf("String[%d] = %q, want %q", i, got, w) + } + } + }) + } +} + +// patchSchemaTypeToDate rewrites the schema type code of column colName +// in a raw qwpEncoder.encodeTable() payload (BEFORE wrapAsResultBatch) +// to qwpTypeDate. DATE shares TIMESTAMP's *egress* framing (1-byte +// encoding discriminator + RAW/Gorilla), so encoding the column as +// TIMESTAMP and relabelling the schema yields byte-for-byte what the +// server's QwpResultBatchBuffer emits for a DATE column. The ingestion +// encoder cannot synthesise egress DATE directly (it writes plain +// int64, by protocol asymmetry). Offsets mirror the proven walk in +// TestQwpEncoderAllFixedTypes (raw encodeTable layout: qwpHeaderSize +// header + 2-byte empty delta symbol dict, then name / counts / schema). 
+func patchSchemaTypeToDate(t *testing.T, ingress []byte, colName string) { + t.Helper() + off := qwpHeaderSize + 2 // header + empty delta symbol dict (2 bytes) + nameLen, n, err := qwpReadVarint(ingress[off:]) + if err != nil { + t.Fatalf("table-name varint: %v", err) + } + off += n + int(nameLen) // table name + if _, n, err = qwpReadVarint(ingress[off:]); err != nil { + t.Fatalf("rowCount varint: %v", err) + } + off += n // rowCount + colCount, n, err := qwpReadVarint(ingress[off:]) + if err != nil { + t.Fatalf("colCount varint: %v", err) + } + off += n + for i := 0; i < int(colCount); i++ { + cnLen, n, err := qwpReadVarint(ingress[off:]) + if err != nil { + t.Fatalf("col-name varint: %v", err) + } + off += n + name := string(ingress[off : off+int(cnLen)]) + off += int(cnLen) + if name == colName { + ingress[off] = byte(qwpTypeDate) + return + } + off++ // skip this column's type code + } + t.Fatalf("column %q not found in schema", colName) +} + +func TestQwpDecoderEgressDate(t *testing.T) { + // DATE egress is framed exactly like TIMESTAMP: a 1-byte encoding + // discriminator then RAW int64 / Gorilla. The decoder must route + // DATE through parseTimestamp (regression guard for the DATE-as- + // plain-int64 bug the egress fuzz caught). Cover both branches. 
+ run := func(t *testing.T, vals []int64) { + t.Helper() + tb := newQwpTableBuffer("t") + for _, v := range vals { + col, err := tb.getOrCreateColumn("d", qwpTypeTimestamp, false) + if err != nil { + t.Fatalf("getOrCreateColumn: %v", err) + } + col.addLong(v) + tb.commitRow() + } + var enc qwpEncoder + ingress := enc.encodeTable(tb) + patchSchemaTypeToDate(t, ingress, "d") + frame := wrapAsResultBatch(ingress, 1, 0) + dec := newTestQueryDecoder() + var b QwpColumnBatch + if err := dec.decode(frame, &b); err != nil { + t.Fatalf("decode: %v", err) + } + if b.RowCount() != len(vals) { + t.Fatalf("RowCount = %d, want %d", b.RowCount(), len(vals)) + } + for i, w := range vals { + if got := b.Int64(0, i); got != w { + t.Fatalf("Int64[%d] = %d, want %d", i, got, w) + } + } + } + // <=2 values force the encoder's uncompressed (0x00) branch. + t.Run("Uncompressed", func(t *testing.T) { + run(t, []int64{0, 1_700_000_000_000}) + }) + // >2 values with small delta-of-deltas pick Gorilla (0x01). + t.Run("Gorilla", func(t *testing.T) { + run(t, []int64{1_000_000, 1_000_100, 1_000_200, 1_000_310, 1_000_520}) + }) +} + +func TestQwpDecoderRoundTripTimestampGorilla(t *testing.T) { + // >3 timestamps with small DoDs → encoder picks the Gorilla path. 
+ values := []int64{1_000_000, 1_000_100, 1_000_200, 1_000_310, 1_000_520} + rows := make([]func(*qwpColumnBuffer), len(values)) + for i, v := range values { + v := v + rows[i] = func(c *qwpColumnBuffer) { c.addLong(v) } + } + frame := encodeSingleColumnBatch(t, "ts", qwpTypeTimestamp, false, rows) + dec := newTestQueryDecoder() + var batch QwpColumnBatch + if err := dec.decode(frame, &batch); err != nil { + t.Fatalf("decode: %v", err) + } + want := []int64{1_000_000, 1_000_100, 1_000_200, 1_000_310, 1_000_520} + for i, w := range want { + if got := batch.Int64(0, i); got != w { + t.Fatalf("Int64[%d] = %d, want %d", i, got, w) + } + } +} + +func TestQwpDecoderRoundTripTimestampUncompressed(t *testing.T) { + // <= 2 timestamps force the encoder's uncompressed branch even + // with FLAG_GORILLA set. + tb := newQwpTableBuffer("t") + col, err := tb.getOrCreateColumn("ts", qwpTypeTimestamp, false) + if err != nil { + t.Fatalf("getOrCreateColumn: %v", err) + } + col.addLong(42) + tb.commitRow() + col, _ = tb.getOrCreateColumn("ts", qwpTypeTimestamp, false) + col.addLong(43) + tb.commitRow() + var enc qwpEncoder + frame := wrapAsResultBatch(enc.encodeTable(tb), 1, 0) + + dec := newTestQueryDecoder() + var batch QwpColumnBatch + if err := dec.decode(frame, &batch); err != nil { + t.Fatalf("decode: %v", err) + } + if got := batch.Int64(0, 0); got != 42 { + t.Fatalf("Int64[0] = %d", got) + } + if got := batch.Int64(0, 1); got != 43 { + t.Fatalf("Int64[1] = %d", got) + } +} + +func TestQwpDecoderRoundTripGeohash(t *testing.T) { + for _, prec := range []int{8, 40, 60} { + t.Run("prec_"+itoa(prec), func(t *testing.T) { + tb := newQwpTableBuffer("t") + // A handful of valid geohash bit patterns. Constrain to + // the requested precision by masking to the low `prec` + // bits; higher bits aren't meaningful on the wire. + mask := uint64(1)< 0) that arrives before any + // batch_seq==0 schema batch has no schema to reuse and must be + // rejected rather than misparsed. 
+ tb := newQwpTableBuffer("t") + col, _ := tb.getOrCreateColumn("a", qwpTypeLong, false) + col.addLong(10) + tb.commitRow() + var enc qwpEncoder + frame := wrapAsResultBatch(enc.encodeTable(tb), 1, 1) + + dec := newTestQueryDecoder() + var batch QwpColumnBatch + err := dec.decode(frame, &batch) + assertDecodeErrContains(t, err, "before its schema batch") +} + +// --- Hardening tests (ports of QwpResultBatchDecoderHardeningTest) --- + +// writeMinimalResultBatch builds a minimal valid RESULT_BATCH frame +// with 0 rows and 0 columns. Matches QwpResultBatchDecoderHardeningTest. +// writeMinimalResultBatch. +func writeMinimalResultBatch() []byte { + var buf bytes.Buffer + // Header + _ = binary.Write(&buf, binary.LittleEndian, qwpMagic) + buf.WriteByte(qwpVersion) + buf.WriteByte(0) // flags + // tableCount = 1, payloadLength placeholder + _ = binary.Write(&buf, binary.LittleEndian, uint16(1)) + _ = binary.Write(&buf, binary.LittleEndian, uint32(0)) + // Body + buf.WriteByte(byte(qwpMsgKindResultBatch)) + _ = binary.Write(&buf, binary.LittleEndian, uint64(1)) // requestId + putVarintBytes(&buf, 0) // batch_seq + putVarintBytes(&buf, 0) // name_len + putVarintBytes(&buf, 0) // row_count + putVarintBytes(&buf, 0) // column_count + // Patch payloadLength at offset 8. + out := buf.Bytes() + binary.LittleEndian.PutUint32(out[qwpHeaderOffsetPayloadLen:], uint32(len(out)-qwpHeaderSize)) + return out +} + +// writeMinimalResultBatchWithRawNameLenVarint injects a raw varint +// byte sequence for the table name length (the first varint after the +// batch_seq). 
+func writeMinimalResultBatchWithRawNameLenVarint(nameLenVarint []byte) []byte { + var buf bytes.Buffer + _ = binary.Write(&buf, binary.LittleEndian, qwpMagic) + buf.WriteByte(qwpVersion) + buf.WriteByte(0) + _ = binary.Write(&buf, binary.LittleEndian, uint16(1)) + _ = binary.Write(&buf, binary.LittleEndian, uint32(0)) + buf.WriteByte(byte(qwpMsgKindResultBatch)) + _ = binary.Write(&buf, binary.LittleEndian, uint64(1)) + putVarintBytes(&buf, 0) + buf.Write(nameLenVarint) + out := buf.Bytes() + binary.LittleEndian.PutUint32(out[qwpHeaderOffsetPayloadLen:], uint32(len(out)-qwpHeaderSize)) + return out +} + +// writeStringResultBatchCustom builds a RESULT_BATCH with one VARCHAR +// column, len(offsets)-1 non-null rows, and the provided offsets / +// payload stamped verbatim into the frame. Used by the offset-validation +// hardening subtests. +func writeStringResultBatchCustom(offsets []uint32, payload []byte) []byte { + nonNull := len(offsets) - 1 + var buf bytes.Buffer + _ = binary.Write(&buf, binary.LittleEndian, qwpMagic) + buf.WriteByte(qwpVersion) + buf.WriteByte(0) + _ = binary.Write(&buf, binary.LittleEndian, uint16(1)) + _ = binary.Write(&buf, binary.LittleEndian, uint32(0)) + buf.WriteByte(byte(qwpMsgKindResultBatch)) + _ = binary.Write(&buf, binary.LittleEndian, uint64(7)) + putVarintBytes(&buf, 0) + putVarintBytes(&buf, 0) + putVarintBytes(&buf, uint64(nonNull)) + putVarintBytes(&buf, 1) + putVarintBytes(&buf, 1) + buf.WriteByte('s') + buf.WriteByte(byte(qwpTypeVarchar)) + buf.WriteByte(0) + for _, off := range offsets { + _ = binary.Write(&buf, binary.LittleEndian, off) + } + buf.Write(payload) + out := buf.Bytes() + binary.LittleEndian.PutUint32(out[qwpHeaderOffsetPayloadLen:], uint32(len(out)-qwpHeaderSize)) + return out +} + +// writeStringResultBatch builds a RESULT_BATCH with one VARCHAR column, +// nonNull rows, and the given totalBytes value stamped into +// offsets[nonNull]. Used by the negative-totalBytes regression. 
+func writeStringResultBatch(nonNull int, totalBytes int32) []byte { + var buf bytes.Buffer + // Header + _ = binary.Write(&buf, binary.LittleEndian, qwpMagic) + buf.WriteByte(qwpVersion) + buf.WriteByte(0) + _ = binary.Write(&buf, binary.LittleEndian, uint16(1)) + _ = binary.Write(&buf, binary.LittleEndian, uint32(0)) + // Body + buf.WriteByte(byte(qwpMsgKindResultBatch)) + _ = binary.Write(&buf, binary.LittleEndian, uint64(7)) + putVarintBytes(&buf, 0) // batch_seq + putVarintBytes(&buf, 0) // table_name_len + putVarintBytes(&buf, uint64(nonNull)) // row_count + putVarintBytes(&buf, 1) // column_count + // Schema: column "s" : VARCHAR (egress may send STRING 0x08 but + // the encoder-side tests use VARCHAR so the shared offsets+bytes + // layout is exercised). + putVarintBytes(&buf, 1) + buf.WriteByte('s') + buf.WriteByte(byte(qwpTypeVarchar)) + // Column body: null_flag = 0 (no nulls). + buf.WriteByte(0) + // Offsets: nonNull zeros, then totalBytes. + for i := 0; i < nonNull; i++ { + _ = binary.Write(&buf, binary.LittleEndian, uint32(i*5)) + } + _ = binary.Write(&buf, binary.LittleEndian, uint32(totalBytes)) + // 5 bytes "hello" for the success case; the rejection case must + // error before reading these. + buf.WriteString("hello") + out := buf.Bytes() + binary.LittleEndian.PutUint32(out[qwpHeaderOffsetPayloadLen:], uint32(len(out)-qwpHeaderSize)) + return out +} + +func putVarintBytes(buf *bytes.Buffer, v uint64) { + tmp := make([]byte, qwpMaxVarintLen) + n := qwpPutVarint(tmp, v) + buf.Write(tmp[:n]) +} + +// itoa: base-10 int → string, without pulling in strconv at package +// level for test-only use. 
+func itoa(n int) string { + if n == 0 { + return "0" + } + neg := n < 0 + if neg { + n = -n + } + var buf [20]byte + i := len(buf) + for n > 0 { + i-- + buf[i] = byte('0' + n%10) + n /= 10 + } + if neg { + i-- + buf[i] = '-' + } + return string(buf[i:]) +} + +func TestQwpDecoderHardening(t *testing.T) { + t.Run("H1_PayloadTooShort", func(t *testing.T) { + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(make([]byte, 5), &b) + assertDecodeErrContains(t, err, "too short") + }) + + t.Run("H1a_BatchExceedsWireCap", func(t *testing.T) { + // Spec §14: RESULT_BATCH wire size is capped at 16 MiB. A + // conformant server stays under; a hostile / buggy server that + // goes over must be rejected up front, before any header, + // schema, or body bound is exercised. The frame contents do + // not need to be valid — the cap fires first. + payload := make([]byte, qwpMaxBatchSize+1) + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(payload, &b) + assertDecodeErrContains(t, err, "exceeds protocol cap") + }) + + t.Run("H1b_BatchAtWireCapAccepted", func(t *testing.T) { + // A frame whose total wire size equals the 16 MiB cap exactly + // must pass the size guard and continue into the regular + // parse. We use a minimal valid frame padded out to the cap + // via a long table name (still inside the per-table-name + // limits enforced downstream — here the parse fails for an + // unrelated reason: name_len > qwpMaxTableNameLen). The point + // of this test is only to pin that the size guard does NOT + // reject a frame at exactly qwpMaxBatchSize bytes. + buf := writeMinimalResultBatch() + // Pad with arbitrary trailing bytes so len(buf) == qwpMaxBatchSize. + // The decoder rejects on a downstream check (specifically the + // table-name-length cap or end-of-frame mismatch), not on the + // size guard, which is what this test asserts. 
+ pad := qwpMaxBatchSize - len(buf) + if pad < 0 { + t.Fatalf("minimal frame already exceeds cap (%d > %d)", len(buf), qwpMaxBatchSize) + } + buf = append(buf, make([]byte, pad)...) + if len(buf) != qwpMaxBatchSize { + t.Fatalf("padded frame has %d bytes, want %d", len(buf), qwpMaxBatchSize) + } + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(buf, &b) + // Any non-size error is fine; the test fails only if the size + // guard incorrectly rejects a frame at the cap. + if err != nil && strings.Contains(err.Error(), "exceeds protocol cap") { + t.Fatalf("frame at cap rejected by size guard: %v", err) + } + }) + + t.Run("H2_BadMagic", func(t *testing.T) { + buf := writeMinimalResultBatch() + buf[0] = 0xFF + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(buf, &b) + assertDecodeErrContains(t, err, "bad magic") + }) + + t.Run("H3_VersionMismatch", func(t *testing.T) { + // Spec §3 requires strict equality between the frame's header + // version byte and the negotiated version. The default test + // decoder is pinned to qwpVersion (= 1); any other value must + // be rejected — including a value within the supported range + // (0x02), not just 0xFF. + for _, v := range []byte{0x02, 0xFF} { + buf := writeMinimalResultBatch() + buf[4] = v + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(buf, &b) + assertDecodeErrContains(t, err, "does not match negotiated version") + } + }) + + t.Run("H3a_PayloadLengthMismatch", func(t *testing.T) { + // parseFrameHeader validates the header's declared + // payload_length against the body it actually received. A frame + // whose declared length disagrees with its size is a framing + // desync and must be rejected up front, not decoded. 
+ correct := uint32(len(writeMinimalResultBatch()) - qwpHeaderSize) + for _, declared := range []uint32{correct + 1, correct - 1, 0} { + buf := writeMinimalResultBatch() + binary.LittleEndian.PutUint32(buf[qwpHeaderOffsetPayloadLen:], declared) + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(buf, &b) + assertDecodeErrContains(t, err, "does not match body size") + } + }) + + t.Run("H4_UnexpectedMsgKind", func(t *testing.T) { + // Use a frame whose table_count matches the spoofed msg_kind so + // the per-kind RESULT_BATCH check is what fires (not the + // table_count guard that runs first inside parseFrameHeader). + // RESULT_END expects table_count=0; matches the value that + // writeQwpFrame sets. + buf := writeQwpFrame(0, buildResultEndBody(1, 0, 0)) + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(buf, &b) + assertDecodeErrContains(t, err, "expected RESULT_BATCH") + }) + + t.Run("H4a_TableCountWrongOnResultBatch", func(t *testing.T) { + // Spec §4: RESULT_BATCH must carry table_count = 1. A + // conformant decoder must reject any other value rather than + // treat it as a hint. writeMinimalResultBatch sets the field + // to 1; flip it to 0 and 5 to cover both directions. + for _, tc := range []uint16{0, 5} { + buf := writeMinimalResultBatch() + binary.LittleEndian.PutUint16( + buf[qwpHeaderOffsetTableCount:qwpHeaderOffsetTableCount+2], tc) + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(buf, &b) + assertDecodeErrContains(t, err, "table_count") + } + }) + + t.Run("H4b_TableCountNonZeroOnResultEnd", func(t *testing.T) { + // Spec §4 / §8: RESULT_END must carry table_count = 0. 
+ frame := writeQwpFrame(0, buildResultEndBody(1, 0, 0)) + binary.LittleEndian.PutUint16( + frame[qwpHeaderOffsetTableCount:qwpHeaderOffsetTableCount+2], 1) + dec := newTestQueryDecoder() + _, _, err := dec.decodeResultEnd(frame) + assertDecodeErrContains(t, err, "table_count") + }) + + t.Run("H4c_TableCountNonZeroOnQueryError", func(t *testing.T) { + // Spec §4 / §9: QUERY_ERROR must carry table_count = 0. + frame := writeQwpFrame(0, buildQueryErrorBody(1, byte(QwpStatusParseError), "bad", -1)) + binary.LittleEndian.PutUint16( + frame[qwpHeaderOffsetTableCount:qwpHeaderOffsetTableCount+2], 1) + dec := newTestQueryDecoder() + _, err := dec.decodeQueryError(frame) + assertDecodeErrContains(t, err, "table_count") + }) + + t.Run("H4d_TableCountNonZeroOnExecDone", func(t *testing.T) { + // Spec §4 / §11.6: EXEC_DONE must carry table_count = 0. + frame := writeQwpFrame(0, buildExecDoneBody(1, 0, 0)) + binary.LittleEndian.PutUint16( + frame[qwpHeaderOffsetTableCount:qwpHeaderOffsetTableCount+2], 1) + dec := newTestQueryDecoder() + _, _, err := dec.decodeExecDone(frame) + assertDecodeErrContains(t, err, "table_count") + }) + + t.Run("H4e_TableCountNonZeroOnCacheReset", func(t *testing.T) { + // Spec §4 / §11.7: CACHE_RESET must carry table_count = 0. + frame := writeQwpFrame(0, buildCacheResetBody(qwpResetMaskDict)) + binary.LittleEndian.PutUint16( + frame[qwpHeaderOffsetTableCount:qwpHeaderOffsetTableCount+2], 1) + dec := newTestQueryDecoder() + _, err := dec.decodeCacheReset(frame) + assertDecodeErrContains(t, err, "table_count") + }) + + t.Run("H6_TableNameLengthOverflowVarint", func(t *testing.T) { + // 10-byte varint with bit 63 set on byte 10. 
+ buf := writeMinimalResultBatchWithRawNameLenVarint([]byte{ + 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x01, + }) + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(buf, &b) + // Both phrasings acceptable — we fail at varintInt63 with + // "exceeds int63" or at the table-name cap with "out of + // range". + if err == nil || (!containsAny(err.Error(), []string{"int63", "table name length", "out of range"})) { + t.Fatalf("unexpected error: %v", err) + } + }) + + t.Run("H7_RowCountOutOfRange", func(t *testing.T) { + // Craft a frame whose row_count exceeds qwpMaxRowsPerBatch. + var buf bytes.Buffer + _ = binary.Write(&buf, binary.LittleEndian, qwpMagic) + buf.WriteByte(qwpVersion) + buf.WriteByte(0) + _ = binary.Write(&buf, binary.LittleEndian, uint16(1)) + _ = binary.Write(&buf, binary.LittleEndian, uint32(0)) + buf.WriteByte(byte(qwpMsgKindResultBatch)) + _ = binary.Write(&buf, binary.LittleEndian, uint64(1)) + putVarintBytes(&buf, 0) // batch_seq + putVarintBytes(&buf, 0) // name_len + putVarintBytes(&buf, uint64(qwpMaxRowsPerBatch+1)) + putVarintBytes(&buf, 0) + buf.WriteByte(0) + putVarintBytes(&buf, 0) + out := buf.Bytes() + binary.LittleEndian.PutUint32(out[qwpHeaderOffsetPayloadLen:], uint32(len(out)-qwpHeaderSize)) + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(out, &b) + assertDecodeErrContains(t, err, "row_count") + }) + + t.Run("H7a_CellCountAmplificationRejected", func(t *testing.T) { + // M3 regression. An all-null column is nearly free on the wire (a + // rowCount/8 null bitmap, zstd-compressible to almost nothing) yet + // forces a rowCount-sized index array. row_count and column_count + // are each individually within their caps here, but their product + // overruns qwpMaxCellsPerBatch — a frame that, if decoded, would + // drive a multi-GiB transient allocation. The decoder must reject + // it up front, before the per-column loop sizes any index array. 
+ // + // The frame carries the inline schema for every column but NO + // column data: a decoder that skipped the cell-count guard would + // fault later reading the first column's null section off the end + // of the buffer, never with this "cell count" error. + const rowCount = qwpMaxRowsPerBatch + columnCount := int(qwpMaxCellsPerBatch/rowCount) + 1 + if columnCount > qwpMaxColumnsPerTable { + t.Fatalf("test setup: columnCount %d exceeds the column cap", columnCount) + } + var buf bytes.Buffer + _ = binary.Write(&buf, binary.LittleEndian, qwpMagic) + buf.WriteByte(qwpVersion) + buf.WriteByte(0) + _ = binary.Write(&buf, binary.LittleEndian, uint16(1)) + _ = binary.Write(&buf, binary.LittleEndian, uint32(0)) + buf.WriteByte(byte(qwpMsgKindResultBatch)) + _ = binary.Write(&buf, binary.LittleEndian, uint64(1)) + putVarintBytes(&buf, 0) // batch_seq + putVarintBytes(&buf, 0) // table_name_len + putVarintBytes(&buf, uint64(rowCount)) // row_count + putVarintBytes(&buf, uint64(columnCount)) // column_count + // Inline schema: one tiny LONG column def each (1-byte name + + // type code). No column data follows. + for i := 0; i < columnCount; i++ { + putVarintBytes(&buf, 1) + buf.WriteByte('c') + buf.WriteByte(byte(qwpTypeLong)) + } + out := buf.Bytes() + binary.LittleEndian.PutUint32(out[qwpHeaderOffsetPayloadLen:], uint32(len(out)-qwpHeaderSize)) + + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(out, &b) + assertDecodeErrContains(t, err, "cell count") + }) + + t.Run("H7b_CellCountAtCapNotRejectedByGuard", func(t *testing.T) { + // Boundary: a batch whose cell count is exactly at the cap clears + // the guard. The frame again carries no column data, so decoding + // fails while reading the first column — proving the guard did NOT + // fire (it would have produced a "cell count" error instead) and + // that a maximal conformant batch is not rejected. 
+ const rowCount = qwpMaxRowsPerBatch + columnCount := int(qwpMaxCellsPerBatch / rowCount) // exactly at the cap + var buf bytes.Buffer + _ = binary.Write(&buf, binary.LittleEndian, qwpMagic) + buf.WriteByte(qwpVersion) + buf.WriteByte(0) + _ = binary.Write(&buf, binary.LittleEndian, uint16(1)) + _ = binary.Write(&buf, binary.LittleEndian, uint32(0)) + buf.WriteByte(byte(qwpMsgKindResultBatch)) + _ = binary.Write(&buf, binary.LittleEndian, uint64(1)) + putVarintBytes(&buf, 0) // batch_seq + putVarintBytes(&buf, 0) // table_name_len + putVarintBytes(&buf, uint64(rowCount)) // row_count + putVarintBytes(&buf, uint64(columnCount)) // column_count + for i := 0; i < columnCount; i++ { + putVarintBytes(&buf, 1) + buf.WriteByte('c') + buf.WriteByte(byte(qwpTypeLong)) + } + out := buf.Bytes() + binary.LittleEndian.PutUint32(out[qwpHeaderOffsetPayloadLen:], uint32(len(out)-qwpHeaderSize)) + + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(out, &b) + if err == nil { + t.Fatal("expected a truncation error reading the first column, got nil") + } + if containsAny(err.Error(), []string{"cell count"}) { + t.Fatalf("cell-count guard fired at the cap boundary: %v", err) + } + }) + + t.Run("H16_StringNegativeTotalBytes", func(t *testing.T) { + buf := writeStringResultBatch(1, -1) + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(buf, &b) + assertDecodeErrContains(t, err, "total bytes") + }) + + t.Run("H17_StringValidTotalBytesAccepted", func(t *testing.T) { + buf := writeStringResultBatch(1, 5) + dec := newTestQueryDecoder() + var b QwpColumnBatch + if err := dec.decode(buf, &b); err != nil { + t.Fatalf("valid totalBytes rejected: %v", err) + } + if got := b.String(0, 0); got != "hello" { + t.Fatalf("String = %q, want hello", got) + } + }) + + t.Run("H17a_StringOffsetsNotMonotonic", func(t *testing.T) { + // Row 0 spans [0, 8), row 1 spans [8, 5) — slicing would + // panic in qwpStringSlice. 
+ buf := writeStringResultBatchCustom([]uint32{0, 8, 5}, []byte("helloworld")) + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(buf, &b) + assertDecodeErrContains(t, err, "offset at index") + }) + + t.Run("H17b_StringOffsetExceedsTotalBytes", func(t *testing.T) { + // Row 0 claims to run to offset 11 but totalBytes = 10 — + // the final slice is length 10, so end=11 would panic. + buf := writeStringResultBatchCustom([]uint32{0, 11, 10}, []byte("0123456789")) + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(buf, &b) + assertDecodeErrContains(t, err, "offset at index") + }) + + t.Run("H17c_StringFirstOffsetNotZero", func(t *testing.T) { + buf := writeStringResultBatchCustom([]uint32{3, 5}, []byte("hello")) + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(buf, &b) + assertDecodeErrContains(t, err, "first offset") + }) + + t.Run("H25_UnsupportedWireTypeString", func(t *testing.T) { + // Build a minimal frame that declares one column of type + // 0x08 (old STRING; this client does not support it). 
+ var buf bytes.Buffer + _ = binary.Write(&buf, binary.LittleEndian, qwpMagic) + buf.WriteByte(qwpVersion) + buf.WriteByte(0) + _ = binary.Write(&buf, binary.LittleEndian, uint16(1)) + _ = binary.Write(&buf, binary.LittleEndian, uint32(0)) + buf.WriteByte(byte(qwpMsgKindResultBatch)) + _ = binary.Write(&buf, binary.LittleEndian, uint64(1)) + putVarintBytes(&buf, 0) // batch_seq + putVarintBytes(&buf, 0) // name_len + putVarintBytes(&buf, 1) // row_count = 1 + putVarintBytes(&buf, 1) // col_count = 1 + putVarintBytes(&buf, 1) + buf.WriteByte('s') + buf.WriteByte(0x08) // STRING — unsupported + buf.WriteByte(0) // null flag = 0 + out := buf.Bytes() + binary.LittleEndian.PutUint32(out[qwpHeaderOffsetPayloadLen:], uint32(len(out)-qwpHeaderSize)) + + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(out, &b) + assertDecodeErrContains(t, err, "unsupported wire type") + }) + + t.Run("H26_ZstdFlagOnGarbageRejected", func(t *testing.T) { + // FLAG_ZSTD set but the body after the prelude is plain + // (uncompressed) bytes — the zstd frame-header parser rejects + // as "invalid zstd frame header". Same guarantee as the old + // "not yet supported" check: a malformed or mis-flagged batch + // cannot sneak past the decoder. + buf := writeMinimalResultBatch() + buf[qwpHeaderOffsetFlags] |= qwpFlagZstd + dec := newTestQueryDecoder() + defer dec.close() + var b QwpColumnBatch + err := dec.decode(buf, &b) + assertDecodeErrContains(t, err, "invalid zstd frame header") + }) + + t.Run("H18_DeltaDictOutOfSync", func(t *testing.T) { + // Hand-build a frame with FLAG_DELTA_SYMBOL_DICT and a + // delta_start that doesn't match the (empty) decoder dict. 
+ var buf bytes.Buffer + _ = binary.Write(&buf, binary.LittleEndian, qwpMagic) + buf.WriteByte(qwpVersion) + buf.WriteByte(qwpFlagDeltaSymbolDict) + _ = binary.Write(&buf, binary.LittleEndian, uint16(1)) + _ = binary.Write(&buf, binary.LittleEndian, uint32(0)) + buf.WriteByte(byte(qwpMsgKindResultBatch)) + _ = binary.Write(&buf, binary.LittleEndian, uint64(1)) + putVarintBytes(&buf, 0) // batch_seq + // Delta dict: delta_start = 3 (should be 0), count = 0. + putVarintBytes(&buf, 3) + putVarintBytes(&buf, 0) + // Minimal table block (0 rows, 0 cols). + putVarintBytes(&buf, 0) + putVarintBytes(&buf, 0) + putVarintBytes(&buf, 0) + out := buf.Bytes() + binary.LittleEndian.PutUint32(out[qwpHeaderOffsetPayloadLen:], uint32(len(out)-qwpHeaderSize)) + + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(out, &b) + assertDecodeErrContains(t, err, "out of sync") + }) + + t.Run("H23_GorillaNonNullLessThanThree", func(t *testing.T) { + // Build a frame with FLAG_GORILLA, one TIMESTAMP column, + // nonNull=2, encoding byte 0x01 (Gorilla). Expect rejection. + var buf bytes.Buffer + _ = binary.Write(&buf, binary.LittleEndian, qwpMagic) + buf.WriteByte(qwpVersion) + buf.WriteByte(qwpFlagGorilla) + _ = binary.Write(&buf, binary.LittleEndian, uint16(1)) + _ = binary.Write(&buf, binary.LittleEndian, uint32(0)) + buf.WriteByte(byte(qwpMsgKindResultBatch)) + _ = binary.Write(&buf, binary.LittleEndian, uint64(1)) + putVarintBytes(&buf, 0) + putVarintBytes(&buf, 0) + putVarintBytes(&buf, 2) // row_count = 2 + putVarintBytes(&buf, 1) + putVarintBytes(&buf, 1) + buf.WriteByte('t') + buf.WriteByte(byte(qwpTypeTimestamp)) + buf.WriteByte(0) // null flag = 0 + buf.WriteByte(qwpTsEncodingGorilla) // 0x01 + _ = binary.Write(&buf, binary.LittleEndian, uint64(1)) + _ = binary.Write(&buf, binary.LittleEndian, uint64(2)) + // No bitstream follows — nonNull=2, so Gorilla shouldn't be + // in use. Decoder must reject before reading bitstream. 
+ out := buf.Bytes() + binary.LittleEndian.PutUint32(out[qwpHeaderOffsetPayloadLen:], uint32(len(out)-qwpHeaderSize)) + + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(out, &b) + assertDecodeErrContains(t, err, "nonNull<3") + }) + + t.Run("H8_ColumnCountOutOfRange", func(t *testing.T) { + // col_count > qwpMaxColumnsPerTable must be rejected before the + // decoder allocates per-column layouts. + var buf bytes.Buffer + _ = binary.Write(&buf, binary.LittleEndian, qwpMagic) + buf.WriteByte(qwpVersion) + buf.WriteByte(0) + _ = binary.Write(&buf, binary.LittleEndian, uint16(1)) + _ = binary.Write(&buf, binary.LittleEndian, uint32(0)) + buf.WriteByte(byte(qwpMsgKindResultBatch)) + _ = binary.Write(&buf, binary.LittleEndian, uint64(1)) + putVarintBytes(&buf, 0) // batch_seq + putVarintBytes(&buf, 0) // table_name_len + putVarintBytes(&buf, 0) // row_count + putVarintBytes(&buf, uint64(qwpMaxColumnsPerTable)+1) // col_count + out := buf.Bytes() + binary.LittleEndian.PutUint32(out[qwpHeaderOffsetPayloadLen:], uint32(len(out)-qwpHeaderSize)) + + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(out, &b) + assertDecodeErrContains(t, err, "column_count") + }) + + t.Run("H9_TableNameLengthCap", func(t *testing.T) { + // table_name_len = qwpMaxTableNameLen + 1 is a valid varint but + // exceeds the cap. The decoder must reject before slicing. 
+ var buf bytes.Buffer + _ = binary.Write(&buf, binary.LittleEndian, qwpMagic) + buf.WriteByte(qwpVersion) + buf.WriteByte(0) + _ = binary.Write(&buf, binary.LittleEndian, uint16(1)) + _ = binary.Write(&buf, binary.LittleEndian, uint32(0)) + buf.WriteByte(byte(qwpMsgKindResultBatch)) + _ = binary.Write(&buf, binary.LittleEndian, uint64(1)) + putVarintBytes(&buf, 0) // batch_seq + putVarintBytes(&buf, uint64(qwpMaxTableNameLen)+1) // name_len + out := buf.Bytes() + binary.LittleEndian.PutUint32(out[qwpHeaderOffsetPayloadLen:], uint32(len(out)-qwpHeaderSize)) + + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(out, &b) + assertDecodeErrContains(t, err, "table name length") + }) + + t.Run("H10_ColumnNameLengthCap", func(t *testing.T) { + // Full schema with a single column whose name length exceeds + // qwpMaxColumnNameLen. + var buf bytes.Buffer + _ = binary.Write(&buf, binary.LittleEndian, qwpMagic) + buf.WriteByte(qwpVersion) + buf.WriteByte(0) + _ = binary.Write(&buf, binary.LittleEndian, uint16(1)) + _ = binary.Write(&buf, binary.LittleEndian, uint32(0)) + buf.WriteByte(byte(qwpMsgKindResultBatch)) + _ = binary.Write(&buf, binary.LittleEndian, uint64(1)) + putVarintBytes(&buf, 0) // batch_seq + putVarintBytes(&buf, 0) // table_name_len + putVarintBytes(&buf, 0) // row_count + putVarintBytes(&buf, 1) // col_count = 1 + putVarintBytes(&buf, uint64(qwpMaxColumnNameLen)+1) // col name_len + out := buf.Bytes() + binary.LittleEndian.PutUint32(out[qwpHeaderOffsetPayloadLen:], uint32(len(out)-qwpHeaderSize)) + + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(out, &b) + assertDecodeErrContains(t, err, "column name length") + }) + + t.Run("H19_DeltaDictRangeOverflow", func(t *testing.T) { + // delta_start + delta_count must stay inside uint32; the Java + // reference decoder and this one both reject the overflow case + // before doing any per-entry reads. 
+ var buf bytes.Buffer + _ = binary.Write(&buf, binary.LittleEndian, qwpMagic) + buf.WriteByte(qwpVersion) + buf.WriteByte(qwpFlagDeltaSymbolDict) + _ = binary.Write(&buf, binary.LittleEndian, uint16(1)) + _ = binary.Write(&buf, binary.LittleEndian, uint32(0)) + buf.WriteByte(byte(qwpMsgKindResultBatch)) + _ = binary.Write(&buf, binary.LittleEndian, uint64(1)) + putVarintBytes(&buf, 0) // batch_seq + putVarintBytes(&buf, 0) // delta_start + putVarintBytes(&buf, uint64(1)<<32) // delta_count = 2^32 (overflows uint32) + out := buf.Bytes() + binary.LittleEndian.PutUint32(out[qwpHeaderOffsetPayloadLen:], uint32(len(out)-qwpHeaderSize)) + + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(out, &b) + assertDecodeErrContains(t, err, "delta symbol section") + }) + + t.Run("H27_ArrayNegativeDim", func(t *testing.T) { + // DOUBLE_ARRAY column, row_count=1, non-null, nDims=1, + // shape[0] = -1 (as int32). The decoder must reject. + frame := buildArrayHardeningFrame(t, 1, []int32{-1}) + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(frame, &b) + assertDecodeErrContains(t, err, "ARRAY dim") + }) + + t.Run("H27b_ArrayZeroDim", func(t *testing.T) { + // shape[0] = 0. A zero-extent dimension would zero out the + // element count and short-circuit the qwpMaxArrayElements cap + // for any remaining dimensions. The encoder never emits dl == 0; + // the decoder must reject it (matches Java's dl < 1 guard). + frame := buildArrayHardeningFrame(t, 1, []int32{0}) + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(frame, &b) + assertDecodeErrContains(t, err, "ARRAY dim") + }) + + t.Run("H28_ArrayElementCountExceeded", func(t *testing.T) { + // Two dims whose product overflows qwpMaxArrayElements. 
+ big := int32(1<<20 + 1) + frame := buildArrayHardeningFrame(t, 2, []int32{big, big}) + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(frame, &b) + assertDecodeErrContains(t, err, "element count") + }) + + t.Run("H29_ArrayNDimsOutOfRange", func(t *testing.T) { + // nDims > qwpMaxArrayNDims is still rejected. + frame := buildArrayHardeningFrame(t, qwpMaxArrayNDims+1, nil) + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(frame, &b) + assertDecodeErrContains(t, err, "ARRAY nDims") + }) + + t.Run("H29b_ArrayNDimsZeroRejected", func(t *testing.T) { + // The server always encodes NULL arrays via the null bitmap, so + // an inline nDims=0 on a row the bitmap marked non-null is a + // malformed frame. The decoder must reject it. + frame := buildArrayHardeningFrame(t, 0, nil) + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(frame, &b) + assertDecodeErrContains(t, err, "ARRAY nDims") + }) + + t.Run("H30_GeohashPrecisionOutOfRange", func(t *testing.T) { + // GEOHASH column, row_count=0 so no data follows, but the + // precision varint is read up front and must be <= 60. 
+ var buf bytes.Buffer + _ = binary.Write(&buf, binary.LittleEndian, qwpMagic) + buf.WriteByte(qwpVersion) + buf.WriteByte(0) + _ = binary.Write(&buf, binary.LittleEndian, uint16(1)) + _ = binary.Write(&buf, binary.LittleEndian, uint32(0)) + buf.WriteByte(byte(qwpMsgKindResultBatch)) + _ = binary.Write(&buf, binary.LittleEndian, uint64(1)) + putVarintBytes(&buf, 0) // batch_seq + putVarintBytes(&buf, 0) // table_name_len + putVarintBytes(&buf, 0) // row_count + putVarintBytes(&buf, 1) // col_count + putVarintBytes(&buf, 1) // col name_len + buf.WriteByte('g') + buf.WriteByte(byte(qwpTypeGeohash)) + buf.WriteByte(0) // null flag + putVarintBytes(&buf, 61) // precision > 60 + out := buf.Bytes() + binary.LittleEndian.PutUint32(out[qwpHeaderOffsetPayloadLen:], uint32(len(out)-qwpHeaderSize)) + + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(out, &b) + assertDecodeErrContains(t, err, "geohash precision") + }) + + t.Run("H30b_GeohashPrecisionZero", func(t *testing.T) { + // Lower bound: precision must be >= 1. The server enforces + // [1, 60] on GEOLONG precision; a zero would drive + // bytesPerValue = 0 into the length calculation. Mirror Java's + // varintValue < 1 guard. 
+ var buf bytes.Buffer + _ = binary.Write(&buf, binary.LittleEndian, qwpMagic) + buf.WriteByte(qwpVersion) + buf.WriteByte(0) + _ = binary.Write(&buf, binary.LittleEndian, uint16(1)) + _ = binary.Write(&buf, binary.LittleEndian, uint32(0)) + buf.WriteByte(byte(qwpMsgKindResultBatch)) + _ = binary.Write(&buf, binary.LittleEndian, uint64(1)) + putVarintBytes(&buf, 0) // batch_seq + putVarintBytes(&buf, 0) // table_name_len + putVarintBytes(&buf, 0) // row_count + putVarintBytes(&buf, 1) // col_count + putVarintBytes(&buf, 1) // col name_len + buf.WriteByte('g') + buf.WriteByte(byte(qwpTypeGeohash)) + buf.WriteByte(0) // null flag + putVarintBytes(&buf, 0) // precision < 1 + out := buf.Bytes() + binary.LittleEndian.PutUint32(out[qwpHeaderOffsetPayloadLen:], uint32(len(out)-qwpHeaderSize)) + + dec := newTestQueryDecoder() + var b QwpColumnBatch + err := dec.decode(out, &b) + assertDecodeErrContains(t, err, "geohash precision") + }) +} + +// buildArrayHardeningFrame crafts a minimal RESULT_BATCH carrying a +// single DOUBLE_ARRAY column with one non-null row whose per-row entry +// is (nDims byte, shape int32s, then as many float64 elements as the +// shape's product). This is enough to exercise the array-section +// hardening branches. +func buildArrayHardeningFrame(t *testing.T, nDims int, shape []int32) []byte { + t.Helper() + var buf bytes.Buffer + _ = binary.Write(&buf, binary.LittleEndian, qwpMagic) + buf.WriteByte(qwpVersion) + buf.WriteByte(0) + _ = binary.Write(&buf, binary.LittleEndian, uint16(1)) + _ = binary.Write(&buf, binary.LittleEndian, uint32(0)) + buf.WriteByte(byte(qwpMsgKindResultBatch)) + _ = binary.Write(&buf, binary.LittleEndian, uint64(1)) + putVarintBytes(&buf, 0) // batch_seq + putVarintBytes(&buf, 0) // table_name_len + putVarintBytes(&buf, 1) // row_count = 1 + putVarintBytes(&buf, 1) // col_count = 1 + putVarintBytes(&buf, 1) + buf.WriteByte('a') + buf.WriteByte(byte(qwpTypeDoubleArray)) + buf.WriteByte(0) // null flag + // Row body. 
+ buf.WriteByte(byte(nDims)) + for _, d := range shape { + _ = binary.Write(&buf, binary.LittleEndian, d) + } + // The decoder rejects on the shape/nDims check before reading any + // element bytes, so we don't need to append them for those paths. + // Append zero padding just to avoid a truncated-frame error + // masking the real one. + buf.Write(make([]byte, 8)) + out := buf.Bytes() + binary.LittleEndian.PutUint32(out[qwpHeaderOffsetPayloadLen:], uint32(len(out)-qwpHeaderSize)) + return out +} + +// writeQwpFrame builds a complete QWP frame: a 12-byte header with the +// given flags plus the supplied body bytes. The body must start with the +// msg_kind byte. payload_length is patched in; table_count is written +// as 0, which spec §4 mandates for every non-RESULT_BATCH kind. +func writeQwpFrame(flags byte, body []byte) []byte { + var buf bytes.Buffer + _ = binary.Write(&buf, binary.LittleEndian, qwpMagic) + buf.WriteByte(qwpVersion) + buf.WriteByte(flags) + _ = binary.Write(&buf, binary.LittleEndian, uint16(0)) // table_count + _ = binary.Write(&buf, binary.LittleEndian, uint32(0)) // payload_length placeholder + buf.Write(body) + out := buf.Bytes() + binary.LittleEndian.PutUint32(out[qwpHeaderOffsetPayloadLen:], + uint32(len(out)-qwpHeaderSize)) + return out +} + +// buildResultEndBody assembles a RESULT_END body given requestId, +// finalSeq, and totalRows. Returns the msg_kind byte followed by the +// fixed and varint fields (no header). +func buildResultEndBody(requestId int64, finalSeq uint64, totalRows uint64) []byte { + var buf bytes.Buffer + buf.WriteByte(byte(qwpMsgKindResultEnd)) + _ = binary.Write(&buf, binary.LittleEndian, uint64(requestId)) + putVarintBytes(&buf, finalSeq) + putVarintBytes(&buf, totalRows) + return buf.Bytes() +} + +// buildQueryErrorBody assembles a QUERY_ERROR body. rawMsgLen overrides +// the msg_len field on the wire (used to inject hostile values); pass -1 +// to fall back to len(msg). 
+func buildQueryErrorBody(requestId int64, status byte, msg string, rawMsgLen int) []byte { + var buf bytes.Buffer + buf.WriteByte(byte(qwpMsgKindQueryError)) + _ = binary.Write(&buf, binary.LittleEndian, uint64(requestId)) + buf.WriteByte(status) + msgLen := uint16(len(msg)) + if rawMsgLen >= 0 { + msgLen = uint16(rawMsgLen) + } + _ = binary.Write(&buf, binary.LittleEndian, msgLen) + buf.WriteString(msg) + return buf.Bytes() +} + +// buildExecDoneBody assembles an EXEC_DONE body. +func buildExecDoneBody(requestId int64, opType byte, rowsAffected uint64) []byte { + var buf bytes.Buffer + buf.WriteByte(byte(qwpMsgKindExecDone)) + _ = binary.Write(&buf, binary.LittleEndian, uint64(requestId)) + buf.WriteByte(opType) + putVarintBytes(&buf, rowsAffected) + return buf.Bytes() +} + +func TestQwpDecoderResultEnd(t *testing.T) { + t.Run("RoundTrip", func(t *testing.T) { + frame := writeQwpFrame(0, buildResultEndBody(42, 7, 1234)) + dec := newTestQueryDecoder() + reqId, total, err := dec.decodeResultEnd(frame) + if err != nil { + t.Fatalf("decodeResultEnd: %v", err) + } + if reqId != 42 { + t.Fatalf("requestId = %d, want 42", reqId) + } + if total != 1234 { + t.Fatalf("totalRows = %d, want 1234", total) + } + }) + + t.Run("ZeroRows", func(t *testing.T) { + frame := writeQwpFrame(0, buildResultEndBody(1, 0, 0)) + dec := newTestQueryDecoder() + _, total, err := dec.decodeResultEnd(frame) + if err != nil { + t.Fatalf("decodeResultEnd: %v", err) + } + if total != 0 { + t.Fatalf("totalRows = %d, want 0", total) + } + }) + + t.Run("WrongMsgKind", func(t *testing.T) { + body := buildResultEndBody(1, 0, 0) + body[0] = byte(qwpMsgKindExecDone) + frame := writeQwpFrame(0, body) + dec := newTestQueryDecoder() + _, _, err := dec.decodeResultEnd(frame) + assertDecodeErrContains(t, err, "expected RESULT_END") + }) + + t.Run("TruncatedBeforeRequestId", func(t *testing.T) { + // Header + msg_kind only. 
+ frame := writeQwpFrame(0, []byte{byte(qwpMsgKindResultEnd)}) + dec := newTestQueryDecoder() + _, _, err := dec.decodeResultEnd(frame) + assertDecodeErrContains(t, err, "end of buffer") + }) + + t.Run("TruncatedBeforeFinalSeq", func(t *testing.T) { + var body bytes.Buffer + body.WriteByte(byte(qwpMsgKindResultEnd)) + _ = binary.Write(&body, binary.LittleEndian, uint64(1)) + frame := writeQwpFrame(0, body.Bytes()) + dec := newTestQueryDecoder() + _, _, err := dec.decodeResultEnd(frame) + assertDecodeErrContains(t, err, "truncated") + }) + + t.Run("TotalRowsVarintOverflow", func(t *testing.T) { + // 10 bytes with continuation bit through byte 9 and a value + // bit past bit 63 — rejects at readVarint's overflow guard. + var body bytes.Buffer + body.WriteByte(byte(qwpMsgKindResultEnd)) + _ = binary.Write(&body, binary.LittleEndian, uint64(1)) + putVarintBytes(&body, 0) // final_seq = 0 + body.Write([]byte{ + 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, + }) + frame := writeQwpFrame(0, body.Bytes()) + dec := newTestQueryDecoder() + _, _, err := dec.decodeResultEnd(frame) + if err == nil { + t.Fatal("expected varint overflow error, got nil") + } + }) + + t.Run("BadMagic", func(t *testing.T) { + frame := writeQwpFrame(0, buildResultEndBody(1, 0, 0)) + frame[0] = 0xFF + dec := newTestQueryDecoder() + _, _, err := dec.decodeResultEnd(frame) + assertDecodeErrContains(t, err, "bad magic") + }) + + t.Run("ZstdFlagRejected", func(t *testing.T) { + // FLAG_ZSTD is only valid on RESULT_BATCH; carrying it on a + // RESULT_END frame is a protocol violation that the decoder + // catches at the top of decodeResultEnd. 
+ frame := writeQwpFrame(qwpFlagZstd, buildResultEndBody(1, 0, 0)) + dec := newTestQueryDecoder() + _, _, err := dec.decodeResultEnd(frame) + assertDecodeErrContains(t, err, "FLAG_ZSTD set on non-RESULT_BATCH") + }) +} + +func TestQwpDecoderQueryError(t *testing.T) { + // Port of Java QwpResultBatchDecoderHardeningTest.testQueryErrorValidMessageDecodes. + t.Run("ValidMessageDecodes", func(t *testing.T) { + frame := writeQwpFrame(0, buildQueryErrorBody(99, 0x05, "boom", -1)) + dec := newTestQueryDecoder() + qe, err := dec.decodeQueryError(frame) + if err != nil { + t.Fatalf("decodeQueryError: %v", err) + } + if qe.RequestId != 99 { + t.Fatalf("RequestId = %d, want 99", qe.RequestId) + } + if qe.Status != QwpStatusCode(0x05) { + t.Fatalf("Status = 0x%02X, want 0x05", byte(qe.Status)) + } + if qe.Message != "boom" { + t.Fatalf("Message = %q, want %q", qe.Message, "boom") + } + }) + + // Port of Java testQueryErrorMsgLenOverrunIsRejected: msgLen claims + // 0xFFFF but the frame has no bytes of message. 
+ t.Run("MsgLenOverrunRejected", func(t *testing.T) { + frame := writeQwpFrame(0, buildQueryErrorBody(0, 0, "", 0xFFFF)) + dec := newTestQueryDecoder() + _, err := dec.decodeQueryError(frame) + assertDecodeErrContains(t, err, "msg_len") + if !strings.Contains(err.Error(), "exceeds") { + t.Fatalf("expected 'exceeds' in error, got: %v", err) + } + }) + + t.Run("EmptyMessage", func(t *testing.T) { + frame := writeQwpFrame(0, buildQueryErrorBody(1, byte(qwpStatusCancelled), "", -1)) + dec := newTestQueryDecoder() + qe, err := dec.decodeQueryError(frame) + if err != nil { + t.Fatalf("decodeQueryError: %v", err) + } + if qe.Status != qwpStatusCancelled { + t.Fatalf("Status = 0x%02X, want CANCELLED", byte(qe.Status)) + } + if qe.Message != "" { + t.Fatalf("Message = %q, want empty", qe.Message) + } + }) + + t.Run("CancelledStatusSurfaces", func(t *testing.T) { + frame := writeQwpFrame(0, buildQueryErrorBody(1, byte(qwpStatusCancelled), + "query cancelled", -1)) + dec := newTestQueryDecoder() + qe, err := dec.decodeQueryError(frame) + if err != nil { + t.Fatalf("decodeQueryError: %v", err) + } + // Error() must mention CANCELLED and the message. + if s := qe.Error(); !strings.Contains(s, "CANCELLED") || + !strings.Contains(s, "query cancelled") { + t.Fatalf("Error() = %q, missing status name or message", s) + } + }) + + t.Run("LimitExceededStatusSurfaces", func(t *testing.T) { + frame := writeQwpFrame(0, buildQueryErrorBody(1, byte(qwpStatusLimitExceeded), + "rows cap hit", -1)) + dec := newTestQueryDecoder() + qe, err := dec.decodeQueryError(frame) + if err != nil { + t.Fatalf("decodeQueryError: %v", err) + } + if qe.Status != qwpStatusLimitExceeded { + t.Fatalf("Status = 0x%02X, want LIMIT_EXCEEDED", byte(qe.Status)) + } + }) + + t.Run("WrongMsgKind", func(t *testing.T) { + // Use a non-RESULT_BATCH stand-in (RESULT_END, table_count=0) + // so the per-kind QUERY_ERROR check is what fires, not the + // table_count guard inside parseFrameHeader. 
+ body := buildQueryErrorBody(1, 0x05, "x", -1) + body[0] = byte(qwpMsgKindResultEnd) + frame := writeQwpFrame(0, body) + dec := newTestQueryDecoder() + _, err := dec.decodeQueryError(frame) + assertDecodeErrContains(t, err, "expected QUERY_ERROR") + }) + + t.Run("TruncatedBeforeStatus", func(t *testing.T) { + var body bytes.Buffer + body.WriteByte(byte(qwpMsgKindQueryError)) + _ = binary.Write(&body, binary.LittleEndian, uint64(1)) + frame := writeQwpFrame(0, body.Bytes()) + dec := newTestQueryDecoder() + _, err := dec.decodeQueryError(frame) + assertDecodeErrContains(t, err, "end of buffer") + }) + + t.Run("TruncatedBeforeMsgLen", func(t *testing.T) { + var body bytes.Buffer + body.WriteByte(byte(qwpMsgKindQueryError)) + _ = binary.Write(&body, binary.LittleEndian, uint64(1)) + body.WriteByte(0x05) + // Only 1 byte after status — msg_len needs 2. + body.WriteByte(0x00) + frame := writeQwpFrame(0, body.Bytes()) + dec := newTestQueryDecoder() + _, err := dec.decodeQueryError(frame) + assertDecodeErrContains(t, err, "end of buffer") + }) + + t.Run("UnicodeMessage", func(t *testing.T) { + msg := "ünïcødé ⚠" + frame := writeQwpFrame(0, buildQueryErrorBody(1, 0x06, msg, -1)) + dec := newTestQueryDecoder() + qe, err := dec.decodeQueryError(frame) + if err != nil { + t.Fatalf("decodeQueryError: %v", err) + } + if qe.Message != msg { + t.Fatalf("Message = %q, want %q", qe.Message, msg) + } + }) +} + +func TestQwpDecoderExecDone(t *testing.T) { + t.Run("RoundTrip", func(t *testing.T) { + frame := writeQwpFrame(0, buildExecDoneBody(100, 0x04, 42)) + dec := newTestQueryDecoder() + reqId, res, err := dec.decodeExecDone(frame) + if err != nil { + t.Fatalf("decodeExecDone: %v", err) + } + if reqId != 100 { + t.Fatalf("requestId = %d, want 100", reqId) + } + if res.OpType != 0x04 { + t.Fatalf("OpType = 0x%02X, want 0x04", res.OpType) + } + if res.RowsAffected != 42 { + t.Fatalf("RowsAffected = %d, want 42", res.RowsAffected) + } + }) + + t.Run("PureDDLZeroRows", func(t 
*testing.T) { + frame := writeQwpFrame(0, buildExecDoneBody(1, 0x01, 0)) + dec := newTestQueryDecoder() + _, res, err := dec.decodeExecDone(frame) + if err != nil { + t.Fatalf("decodeExecDone: %v", err) + } + if res.RowsAffected != 0 { + t.Fatalf("RowsAffected = %d, want 0", res.RowsAffected) + } + }) + + t.Run("WrongMsgKind", func(t *testing.T) { + body := buildExecDoneBody(1, 0x01, 0) + body[0] = byte(qwpMsgKindQueryError) + frame := writeQwpFrame(0, body) + dec := newTestQueryDecoder() + _, _, err := dec.decodeExecDone(frame) + assertDecodeErrContains(t, err, "expected EXEC_DONE") + }) + + t.Run("TruncatedBeforeOpType", func(t *testing.T) { + var body bytes.Buffer + body.WriteByte(byte(qwpMsgKindExecDone)) + _ = binary.Write(&body, binary.LittleEndian, uint64(1)) + frame := writeQwpFrame(0, body.Bytes()) + dec := newTestQueryDecoder() + _, _, err := dec.decodeExecDone(frame) + assertDecodeErrContains(t, err, "end of buffer") + }) + + t.Run("TruncatedBeforeRowsAffected", func(t *testing.T) { + var body bytes.Buffer + body.WriteByte(byte(qwpMsgKindExecDone)) + _ = binary.Write(&body, binary.LittleEndian, uint64(1)) + body.WriteByte(0x04) + frame := writeQwpFrame(0, body.Bytes()) + dec := newTestQueryDecoder() + _, _, err := dec.decodeExecDone(frame) + assertDecodeErrContains(t, err, "truncated") + }) + + t.Run("RowsAffectedVarintOverflow", func(t *testing.T) { + var body bytes.Buffer + body.WriteByte(byte(qwpMsgKindExecDone)) + _ = binary.Write(&body, binary.LittleEndian, uint64(1)) + body.WriteByte(0x04) + body.Write([]byte{ + 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, + }) + frame := writeQwpFrame(0, body.Bytes()) + dec := newTestQueryDecoder() + _, _, err := dec.decodeExecDone(frame) + if err == nil { + t.Fatal("expected varint overflow error, got nil") + } + }) + + t.Run("RowsAffectedInt63Overflow", func(t *testing.T) { + // 10-byte varint encoding exactly 2^63 — a valid uint64 but + // readVarintInt63 rejects because the int64 cast 
sign-flips. + // 9 continuation bytes of zero, then 0x01 (bit 63). + var body bytes.Buffer + body.WriteByte(byte(qwpMsgKindExecDone)) + _ = binary.Write(&body, binary.LittleEndian, uint64(1)) + body.WriteByte(0x04) + body.Write([]byte{ + 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x01, + }) + frame := writeQwpFrame(0, body.Bytes()) + dec := newTestQueryDecoder() + _, _, err := dec.decodeExecDone(frame) + assertDecodeErrContains(t, err, "int63") + }) +} + +// buildCacheResetBody assembles a CACHE_RESET body: msg_kind + 1-byte +// reset_mask. Returned bytes are ready to drop into writeQwpFrame. +func buildCacheResetBody(mask byte) []byte { + return []byte{byte(qwpMsgKindCacheReset), mask} +} + +func TestQwpDecoderCacheReset(t *testing.T) { + t.Run("RoundTripMaskValues", func(t *testing.T) { + // The defined dict bit, a reserved bit (0x02, formerly the + // schemas bit), their combination, and the zero reset. The + // decoder surfaces the byte verbatim — the I/O layer is what + // maps bits to cache clears. + for _, mask := range []byte{ + 0x00, + qwpResetMaskDict, + 0x02, + qwpResetMaskDict | 0x02, + } { + frame := writeQwpFrame(0, buildCacheResetBody(mask)) + dec := newTestQueryDecoder() + got, err := dec.decodeCacheReset(frame) + if err != nil { + t.Fatalf("mask=0x%02X: decodeCacheReset: %v", mask, err) + } + if got != mask { + t.Fatalf("mask=0x%02X: got 0x%02X", mask, got) + } + } + }) + + t.Run("UnknownMaskBitsPreserved", func(t *testing.T) { + // The decoder must not filter unknown bits — a future server + // extension may introduce new bits, and rejecting them would + // make forward compatibility impossible. Caller (applyCacheReset) + // ignores bits it does not recognise; decode preserves them. 
+ frame := writeQwpFrame(0, buildCacheResetBody(0xFF)) + dec := newTestQueryDecoder() + got, err := dec.decodeCacheReset(frame) + if err != nil { + t.Fatalf("decodeCacheReset: %v", err) + } + if got != 0xFF { + t.Fatalf("mask=0x%02X, want 0xFF", got) + } + }) + + t.Run("WrongMsgKind", func(t *testing.T) { + body := buildCacheResetBody(qwpResetMaskDict) + body[0] = byte(qwpMsgKindResultEnd) + frame := writeQwpFrame(0, body) + dec := newTestQueryDecoder() + _, err := dec.decodeCacheReset(frame) + assertDecodeErrContains(t, err, "expected CACHE_RESET") + }) + + t.Run("TruncatedBeforeMask", func(t *testing.T) { + // Header + msg_kind only, reset_mask missing. Java mirrors this + // with "CACHE_RESET frame truncated before reset_mask". + frame := writeQwpFrame(0, []byte{byte(qwpMsgKindCacheReset)}) + dec := newTestQueryDecoder() + _, err := dec.decodeCacheReset(frame) + assertDecodeErrContains(t, err, "truncated before reset_mask") + }) + + t.Run("BadMagic", func(t *testing.T) { + frame := writeQwpFrame(0, buildCacheResetBody(qwpResetMaskDict)) + frame[0] = 0xFF + dec := newTestQueryDecoder() + _, err := dec.decodeCacheReset(frame) + assertDecodeErrContains(t, err, "bad magic") + }) + + t.Run("ZstdFlagRejected", func(t *testing.T) { + // CACHE_RESET is a 2-byte control frame; FLAG_ZSTD is only + // valid on RESULT_BATCH. Match the other non-RESULT_BATCH + // decoder guards. + frame := writeQwpFrame(qwpFlagZstd, buildCacheResetBody(qwpResetMaskDict)) + dec := newTestQueryDecoder() + _, err := dec.decodeCacheReset(frame) + assertDecodeErrContains(t, err, "FLAG_ZSTD set on non-RESULT_BATCH") + }) +} + +func TestQwpDecoderApplyCacheReset(t *testing.T) { + // Decode a frame that populates the connection dict (delta with + // three symbols), then exercise applyCacheReset with each mask and + // assert the dict is cleared only when the dict bit is set. The + // schema is per-query (reset at query start), not a connection + // cache, so CACHE_RESET no longer touches it. 
+ seedDecoder := func() qwpQueryDecoder { + globalDict := []string{"AAPL", "MSFT", "GOOG"} + tb := newQwpTableBuffer("t") + for _, id := range []int32{0, 1, 2} { + col, _ := tb.getOrCreateColumn("s", qwpTypeSymbol, false) + col.addSymbolID(id) + tb.commitRow() + } + var enc qwpEncoder + ingress := enc.encodeTableWithDeltaDict(tb, globalDict, -1, 2) + frame := wrapAsResultBatch(ingress, 1, 0) + dec := newTestQueryDecoder() + var b QwpColumnBatch + if err := dec.decode(frame, &b); err != nil { + t.Fatalf("seed decode: %v", err) + } + if dec.dict.size() != 3 { + t.Fatalf("seed dict size = %d, want 3", dec.dict.size()) + } + return dec + } + + t.Run("MaskZeroIsNoOp", func(t *testing.T) { + dec := seedDecoder() + dec.applyCacheReset(0) + if dec.dict.size() != 3 { + t.Errorf("dict mutated by zero mask: size=%d", dec.dict.size()) + } + }) + + t.Run("DictBitClearsDict", func(t *testing.T) { + dec := seedDecoder() + dec.applyCacheReset(qwpResetMaskDict) + if dec.dict.size() != 0 { + t.Errorf("dict not cleared: size=%d", dec.dict.size()) + } + }) + + t.Run("UnknownBitsIgnored", func(t *testing.T) { + // 0xF0 touches none of the defined reset bits — the dict must be + // preserved for forward compat. 0x02 (formerly the schemas bit) + // is now reserved and likewise clears nothing. + dec := seedDecoder() + dec.applyCacheReset(0xF0) + if dec.dict.size() != 3 { + t.Errorf("dict cleared by unknown bits: size=%d", dec.dict.size()) + } + dec.applyCacheReset(0x02) + if dec.dict.size() != 3 { + t.Errorf("dict cleared by reserved bit 0x02: size=%d", dec.dict.size()) + } + }) +} + +// TestQwpConnDictClearDetachesSnapshot documents the core safety +// invariant of qwpConnDict.clear: a snapshot a user handler is still +// iterating on a prior batch must keep reading the original bytes, +// even after clear() followed by a fresh appendDelta fills the new +// backing array. [:0] reuse would fail this test because the new +// symbols would overwrite the old heap region the snapshot aliases. 
+func TestQwpConnDictClearDetachesSnapshot(t *testing.T) { + var dict qwpConnDict + + // Prime with three symbols. + seedBytes := buildDeltaBytes(0, []string{"AAPL", "MSFT", "GOOG"}) + var br qwpByteReader + br.reset(seedBytes) + if err := dict.appendDelta(&br); err != nil { + t.Fatalf("seed appendDelta: %v", err) + } + + // Take a snapshot — simulates a user handler iterating a batch. + snap := dict.snapshot() + + // Reset and append different symbols — these must land in a fresh + // backing array so the snapshot's heap remains untouched. + dict.clear() + replacementBytes := buildDeltaBytes(0, []string{"ZZZZ", "YYYY", "XXXX"}) + br.reset(replacementBytes) + if err := dict.appendDelta(&br); err != nil { + t.Fatalf("post-clear appendDelta: %v", err) + } + + want := []string{"AAPL", "MSFT", "GOOG"} + for i, w := range want { + e := snap.entries[i] + got := string(snap.heap[e.offset : e.offset+e.length]) + if got != w { + t.Fatalf("snapshot[%d] = %q, want %q (clear did not detach snapshot)", i, got, w) + } + } +} + +// TestQwpConnDictClearPreservesCapacity checks that clear() retains +// the backing-array capacity so a workload that churns just above the +// server's soft cap does not reallocate on every CACHE_RESET. The +// invariant matches the Java client's QwpResultBatchDecoder comment +// on applyCacheReset. +func TestQwpConnDictClearPreservesCapacity(t *testing.T) { + var dict qwpConnDict + // Grow the dict to a non-trivial size so cap is well above the + // initial empty. 
+ var br qwpByteReader + br.reset(buildDeltaBytes(0, []string{"AAAA", "BBBB", "CCCC", "DDDD"})) + if err := dict.appendDelta(&br); err != nil { + t.Fatalf("seed appendDelta: %v", err) + } + heapCapBefore := cap(dict.heap) + entriesCapBefore := cap(dict.entries) + if heapCapBefore == 0 || entriesCapBefore == 0 { + t.Fatalf("precondition: caps must be non-zero (heap=%d entries=%d)", + heapCapBefore, entriesCapBefore) + } + + dict.clear() + + if cap(dict.heap) < heapCapBefore { + t.Errorf("heap cap shrunk after clear: before=%d after=%d", + heapCapBefore, cap(dict.heap)) + } + if cap(dict.entries) < entriesCapBefore { + t.Errorf("entries cap shrunk after clear: before=%d after=%d", + entriesCapBefore, cap(dict.entries)) + } + if len(dict.heap) != 0 || len(dict.entries) != 0 { + t.Errorf("cleared dict not empty: heap=%d entries=%d", + len(dict.heap), len(dict.entries)) + } +} + +// TestQwpConnDictRejectsOversizedDeltaCount verifies the per-connection +// entry-count cap blocks a hostile (or buggy) server frame that would +// otherwise grow the dict past the bound the uint32 entry offset assumes. +func TestQwpConnDictRejectsOversizedDeltaCount(t *testing.T) { + var dict qwpConnDict + var buf bytes.Buffer + putVarintBytes(&buf, 0) // deltaStart + // One past the cap — server-side framing must reject this even + // before we try to allocate. + putVarintBytes(&buf, uint64(qwpMaxConnDictSize)+1) + var br qwpByteReader + br.reset(buf.Bytes()) + err := dict.appendDelta(&br) + if err == nil || !strings.Contains(err.Error(), "out of range") { + t.Fatalf("expected out-of-range error, got %v", err) + } +} + +// TestQwpConnDictRejectsOversizedHeap verifies the per-connection heap +// cap blocks a single delta entry whose length would push the heap past +// the cap. Tests with a synthetic short header — the appendDelta loop +// must check before allocating, since uint32 offset overflow on the +// next entry would be silent corruption. 
+func TestQwpConnDictRejectsOversizedHeap(t *testing.T) { + var dict qwpConnDict + var buf bytes.Buffer + putVarintBytes(&buf, 0) // deltaStart + putVarintBytes(&buf, 1) // deltaCount = 1 + // Advertise an entry length larger than the heap cap. Only the + // header is read; the loop must reject on the cap check before + // looking at the body. + putVarintBytes(&buf, uint64(qwpMaxConnDictHeapBytes)+1) + var br qwpByteReader + br.reset(buf.Bytes()) + err := dict.appendDelta(&br) + if err == nil || !strings.Contains(err.Error(), "exceeds cap") { + t.Fatalf("expected exceeds-cap error, got %v", err) + } +} + +// buildDeltaBytes emits a (deltaStart + deltaCount + per-entry +// len+bytes) block as appendDelta expects to read. +func buildDeltaBytes(deltaStart int, entries []string) []byte { + var buf bytes.Buffer + putVarintBytes(&buf, uint64(deltaStart)) + putVarintBytes(&buf, uint64(len(entries))) + for _, s := range entries { + putVarintBytes(&buf, uint64(len(s))) + buf.WriteString(s) + } + return buf.Bytes() +} + +func assertDecodeErrContains(t *testing.T, err error, substr string) { + t.Helper() + if err == nil { + t.Fatalf("expected error containing %q, got nil", substr) + } + if !strings.Contains(err.Error(), substr) { + t.Fatalf("error %q does not contain %q", err.Error(), substr) + } +} + +func containsAny(haystack string, needles []string) bool { + for _, n := range needles { + if strings.Contains(haystack, n) { + return true + } + } + return false +} + +// typeCodeName is a test-local pretty-printer for qwpTypeCode values, +// kept as a free function so it doesn't attach a String() method to +// the production type (which would alter fmt.%v output during tests). 
+func typeCodeName(t qwpTypeCode) string { + switch t { + case qwpTypeVarchar: + return "VARCHAR" + case qwpTypeBinary: + return "BINARY" + case qwpTypeSymbol: + return "SYMBOL" + default: + return "TYPE_" + itoa(int(t)) + } +} + +// --- zstd helpers --- + +// zstdCompressForTest compresses src using a real klauspost encoder +// configured to always write the FrameContentSize field, matching what +// the server's libzstd encoder produces with its default +// ZSTD_c_contentSizeFlag=on. +// +// WithSingleSegment(true) is required because klauspost omits the FCS +// field for frames <256 bytes in multi-segment mode (see frameenc.go). +// libzstd has no such behavior — when the source size is known it +// always emits FCS. Without the flag our small test payloads would +// produce HasFCS=false frames, which the decoder correctly rejects as +// a protocol violation, but that is not what we want to exercise on +// the happy path. SingleSegment changes no decoded bytes — only the +// presence of the FCS field. +func zstdCompressForTest(t *testing.T, src []byte) []byte { + t.Helper() + enc, err := zstd.NewWriter(nil, + zstd.WithEncoderLevel(zstd.SpeedDefault), + zstd.WithEncoderConcurrency(1), + zstd.WithSingleSegment(true), + ) + if err != nil { + t.Fatalf("zstd.NewWriter: %v", err) + } + defer enc.Close() + return enc.EncodeAll(src, nil) +} + +// compressResultBatchBody rewrites a RESULT_BATCH frame so its +// post-prelude body is zstd-compressed. Input must be the raw output +// of wrapAsResultBatch (uncompressed). The header's FLAG_ZSTD bit is +// set and the payload-length field is rewritten to reflect the +// shorter compressed body. 
+// +// Layout mirrors QwpWebSocketEncoder.java: +// +// [12 header (FLAG_ZSTD set)] [msg_kind:1] [requestId:8] [batchSeq:varint] [ZSTD(delta + table block)] +func compressResultBatchBody(t *testing.T, frame []byte) []byte { + t.Helper() + if len(frame) < qwpHeaderSize+1+8+1 { + t.Fatalf("compressResultBatchBody: frame too short (%d)", len(frame)) + } + // Re-parse the prelude to know where the compressible body starts. + // msg_kind(1) + requestId(8) + batchSeq(varint) + p := qwpHeaderSize + if frame[p] != byte(qwpMsgKindResultBatch) { + t.Fatalf("compressResultBatchBody: msg_kind = 0x%02X, want RESULT_BATCH", + frame[p]) + } + p += 1 + 8 + _, n := binary.Uvarint(frame[p:]) + if n <= 0 { + t.Fatalf("compressResultBatchBody: bad batchSeq varint at offset %d", p) + } + p += n + + prelude := frame[qwpHeaderSize:p] + body := frame[p:] + compressed := zstdCompressForTest(t, body) + + out := make([]byte, 0, qwpHeaderSize+len(prelude)+len(compressed)) + out = append(out, frame[:qwpHeaderSize]...) + out = append(out, prelude...) + out = append(out, compressed...) + // Set FLAG_ZSTD on the header and patch the payload length. + out[qwpHeaderOffsetFlags] |= qwpFlagZstd + binary.LittleEndian.PutUint32(out[qwpHeaderOffsetPayloadLen:], + uint32(len(out)-qwpHeaderSize)) + return out +} + +// --- zstd decoder tests --- + +func TestQwpDecoderZstdHappyPath(t *testing.T) { + // Encoder-driven positive path: build a real RESULT_BATCH with a + // handful of rows, compress the body with klauspost's zstd encoder, + // and decode through the production decompression path. Asserts + // every typed accessor reads the same values as the uncompressed + // reference. 
+ tb := newQwpTableBuffer("t") + for _, v := range []int64{1, -2, 1234567890, math.MaxInt64} { + col, err := tb.getOrCreateColumn("x", qwpTypeLong, false) + if err != nil { + t.Fatalf("getOrCreateColumn: %v", err) + } + col.addLong(v) + tb.commitRow() + } + var enc qwpEncoder + ingress := enc.encodeTable(tb) + raw := wrapAsResultBatch(ingress, 42, 0) + compressed := compressResultBatchBody(t, raw) + + if compressed[qwpHeaderOffsetFlags]&qwpFlagZstd == 0 { + t.Fatal("compressResultBatchBody did not set FLAG_ZSTD") + } + if len(compressed) >= len(raw) { + // Small frames may not compress — we still want to assert the + // decoder succeeds either way. Log for visibility. + t.Logf("compressed frame (%d bytes) >= raw (%d bytes)", + len(compressed), len(raw)) + } + + dec := newTestQueryDecoder() + defer dec.close() + var b QwpColumnBatch + if err := dec.decode(compressed, &b); err != nil { + t.Fatalf("decode(zstd): %v", err) + } + if b.RequestId() != 42 { + t.Fatalf("RequestId = %d, want 42", b.RequestId()) + } + if b.BatchSeq() != 0 { + t.Fatalf("BatchSeq = %d, want 0", b.BatchSeq()) + } + if b.RowCount() != 4 { + t.Fatalf("RowCount = %d, want 4", b.RowCount()) + } + for i, w := range []int64{1, -2, 1234567890, math.MaxInt64} { + if got := b.Int64(0, i); got != w { + t.Fatalf("Int64[%d] = %d, want %d", i, got, w) + } + } + if len(b.zstdScratch) == 0 { + t.Fatal("zstdScratch empty after compressed decode") + } +} + +func TestQwpDecoderZstdReusesScratchAcrossDecodes(t *testing.T) { + // Decode two compressed batches into the SAME QwpColumnBatch. + // The decoder's zstd scratch is per-batch (on QwpColumnBatch, not + // on the decoder), so batch N+1's decompressed bytes must land in + // the same backing array as batch N — growing only if N+1 needs + // more capacity. 
+ build := func(v int64, batchSeq uint64) []byte { + tb := newQwpTableBuffer("t") + col, err := tb.getOrCreateColumn("x", qwpTypeLong, false) + if err != nil { + t.Fatalf("getOrCreateColumn: %v", err) + } + col.addLong(v) + tb.commitRow() + var enc qwpEncoder + ingress := enc.encodeTable(tb) + raw := wrapAsResultBatch(ingress, 1, batchSeq) + return compressResultBatchBody(t, raw) + } + + dec := newTestQueryDecoder() + defer dec.close() + var b QwpColumnBatch + + // Batch 0 carries the schema; the decoder holds it for the query. + if err := dec.decode(build(111, 0), &b); err != nil { + t.Fatalf("first decode: %v", err) + } + if got := b.Int64(0, 0); got != 111 { + t.Fatalf("first Int64 = %d, want 111", got) + } + scratchCap0 := cap(b.zstdScratch) + + // Batch 1 is a continuation (no inline schema); it reuses batch 0's. + if err := dec.decode(build(222, 1), &b); err != nil { + t.Fatalf("second decode: %v", err) + } + if got := b.Int64(0, 0); got != 222 { + t.Fatalf("second Int64 = %d, want 222", got) + } + // Capacity should not shrink; rarely grows because batch 2 is the + // same shape as batch 1. Either outcome is valid — this asserts the + // amortisation invariant (cap is at least what we had before). + if cap(b.zstdScratch) < scratchCap0 { + t.Fatalf("zstdScratch cap shrank: was %d, now %d", + scratchCap0, cap(b.zstdScratch)) + } +} + +func TestQwpDecoderZstdHardening(t *testing.T) { + // Build a reusable uncompressed frame that every subtest derives + // from via surgical header / body mutation. + tb := newQwpTableBuffer("t") + col, err := tb.getOrCreateColumn("x", qwpTypeLong, false) + if err != nil { + t.Fatalf("getOrCreateColumn: %v", err) + } + col.addLong(99) + tb.commitRow() + var enc qwpEncoder + baseRaw := wrapAsResultBatch(enc.encodeTable(tb), 1, 0) + + t.Run("InvalidZstdFrame", func(t *testing.T) { + // FLAG_ZSTD set but the body is plain (uncompressed) bytes — + // the Header.Decode call rejects. 
+ frame := make([]byte, len(baseRaw)) + copy(frame, baseRaw) + frame[qwpHeaderOffsetFlags] |= qwpFlagZstd + dec := newTestQueryDecoder() + defer dec.close() + var b QwpColumnBatch + err := dec.decode(frame, &b) + assertDecodeErrContains(t, err, "invalid zstd frame header") + }) + + t.Run("TruncatedZstdStream", func(t *testing.T) { + // Compress the body, then truncate the final byte so zstd + // DecodeAll fails mid-stream. Header.Decode still succeeds + // because the header lives at the front of the frame. + frame := compressResultBatchBody(t, baseRaw) + frame = frame[:len(frame)-1] + // Patch payload length to reflect the shorter body. + binary.LittleEndian.PutUint32(frame[qwpHeaderOffsetPayloadLen:], + uint32(len(frame)-qwpHeaderSize)) + dec := newTestQueryDecoder() + defer dec.close() + var b QwpColumnBatch + err := dec.decode(frame, &b) + assertDecodeErrContains(t, err, "zstd decompression failed") + }) + + t.Run("MissingContentSize", func(t *testing.T) { + // A streaming zstd encoder with no pre-declared size writes + // a frame where HasFCS=false. Protocol violation per Java — + // the decoder must reject without decompressing. + // + // Build the body as usual but run it through NewWriter + + // Write + Close rather than EncodeAll. + p := qwpHeaderSize + 1 + 8 + _, n := binary.Uvarint(baseRaw[p:]) + p += n + body := baseRaw[p:] + + var cbuf bytes.Buffer + w, err := zstd.NewWriter(&cbuf, + zstd.WithEncoderLevel(zstd.SpeedDefault), + zstd.WithEncoderConcurrency(1), + ) + if err != nil { + t.Fatalf("NewWriter: %v", err) + } + if _, err := w.Write(body); err != nil { + t.Fatalf("Write: %v", err) + } + if err := w.Close(); err != nil { + t.Fatalf("Close: %v", err) + } + compressed := cbuf.Bytes() + + // Sanity: verify the streaming writer really didn't set FCS. + // If klauspost changes its behavior in a future version we + // want to know here instead of in a confusing test failure + // downstream. 
+ var hdr zstd.Header + if err := hdr.Decode(compressed); err != nil { + t.Fatalf("streaming zstd header.Decode: %v", err) + } + if hdr.HasFCS { + t.Skip("streaming zstd writer emitted HasFCS=true; skipping") + } + + out := make([]byte, 0, p+len(compressed)) + out = append(out, baseRaw[:p]...) + out = append(out, compressed...) + out[qwpHeaderOffsetFlags] |= qwpFlagZstd + binary.LittleEndian.PutUint32(out[qwpHeaderOffsetPayloadLen:], + uint32(len(out)-qwpHeaderSize)) + + dec := newTestQueryDecoder() + defer dec.close() + var b QwpColumnBatch + err = dec.decode(out, &b) + assertDecodeErrContains(t, err, "zstd frame missing content size") + }) + + t.Run("ContentSizeExceedsCap", func(t *testing.T) { + // Hand-craft a minimal but valid zstd frame whose FCS field + // declares a size just above the client cap. Header.Decode + // must accept the header (FCS >64 MiB is valid zstd); the + // decoder must reject before calling DecodeAll. + // + // zstd frame shape (RFC 8478 §3.1): + // magic(4) = 0xFD2FB528 + // frame_header_descriptor(1) = 0b11100000 + // (frame_content_size_flag=3 → 8-byte FCS, + // single_segment=1, no dict, no checksum) + // frame_content_size(8) = huge + // ... (blocks follow, but Header.Decode only needs the + // prelude) + const hugeFCS = uint64(qwpZstdMaxDecompressedSize) + 1 + hdr := make([]byte, 0, 13) + hdr = binary.LittleEndian.AppendUint32(hdr, 0xFD2FB528) + hdr = append(hdr, 0b11100000) + hdr = binary.LittleEndian.AppendUint64(hdr, hugeFCS) + // Add a single "raw" block (3-byte header signaling 0-byte + // last raw block) so Header.Decode succeeds on bounded input. + // block_header: last=1, block_type=raw(0), block_size=0 + hdr = append(hdr, 0x01, 0x00, 0x00) + + // Splice into a frame. + p := qwpHeaderSize + 1 + 8 + _, n := binary.Uvarint(baseRaw[p:]) + p += n + out := make([]byte, 0, p+len(hdr)) + out = append(out, baseRaw[:p]...) + out = append(out, hdr...) 
+ out[qwpHeaderOffsetFlags] |= qwpFlagZstd + binary.LittleEndian.PutUint32(out[qwpHeaderOffsetPayloadLen:], + uint32(len(out)-qwpHeaderSize)) + + dec := newTestQueryDecoder() + defer dec.close() + var b QwpColumnBatch + err := dec.decode(out, &b) + assertDecodeErrContains(t, err, "exceeds client cap") + }) + + t.Run("EmptyCompressedPayload", func(t *testing.T) { + // FLAG_ZSTD set but there is nothing after the prelude. Not + // a hostile case — just a wire-level bug we must surface + // instead of calling Header.Decode on zero bytes. + p := qwpHeaderSize + 1 + 8 + _, n := binary.Uvarint(baseRaw[p:]) + p += n + out := make([]byte, p) + copy(out, baseRaw[:p]) + out[qwpHeaderOffsetFlags] |= qwpFlagZstd + binary.LittleEndian.PutUint32(out[qwpHeaderOffsetPayloadLen:], + uint32(p-qwpHeaderSize)) + + dec := newTestQueryDecoder() + defer dec.close() + var b QwpColumnBatch + err := dec.decode(out, &b) + assertDecodeErrContains(t, err, "FLAG_ZSTD set but no compressed payload") + }) +} + +func TestQwpDecoderZstdCloseIsIdempotent(t *testing.T) { + // decoder.close() must be safe to call more than once and must + // cope with a never-initialised zstd decoder. Exercises the nil + // branch of the close path. + dec := newTestQueryDecoder() + dec.close() + dec.close() +} + +func TestQwpColumnBatchCopyAllZstdSurvivesPoolReuse(t *testing.T) { + // CopyAll must deep-clone the zstd scratch so a snapshot stays + // valid after the decoder reuses the source batch's scratch for a + // later frame. Without the clone + alias-translation branch in + // CopyAll, the snapshot's byte-aliasing slices would drift onto + // garbage bytes. 
+ buildStrings := func(values []string, batchSeq uint64) []byte { + tb := newQwpTableBuffer("t") + for _, v := range values { + col, err := tb.getOrCreateColumn("s", qwpTypeVarchar, false) + if err != nil { + t.Fatalf("getOrCreateColumn: %v", err) + } + col.addString(v) + tb.commitRow() + } + var enc qwpEncoder + ingress := enc.encodeTable(tb) + raw := wrapAsResultBatch(ingress, 1, batchSeq) + return compressResultBatchBody(t, raw) + } + + dec := newTestQueryDecoder() + defer dec.close() + var b QwpColumnBatch + if err := dec.decode(buildStrings([]string{"hello", "world"}, 0), &b); err != nil { + t.Fatalf("first decode: %v", err) + } + snap := b.CopyAll() + if got := snap.String(0, 0); got != "hello" { + t.Fatalf("snap[0] = %q, want %q", got, "hello") + } + + // Decode a second batch (a continuation, reusing batch 0's schema) + // into the SAME b. The decoder reuses b.zstdScratch — without the + // deep-clone in CopyAll the snapshot would now see the second + // batch's bytes. + if err := dec.decode(buildStrings([]string{"x", "y"}, 1), &b); err != nil { + t.Fatalf("second decode: %v", err) + } + if got := snap.String(0, 0); got != "hello" { + t.Fatalf("snap[0] after reuse = %q, want %q (CopyAll didn't clone scratch)", + got, "hello") + } + if got := snap.String(0, 1); got != "world" { + t.Fatalf("snap[1] after reuse = %q, want %q", + got, "world") + } +} diff --git a/qwp_query_errors.go b/qwp_query_errors.go new file mode 100644 index 00000000..2b5e413f --- /dev/null +++ b/qwp_query_errors.go @@ -0,0 +1,231 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "fmt" + "strings" +) + +// QwpQueryError is a server-side error reported during query egress. It +// corresponds to a QUERY_ERROR frame (msg_kind 0x13) and is distinct from +// QwpError, which carries ingress ACK status. CANCELLED and LIMIT_EXCEEDED +// are egress-specific statuses that surface here. +type QwpQueryError struct { + // RequestId correlates the error with the query that produced it. + RequestId int64 + + // Status is the server-reported egress status byte (e.g. + // qwpStatusCancelled, qwpStatusLimitExceeded, QwpStatusParseError). + Status QwpStatusCode + + // Message is the server-supplied UTF-8 description, or empty if the + // server sent a zero-length message. + Message string +} + +// Error implements the error interface. +func (e *QwpQueryError) Error() string { + name := qwpStatusName(e.Status) + if e.Message != "" { + return fmt.Sprintf("qwp: query error %s (0x%02X): %s", + name, byte(e.Status), e.Message) + } + return fmt.Sprintf("qwp: query error %s (0x%02X)", name, byte(e.Status)) +} + +// QwpRoleMismatchError is returned by QwpQueryClient construction when +// none of the configured endpoints satisfies the target= role filter. 
+// The connect walk records the most-recently-observed SERVER_INFO and +// the last underlying transport failure so callers can distinguish +// "no matching role available" (LastObserved non-nil; an endpoint +// reported a role the filter rejects), "all endpoints unreachable" +// (LastTransportError non-nil with LastObserved nil), and combinations +// of the above (e.g. one endpoint dialled but reported the wrong role +// while another refused the connection). +type QwpRoleMismatchError struct { + // Target is the requested role filter ("any", "primary", "replica"). + // Stored as a string for human-readable error formatting; the + // QwpTargetFilter value is mapped to its name on construction. + Target string + + // LastObserved is the SERVER_INFO of the most recent endpoint the + // connect walk reached and that returned a role this filter would + // reject. Nil if every endpoint refused the connection before + // reporting a role. + LastObserved *QwpServerInfo + + // LastTransportError is the most recent transport-level failure the + // connect walk hit (TCP/TLS dial, WebSocket upgrade, SERVER_INFO + // timeout). Populated when at least one endpoint failed before + // reaching the role-filter step. Nil when every endpoint dialled + // cleanly but failed only the role check. Available via + // errors.Is / errors.As through Unwrap. + LastTransportError error + + // Endpoints lists every endpoint the walk attempted, in the order + // they were tried. Useful for diagnosing why none of them matched. + Endpoints []string +} + +// Error implements the error interface. 
+func (e *QwpRoleMismatchError) Error() string { + var b strings.Builder + fmt.Fprintf(&b, "qwp query: no endpoint matches target=%s", e.Target) + if e.LastObserved != nil { + fmt.Fprintf(&b, "; last observed role=%s", e.LastObserved.RoleName()) + if e.LastObserved.NodeId != "" { + fmt.Fprintf(&b, " on node %q", e.LastObserved.NodeId) + } + } + if e.LastTransportError != nil { + fmt.Fprintf(&b, "; last transport error: %v", e.LastTransportError) + } + if len(e.Endpoints) > 0 { + fmt.Fprintf(&b, " (tried: %s)", strings.Join(e.Endpoints, ", ")) + } + return b.String() +} + +// Unwrap exposes the underlying transport failure (if any) to +// errors.Is / errors.As so callers can match on both the role-mismatch +// shape and the specific dial / upgrade failure that contributed to it. +// Returns nil when every endpoint reached the role-filter step. +func (e *QwpRoleMismatchError) Unwrap() error { + return e.LastTransportError +} + +// QwpFailoverReset is yielded as a non-fatal error by *QwpQuery.Batches +// when the I/O layer detects a transport-terminal failure mid-query +// and successfully reconnects to another role-matching endpoint to +// replay the request. Subsequent batches arrive with batch_seq +// restarting at 0 on the new node. +// +// Consumer pattern: detect via errors.As, discard any rows accumulated +// from the prior connection, and continue iterating. Consumers that +// don't accumulate (simple "print rows" loops) can ignore the error +// and just continue. Treating it as terminal is also safe — the user +// gets a clear human-readable error and the iterator's deferred +// cleanup tears down the dying generation. +// +// Surfaced only on the Query (SELECT) path. 
Exec never returns this: +// with replay_exec=off (the default) a transport drop yields the raw +// transport error without reconnecting — so a non-idempotent +// statement the server may already have applied is not silently +// re-executed — and with replay_exec=on Exec replays transparently +// and consumes the reset internally. +type QwpFailoverReset struct { + // NewNode is the SERVER_INFO of the endpoint the client just + // rebound to (nil only if no SERVER_INFO was available). + NewNode *QwpServerInfo + + // Attempt is the 1-based replay attempt counter. Attempt=1 means + // the failure happened during the original submission and the + // first reconnect succeeded; Attempt=N means N transport failures + // occurred before this reset. + Attempt int + + // LastError is the underlying transport-terminal error that + // triggered this reset. Useful for diagnostics; nil only on the + // rare case of a server-initiated reconnect with no transport + // fault. + LastError error +} + +// Error implements the error interface. +func (e *QwpFailoverReset) Error() string { + var b strings.Builder + fmt.Fprintf(&b, "qwp query: failover reset (attempt %d)", e.Attempt) + if e.NewNode != nil { + fmt.Fprintf(&b, " to %s/%s", e.NewNode.NodeId, e.NewNode.RoleName()) + } + if e.LastError != nil { + fmt.Fprintf(&b, ": %v", e.LastError) + } + return b.String() +} + +// Unwrap exposes the underlying transport error to errors.Is / +// errors.As so callers can match on both the reset event and the +// specific transport failure that triggered it. +func (e *QwpFailoverReset) Unwrap() error { + return e.LastError +} + +// QwpFailoverExhaustedError surfaces from *QwpQuery.Batches and +// (*QwpQueryClient).Exec when the failover budget +// (failover_max_attempts) has been consumed without producing a +// successful query completion. 
Carries the attempt count and the most +// recent transport-terminal error so callers can distinguish "the +// initial attempt failed" from "every retry within the budget also +// failed", and surface a useful diagnostic without parsing the +// underlying message. Mirrors Java's onError(STATUS_INTERNAL_ERROR, +// "transport failure after N execute attempts ...") shape from +// QwpQueryClient.executeOnce. +type QwpFailoverExhaustedError struct { + // Attempts is the number of execute attempts (initial submission + // plus all replays) that failed before the budget was reached. + // Always equal to the configured failover_max_attempts when the + // error is constructed by the session orchestrator; preserved as + // a separate field so a caller-side log line does not need to + // re-derive it from configuration. + Attempts int + + // LastError is the most recent transport-terminal error that + // pushed the count up to the budget. Non-nil. Available via + // errors.Is / errors.As through Unwrap so callers can match on + // both the exhaustion shape and the specific underlying cause. + LastError error +} + +// Error implements the error interface. +func (e *QwpFailoverExhaustedError) Error() string { + failovers := e.Attempts - 1 + if failovers < 0 { + failovers = 0 + } + var b strings.Builder + fmt.Fprintf(&b, "qwp query: failover exhausted after %d execute attempt", + e.Attempts) + if e.Attempts != 1 { + b.WriteByte('s') + } + fmt.Fprintf(&b, " (%d failover reconnect", failovers) + if failovers != 1 { + b.WriteByte('s') + } + b.WriteByte(')') + if e.LastError != nil { + fmt.Fprintf(&b, "; last error: %v", e.LastError) + } + return b.String() +} + +// Unwrap exposes the underlying transport error to errors.Is / +// errors.As so callers can match on both the exhaustion shape and the +// specific transport failure that triggered the final retry. 
+func (e *QwpFailoverExhaustedError) Unwrap() error { + return e.LastError +} diff --git a/qwp_query_failover.go b/qwp_query_failover.go new file mode 100644 index 00000000..b3fd96e9 --- /dev/null +++ b/qwp_query_failover.go @@ -0,0 +1,751 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "context" + "errors" + "fmt" + "math/rand" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" +) + +// qwpDefaultPort is the port applied to addr= entries that omit one. +// Matches Java QwpQueryClient.DEFAULT_WS_PORT and the live server's +// default HTTP/WebSocket bind. Single source of truth so the live +// integration tests and the connection-string parser cannot drift. +const qwpDefaultPort = 9000 + +// qwpEndpoint is one address on the connect-walk list. Distinct from a +// raw "host:port" string so callers can stream the same endpoint +// through validate / hostport / debug paths without re-parsing. 
+type qwpEndpoint struct { + host string + port int +} + +// String formats the endpoint as host:port, bracketing IPv6 hosts so +// downstream consumers can re-parse the form without ambiguity. +func (e qwpEndpoint) String() string { + if strings.Contains(e.host, ":") { + return fmt.Sprintf("[%s]:%d", e.host, e.port) + } + return fmt.Sprintf("%s:%d", e.host, e.port) +} + +// QwpTargetFilter constrains the connect walk to endpoints whose +// SERVER_INFO.role passes the filter. The argument type of WithTarget +// (ingest) and WithQwpQueryTarget (egress); use the QwpTarget* +// constants to name a value. Mirrors Java QwpQueryClient's +// TARGET_ANY/PRIMARY/REPLICA constants. Zero value is QwpTargetAny so +// tests and config defaults can use the zero-init pattern naturally. +type QwpTargetFilter byte + +const ( + // qwpTargetAny accepts any role. The default; matches Java's + // TARGET_ANY. Used when callers only want any reachable endpoint. + qwpTargetAny QwpTargetFilter = iota + // qwpTargetPrimary accepts STANDALONE, PRIMARY, and PRIMARY_CATCHUP. + // STANDALONE is included so single-node OSS deployments (which do + // not configure replication) are not accidentally excluded. + qwpTargetPrimary + // qwpTargetReplica accepts only REPLICA. Use when read latency is + // secondary to offloading the primary. + qwpTargetReplica +) + +// Exported names for the QwpTargetFilter constants, so callers of +// WithTarget / WithQwpQueryTarget can name the values. Equivalent to +// the connect-string target=any|primary|replica values. +const ( + // QwpTargetAny accepts any reachable endpoint regardless of role. + // The default; equivalent to target=any (or omitting the key). + QwpTargetAny = qwpTargetAny + // QwpTargetPrimary routes only to STANDALONE / PRIMARY / + // PRIMARY_CATCHUP endpoints; equivalent to target=primary. + QwpTargetPrimary = qwpTargetPrimary + // QwpTargetReplica routes only to REPLICA endpoints; equivalent + // to target=replica. 
+ QwpTargetReplica = qwpTargetReplica +) + +// String returns the connection-string form for diagnostics and error +// messages. +func (t QwpTargetFilter) String() string { + switch t { + case qwpTargetAny: + return "any" + case qwpTargetPrimary: + return "primary" + case qwpTargetReplica: + return "replica" + default: + return fmt.Sprintf("unknown(%d)", byte(t)) + } +} + +// parseTargetFilter maps the connection-string value to the enum. +// Empty input normalises to qwpTargetAny so parsers that assemble the +// effective config from multiple sources can use absence-as-default +// without a dedicated branch. Mirrors Java's +// QwpQueryClient.fromConfig target validation. +func parseTargetFilter(s string) (QwpTargetFilter, error) { + switch s { + case "", "any": + return qwpTargetAny, nil + case "primary": + return qwpTargetPrimary, nil + case "replica": + return qwpTargetReplica, nil + default: + return 0, fmt.Errorf( + "qwp query: invalid target %q (expected any, primary, or replica)", s) + } +} + +// accepts reports whether the given role byte passes the filter. +// Mirrors Java QwpQueryClient.matchesTarget exactly: primary accepts +// STANDALONE so OSS deployments (which advertise STANDALONE rather +// than PRIMARY) are treated as primaries for routing purposes. +func (t QwpTargetFilter) accepts(role byte) bool { + switch t { + case qwpTargetAny: + return true + case qwpTargetPrimary: + return role == qwpRoleStandalone || + role == qwpRolePrimary || + role == qwpRolePrimaryCatchup + case qwpTargetReplica: + return role == qwpRoleReplica + default: + return false + } +} + +// parseEndpointList splits a comma-separated addr= value into typed +// endpoints. Defers per-endpoint validation to splitQwpHostPort and +// the explicit port-range check; rejects the empty string and any +// element that fails parsing. Surfaces errors with the original +// element so a malformed entry in the middle of the list is easy to +// pinpoint. 
+// +// defaultPort is applied when an entry omits :port. Use +// qwpDefaultPort (9000) for the QWP defaults; tests pass an explicit +// number when they need a different default. +func parseEndpointList(s string, defaultPort int) ([]qwpEndpoint, error) { + if s == "" { + return nil, fmt.Errorf("qwp query: addr is empty") + } + parts := strings.Split(s, ",") + out := make([]qwpEndpoint, 0, len(parts)) + for _, p := range parts { + entry := strings.TrimSpace(p) + if entry == "" { + return nil, fmt.Errorf("qwp query: empty entry in addr list %q", s) + } + host, portStr, err := splitQwpHostPort(entry) + if err != nil { + return nil, fmt.Errorf("qwp query: invalid addr %q: %w", entry, err) + } + if host == "" { + return nil, fmt.Errorf("qwp query: invalid addr %q: empty host", entry) + } + port := defaultPort + if portStr != "" { + n, err := strconv.Atoi(portStr) + if err != nil { + return nil, fmt.Errorf( + "qwp query: invalid addr %q: invalid port %q", entry, portStr) + } + if n < 1 || n > 65535 { + return nil, fmt.Errorf( + "qwp query: invalid addr %q: port %d out of range [1, 65535]", + entry, n) + } + port = n + } + out = append(out, qwpEndpoint{host: host, port: port}) + } + return out, nil +} + +// qwpConnectResult bundles everything connectWalk produces on success: +// a live transport + I/O goroutine pair, the index of the bound +// endpoint in cfg.endpoints, and the SERVER_INFO from the bound +// connection. Returned to the caller (newQwpQueryClient or the +// failover orchestrator) so the client struct can publish all three +// atomically. +type qwpConnectResult struct { + transport *qwpTransport + io *qwpEgressIO + endpointIdx int + serverInfo *QwpServerInfo +} + +// connectWalk is the egress WalkTracker helper (wire-egress.md +// §11.9.3), shared by the initial connect (newQwpQueryClient) and +// every failover reconnect (reconnectAndReplay).
Endpoint selection is +// driven by the failover.md §2 host-health tracker, NOT a positional +// walk: tracker.PickNext returns the lexicographically-best +// (state, zone) candidate, the dial outcome is fed back via +// RecordSuccess / RecordRoleReject / RecordTransportError / RecordZone, +// and a single fall-through BeginRound(forgetClassifications=true) +// reset gives stale TransientReject / TopologyReject hosts one more +// chance before the walk gives up. This replaces the pre-failover-spec +// (failedIdx+1)%n modulo round-robin, which ignored host health and +// zone locality entirely (the `zone=` key was inert on the query +// side). +// +// Round entry is the caller's responsibility, per wire-egress.md +// §11.9.2: the initial connect runs on a fresh all-Unknown tracker +// (no BeginRound needed); reconnect calls RecordMidStreamFailure on +// the just-failed index then BeginRound(forgetClassifications=false) +// before invoking this helper. This function owns only the in-walk +// classification and (when allowFallthroughReset is set) the one +// fall-through reset. +// +// allowFallthroughReset gates the single +// BeginRound(forgetClassifications=true) re-sweep that runs when +// PickNext first returns -1. It is true only on the failover +// reconnect path (Java reconnectViaTracker), where forgetting stale +// classifications from prior outages and walking once more lets a +// long-lived client recover from a topology change. It is false on +// the initial connect path (Java connect()), which probes every +// endpoint exactly once and then fails — re-sweeping a freshly +// role-rejecting cluster on first connect would just double every +// endpoint's probe count for no diagnostic gain (Java's +// QwpQueryClientMultiHostFailoverTest.testConnectDoesNotDoubleWalkOnFirstFailure +// pins this). 
+// +// AuthError (401/403) is terminal per failover.md §6: the helper +// returns the typed *QwpUpgradeRejectError immediately without walking +// to the next host (credentials are cluster-wide; retrying every host +// just floods server logs). All other dial failures are per-endpoint +// and the walk continues. +// +// Closes any partially-bound resources before returning on a failure +// path so callers do not have to worry about leaked goroutines or +// half-open sockets. On a successful return the caller takes +// ownership of the transport + I/O. +// +// cancelCh, when non-nil, is checked at every endpoint boundary to +// short-circuit the walk if the user has asked to cancel. Cancel() +// closes the session's cancelCh but does not cancel the user's ctx, +// so without this check a slow walk would block on +// serverInfoTimeout × len(endpoints) before honouring the cancel. +// The check is at the loop boundary only; it does NOT preempt an +// in-flight Dial / SERVER_INFO read, so the worst-case wait shrinks +// from the full walk to a single endpoint's timeout. Java has the +// same boundary-only granularity. +func connectWalk(ctx context.Context, cfg *qwpQueryClientConfig, tracker *qwpHostTracker, cancelCh <-chan struct{}, allowFallthroughReset bool) (*qwpConnectResult, error) { + if len(cfg.endpoints) == 0 { + return nil, fmt.Errorf("qwp query: no endpoints configured") + } + scheme := "ws" + if cfg.tlsMode != tlsDisabled { + scheme = "wss" + } + endpointStrings := make([]string, len(cfg.endpoints)) + for i, ep := range cfg.endpoints { + endpointStrings[i] = ep.String() + } + + var lastObserved *QwpServerInfo + var lastErr error + attempts := 0 + retriedAfterReset := false + for { + if cancelCh != nil { + select { + case <-cancelCh: + return nil, context.Canceled + default: + } + } + + idx := tracker.PickNext() + if idx < 0 { + // Round exhausted. 
On the reconnect path, give stale + // TransientReject / TopologyReject / TransportError hosts + // one more shot by forgetting non-Healthy classifications, + // then walk once more. Only one reset, then fail + // (wire-egress.md §11.9.3 — unlike the SF reconnect loop + // there is no wall-clock budget here; the per-Execute loop + // owns that). The initial connect passes + // allowFallthroughReset=false and fails after the single + // sweep. + if allowFallthroughReset && !retriedAfterReset { + tracker.BeginRound(true) + retriedAfterReset = true + continue + } + break + } + ep := cfg.endpoints[idx] + wsURL := scheme + "://" + ep.String() + + tr := &qwpTransport{} + opts := qwpTransportOpts{ + tlsInsecureSkipVerify: cfg.tlsMode == tlsInsecureSkipVerify, + endpointPath: cfg.endpointPath, + authorization: cfg.effectiveAuthorization(), + maxBatchRows: cfg.maxBatchRows, + acceptEncoding: cfg.buildAcceptEncodingHeader(), + // QWP has a single protocol version; advertise it. The + // server always emits SERVER_INFO post-upgrade and the + // egress client reads it (serverInfoTimeout > 0). + maxVersion: qwpVersion, + serverInfoTimeout: cfg.serverInfoTimeout, + authTimeoutMs: cfg.authTimeoutMs, + } + attempts++ + if err := tr.connect(ctx, wsURL, opts); err != nil { + // transport.connect already cleaned up after itself on the + // failure path. Classify per failover.md §5/§6. + var rej *QwpUpgradeRejectError + if errors.As(err, &rej) { + // AuthError 401/403: terminal — bypass failover so a + // cluster-wide bad credential does not flood every host. + if rej.StatusCode == 401 || rej.StatusCode == 403 { + return nil, err + } + // Record the host's zone tier if the reject carried + // X-QuestDB-Zone (no-op on empty / collapsed-to-Same). + if rej.Zone != "" { + tracker.RecordZone(idx, rej.Zone) + } + if rej.IsRoleReject() { + // 421 + non-empty role: transient (PRIMARY_CATCHUP) + // or topology (any other role). 
+ tracker.RecordRoleReject(idx, rej.IsCatchupRole()) + lastErr = err + continue + } + // 421 without role, 404, 426, 503, version mismatch, + // etc.: per-endpoint transient. + tracker.RecordTransportError(idx) + lastErr = err + continue + } + // TCP/TLS dial error, upgrade-response-read timeout, etc. + tracker.RecordTransportError(idx) + lastErr = err + continue + } + + info := tr.serverInfo + if info != nil && info.Capabilities&qwpCapZone != 0 { + // Server advertised its zone on the SERVER_INFO frame. + tracker.RecordZone(idx, info.ZoneId) + } + if info == nil && cfg.target != qwpTargetAny { + // Connected but no SERVER_INFO (serverInfoTimeout disabled, + // or a non-conformant server): the role is unknown, so a + // specific role filter cannot be satisfied without giving the + // caller a false guarantee. Demote to TopologyReject rather + // than binding to an unknown role. + tracker.RecordRoleReject(idx, false) + _ = tr.close() + continue + } + if info != nil && !cfg.target.accepts(info.Role) { + lastObserved = info + // PRIMARY_CATCHUP is catching up and likely to become + // writable; any other mismatch is a stable topology fact. + tracker.RecordRoleReject(idx, info.Role == qwpRolePrimaryCatchup) + _ = tr.close() + continue + } + + // Bound. Stand up the I/O goroutine pair on the heap-stable + // transport pointer and publish. The atomic pointer in the + // client struct allows swapping `tr` independently across + // reconnects without disturbing the IO goroutine's view. + io := newQwpEgressIO(tr, cfg.bufferPoolSize) + io.start() + tracker.RecordSuccess(idx) + return &qwpConnectResult{ + transport: tr, + io: io, + endpointIdx: idx, + serverInfo: tr.serverInfo, + }, nil + } + + if cfg.target == qwpTargetAny { + // No matching endpoint and the filter is permissive — every + // endpoint must have failed the dial. Surface the last + // underlying error so the user sees a useful diagnostic. 
+ if lastErr == nil { + lastErr = fmt.Errorf("qwp query: all endpoints unreachable") + } + return nil, fmt.Errorf("qwp query: connect failed (tried %d endpoints): %w", + attempts, lastErr) + } + // Specific role filter and no match — surface a typed + // QwpRoleMismatchError carrying the last observed SERVER_INFO and + // the last transport error so callers can distinguish "no matching + // role available" (LastObserved non-nil), "all endpoints + // unreachable" (LastTransportError non-nil with LastObserved nil), + // and any combination thereof. + return nil, &QwpRoleMismatchError{ + Target: cfg.target.String(), + LastObserved: lastObserved, + LastTransportError: lastErr, + Endpoints: endpointStrings, + } +} + +// qwpQuerySession orchestrates a single Query / Exec call: submission, +// event consumption, and transparent failover (reconnect + replay) on +// transport-terminal failure. The session owns the retained +// sql / bindPayload / initialCredit / bindCount so a replay attempt +// can reuse them on the new connection without round-tripping through +// the user goroutine. +// +// One session per Query / Exec; not safe for concurrent reuse. Cancel +// is the only method safe to call from another goroutine. +type qwpQuerySession struct { + client *QwpQueryClient + + // Retained request fields. Cleared on successful End / ExecDone / + // Error so a follow-up query on the same client cannot accidentally + // observe them. + sql string + bindPayload []byte + bindCount int + initialCredit int64 + + // currentRequestId tracks the request_id of the in-flight + // generation. Updated atomically each time submit is called: a + // fresh value on the initial submit and on every replay. Cancel + // reads it to send a CANCEL frame for the right generation. + currentRequestId atomic.Int64 + + // replayable gates whether nextEvent is allowed to + // reconnect-and-resubmit on a transport-terminal failure. 
true + // for Query (SELECT is idempotent — replaying is always safe); + // for Exec it is cfg.replayExec, false by default so a + // non-idempotent INSERT/UPDATE/DELETE/DDL that the server may + // have already applied before the transport drop is never + // silently re-executed on the new connection. When false, + // nextEvent surfaces the raw transport error instead of + // resubmitting (the connection is poisoned; the caller must + // rebuild and decide whether the statement applied). + replayable bool + + // attempt counts executeOnce invocations: 1 on the initial + // submission, 2 after the first replay, etc. Capped by + // cfg.failoverMaxAttempts. + attempt int + + // failoverDeadline is the wall-clock cap on this Query/Exec's + // failover loop, stamped once at session creation (mirrors Java + // computing the deadline before the attempt loop, + // QwpQueryClient.java:1517-1528). Zero means no time cap — + // failover is then bounded only by cfg.failoverMaxAttempts. + failoverDeadline time.Time + + // cancelCh is closed by requestCancel and selected on at every + // reconnect-and-replay boundary so the session does not start a + // fresh attempt after the user has asked for cancellation. A + // closed channel lets sleepInterruptible wake immediately on + // Cancel without polling. cancelOnce guards the close. + cancelCh chan struct{} + cancelOnce sync.Once +} + +// isCancelled reports whether requestCancel has been called. +func (s *qwpQuerySession) isCancelled() bool { + select { + case <-s.cancelCh: + return true + default: + return false + } +} + +// failoverBudgetExpired reports whether the per-Query/Exec wall-clock +// failover budget (failover_max_duration_ms) has elapsed. A zero +// deadline means the budget is disabled — failover is then bounded +// only by cfg.failoverMaxAttempts. Mirrors Java's +// failoverMaxDurationMs == 0 → unbounded (QwpQueryClient.java:1527) +// and the now >= deadline give-up test (QwpQueryClient.java:1541). 
+func (s *qwpQuerySession) failoverBudgetExpired() bool { + return !s.failoverDeadline.IsZero() && !time.Now().Before(s.failoverDeadline) +} + +// newQwpQuerySession allocates and returns a session bound to client. +// The retained sql / bind payload comes from the supplied req. The +// caller must call submit before nextEvent; submit assigns the initial +// requestId and dispatches the first attempt to the I/O goroutine. +// +// replayable decides whether a transport-terminal failure may be +// recovered by reconnect-and-resubmit: pass true for Query (SELECT is +// idempotent) and cfg.replayExec for Exec (false by default to protect +// non-idempotent statements from double-execution). +func newQwpQuerySession(client *QwpQueryClient, req qwpRequest, replayable bool) *qwpQuerySession { + s := &qwpQuerySession{ + client: client, + sql: req.sql, + bindPayload: req.bindPayload, + bindCount: req.bindCount, + initialCredit: req.initialCredit, + replayable: replayable, + cancelCh: make(chan struct{}), + } + s.currentRequestId.Store(req.requestId) + // Stamp the failover budget deadline once, before the first + // submit, mirroring Java computing failoverDeadlineNanos before + // the attempt loop (QwpQueryClient.java:1517-1528). A zero or + // negative cap leaves failoverDeadline as the zero Time, which + // failoverBudgetExpired treats as "no time cap". + if d := client.cfg.failoverMaxDuration; d > 0 { + s.failoverDeadline = time.Now().Add(d) + } + return s +} + +// submit dispatches the current attempt's qwpRequest to the I/O +// goroutine on the bound generation. Returns the same error +// io.submitQuery would have returned (closed I/O, latched ioErr, +// ctx-cancelled wait). 
+func (s *qwpQuerySession) submit(ctx context.Context) error { + s.attempt++ + req := qwpRequest{ + sql: s.sql, + requestId: s.currentRequestId.Load(), + initialCredit: s.initialCredit, + bindCount: s.bindCount, + bindPayload: s.bindPayload, + } + return s.client.io().submitQuery(ctx, req) +} + +// requestCancel marks the session cancelled and forwards the cancel +// to the bound I/O goroutine. Safe to call from any goroutine. Closes +// cancelCh first so the failover loop and any in-flight backoff sleep +// short-circuit even if the cancel races a reconnect. +func (s *qwpQuerySession) requestCancel() { + s.cancelOnce.Do(func() { close(s.cancelCh) }) + s.client.io().requestCancel(s.currentRequestId.Load()) +} + +// nextEvent returns the next event from the current generation. On +// qwpEventKindTransportError, runs the reconnect-and-replay loop and +// returns a synthesized qwpEventKindFailoverReset event whose +// failoverReset field carries the new generation's QwpServerInfo. The +// caller's iterator (Batches() / Exec() loop) yields the reset to the +// user, who is expected to discard accumulated state and continue. +// +// When failover is disabled (cfg.failoverEnabled == false), or this +// session is not replayable (a non-idempotent Exec with +// replay_exec=off — see s.replayable), the original transport error +// is returned as-is, WITHOUT reconnecting or resubmitting, so the +// caller surfaces it through the usual error path and the +// possibly-already-applied statement is never re-executed. When the +// failover budget is exhausted (s.attempt >= cfg.failoverMaxAttempts, +// or the failover_max_duration_ms wall-clock budget has elapsed), the +// event is wrapped into a *QwpFailoverExhaustedError so callers can +// errors.As against the exhaustion shape and distinguish "we ran out +// of retries" from "first attempt failed". 
func (s *qwpQuerySession) nextEvent(ctx context.Context) (qwpEvent, error) {
	// Pull the next event from the bound I/O goroutine. A takeEvent
	// error is returned unchanged, together with whatever event value
	// came back alongside it.
	ev, err := s.client.io().takeEvent(ctx)
	if err != nil {
		return ev, err
	}
	// Anything other than a transport-terminal failure passes straight
	// through to the caller.
	if ev.kind != qwpEventKindTransportError {
		return ev, nil
	}
	// Transport-terminal failure. Decide whether to retry.
	if s.isCancelled() {
		// The user already asked for cancellation: never start a new
		// attempt, just hand back the transport error.
		return ev, nil
	}
	cfg := s.client.cfg
	if !cfg.failoverEnabled {
		return ev, nil
	}
	if !s.replayable {
		// Non-idempotent Exec with replay_exec=off. The server may
		// have already applied the INSERT/UPDATE/DELETE/DDL before the
		// transport dropped, so reconnecting and resubmitting would
		// risk a silent second execution. Surface the raw transport
		// error instead: the connection is poisoned (loadIoErr is
		// latched), the next Query/Exec fails fast, and the caller
		// must rebuild the client and decide whether the statement
		// took effect. Query is always replayable (SELECT is
		// idempotent), so this branch only ever fires for Exec.
		return ev, nil
	}
	if s.attempt >= cfg.failoverMaxAttempts || s.failoverBudgetExpired() {
		// Budget exhausted: the attempt cap was reached or the
		// failover_max_duration_ms wall-clock budget elapsed. Wrap the
		// underlying transport error so callers can errors.As to
		// *QwpFailoverExhaustedError and distinguish "we ran out of
		// retries" from "first attempt failed". Mirrors Java's
		// combined give-up test (attempt >= max || now >= deadline)
		// at QwpQueryClient.java:1541, which emits one exhaustion
		// message for both causes.
		return s.exhaustedEvent(ev), nil
	}
	// Capture the failure cause and the index of the endpoint that
	// just failed before anything below can change the client's state.
	lastErr := fmt.Errorf("qwp query: %s", ev.errMessage)
	failedIdx := int(s.client.currentEndpointIdx.Load())
	// Backoff (interruptible by ctx and cancel), clamped so the sleep
	// never overshoots the failover budget. Mirrors Java
	// QwpQueryClient.java:1569-1583: after the jittered delay,
	// recompute the remaining budget, give up if it is already spent,
	// and otherwise shrink the sleep to what remains.
	delay := computeBackoff(s.client.cfg, s.attempt)
	if !s.failoverDeadline.IsZero() {
		remaining := time.Until(s.failoverDeadline)
		if remaining <= 0 {
			return s.exhaustedEvent(ev), nil
		}
		if delay > remaining {
			delay = remaining
		}
	}
	// sleepInterruptible reports false when either ctx expired or
	// cancelCh was closed during the sleep; in both cases the original
	// transport event is returned and no new attempt is started.
	if !sleepInterruptible(ctx, s.cancelCh, delay) || s.isCancelled() {
		return ev, nil
	}
	// Re-bind to a different role-matching endpoint and replay. A
	// successful return increments s.attempt (via submit) and
	// publishes the new generation on the client.
	newInfo, replayErr := s.client.reconnectAndReplay(ctx, s, failedIdx)
	if replayErr != nil {
		if s.isCancelled() {
			// Cancel landed during the walk and connectWalk's boundary
			// poll short-circuited it. Surface the original transport
			// error rather than a connect-failed wrap, matching the
			// pre-walk and post-sleep cancel guards above.
			return ev, nil
		}
		// Reconnect failed — surface a transport error wrapping the
		// dial failure and the original cause. The caller's next
		// iteration will see this and either retry (if the budget
		// permits) or surface to the user. Thread the typed replayErr
		// (e.g. *QwpRoleMismatchError) so callers can errors.As
		// against it on a failover-time mismatch, matching the
		// initial-connect path.
		return qwpEvent{
			kind:         qwpEventKindTransportError,
			errMessage:   fmt.Sprintf("%v (after %v)", replayErr, lastErr),
			transportErr: fmt.Errorf("%w (after %w)", replayErr, lastErr),
		}, nil
	}
	// Reconnect and resubmit succeeded: synthesize the reset event that
	// tells the consumer a new generation is now streaming.
	return qwpEvent{
		kind:      qwpEventKindFailoverReset,
		requestId: s.currentRequestId.Load(),
		failoverReset: &QwpFailoverReset{
			NewNode:   newInfo,
			Attempt:   s.attempt,
			LastError: lastErr,
		},
	}, nil
}

// exhaustedEvent wraps a terminal transport event into a
// qwpEventKindTransportError event whose typed cause is a
// *QwpFailoverExhaustedError. Used at the point where the failover
// budget has been consumed so the caller can errors.As against the
// exhaustion shape and distinguish it from the first-attempt-failed
// case. Preserves the original event's underlying error (or its
// errMessage when no typed cause was attached) as the LastError so
// errors.Unwrap chains down to the actual transport fault.
func (s *qwpQuerySession) exhaustedEvent(ev qwpEvent) qwpEvent {
	// Prefer the typed cause; fall back to the string message, and to a
	// fixed placeholder when the event carried neither.
	cause := ev.transportErr
	if cause == nil {
		msg := ev.errMessage
		if msg == "" {
			msg = "qwp query: transport-terminal failure"
		}
		cause = errors.New(msg)
	}
	exhausted := &QwpFailoverExhaustedError{
		Attempts:  s.attempt,
		LastError: cause,
	}
	// Keep the original requestId; errMessage mirrors the typed error's
	// text so string-only consumers see the same description.
	return qwpEvent{
		kind:         qwpEventKindTransportError,
		requestId:    ev.requestId,
		errMessage:   exhausted.Error(),
		transportErr: exhausted,
	}
}

// computeBackoff is the full-jitter exponential schedule from
// QwpQueryClient.java:1557-1568. attempt is the 1-based count of
// completed (failed) attempts at the call site — i.e. attempt=1
// means the initial submission just failed and we are about to
// retry for the first time. The base doubles per step (initial,
// 2*initial, 4*initial, …) until the configured ceiling, then
// full-jitter draws the actual sleep uniformly from [0, base).
// Egress is single-user, so the lowest expected recovery time
// wins over the reconnect-storm damping that equal-jitter buys
// the shared ingress path (failover.md §3.1; ingress jitter in
// qwp_sf_round_walk.go's qwpSfComputeBackoff). attempt < 1,
// initial == 0, or a non-positive cap returns zero (no sleep).
+func computeBackoff(cfg *qwpQueryClientConfig, attempt int) time.Duration { + if attempt < 1 || cfg.failoverBackoffInitial == 0 { + return 0 + } + shift := attempt - 1 + if shift > 30 { + shift = 30 + } + d := cfg.failoverBackoffInitial << shift + if d <= 0 || d > cfg.failoverBackoffMax { + d = cfg.failoverBackoffMax + } + if d <= 0 { + return 0 + } + // Full-jitter: [0, base). rand.Int63n requires a positive + // argument; the d > 0 guard above keeps that contract. + return time.Duration(rand.Int63n(int64(d))) +} + +// sleepInterruptible blocks for d, returning early when ctx expires +// or cancelCh is closed. Returns true if the full sleep completed, +// false if interrupted. Zero d returns immediately. +func sleepInterruptible(ctx context.Context, cancelCh <-chan struct{}, d time.Duration) bool { + if d <= 0 { + return true + } + timer := time.NewTimer(d) + defer timer.Stop() + select { + case <-timer.C: + return true + case <-ctx.Done(): + return false + case <-cancelCh: + return false + } +} diff --git a/qwp_query_integration_test.go b/qwp_query_integration_test.go new file mode 100644 index 00000000..d8ca78a6 --- /dev/null +++ b/qwp_query_integration_test.go @@ -0,0 +1,599 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package questdb

import (
	"context"
	"errors"
	"fmt"
	"testing"
	"time"
)

// newTestQueryClient opens an egress QwpQueryClient against the live
// local server. Skips the test if the server is unreachable (same
// policy as qwpEnsureServer). The 5s context bounds only the connect
// call; it is cancelled when this helper returns, so callers supply
// their own context for queries and Close.
func newTestQueryClient(t *testing.T) *QwpQueryClient {
	t.Helper()
	qwpEnsureServer(t)
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	c, err := NewQwpQueryClient(ctx, WithQwpQueryAddress(qwpTestAddr))
	if err != nil {
		t.Fatalf("NewQwpQueryClient: %v", err)
	}
	return c
}

// insertRows ingests `rows` rows into `tableName` via a QwpSender.
// Used to seed data before exercising the egress query path. Row i
// carries symbol host="server{i%3}", long v=i, and a designated
// timestamp of 2024-06-15T12:00:00Z plus i seconds. Returns only after
// qwpWaitForRows has been called for the expected row count.
func insertRows(t *testing.T, tableName string, rows int) {
	t.Helper()
	ctx := context.Background()
	s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr,
		qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, nil)
	if err != nil {
		t.Fatalf("newQwpLineSender: %v", err)
	}
	defer s.Close(ctx)
	base := time.Date(2024, 6, 15, 12, 0, 0, 0, time.UTC)
	for i := 0; i < rows; i++ {
		err = s.Table(tableName).
			Symbol("host", fmt.Sprintf("server%d", i%3)).
			Int64Column("v", int64(i)).
			At(ctx, base.Add(time.Duration(i)*time.Second))
		if err != nil {
			t.Fatalf("At: %v", err)
		}
	}
	// Flush explicitly so every buffered row is sent before waiting.
	if err := s.Flush(ctx); err != nil {
		t.Fatalf("Flush: %v", err)
	}
	qwpWaitForRows(t, tableName, rows)
}

// TestQwpIntegrationQuerySimpleSelect inserts three rows via ingest,
// queries them via egress, and verifies the iterator yields the
// correct values with TotalRows set from RESULT_END.
func TestQwpIntegrationQuerySimpleSelect(t *testing.T) {
	const tableName = "qwp_integ_query_simple"
	qwpEnsureServer(t)
	qwpDropTable(t, tableName)
	defer qwpDropTable(t, tableName)

	insertRows(t, tableName, 3)

	c := newTestQueryClient(t)
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	defer c.Close(ctx)

	q := c.Query(ctx, fmt.Sprintf("SELECT v FROM '%s' ORDER BY v", tableName))
	defer q.Close()

	// Collect column 0 of every row across all batches.
	var got []int64
	for batch, err := range q.Batches() {
		if err != nil {
			t.Fatalf("iter err: %v", err)
		}
		rows := batch.RowCount()
		for r := 0; r < rows; r++ {
			got = append(got, batch.Int64(0, r))
		}
	}
	if len(got) != 3 {
		t.Fatalf("got %d rows, want 3 (values %v)", len(got), got)
	}
	// insertRows wrote v=0,1,2 and the query orders by v, so the value
	// at position i must equal i.
	for i, v := range got {
		if v != int64(i) {
			t.Errorf("row %d: got v=%d, want %d", i, v, i)
		}
	}
	if q.TotalRows() != 3 {
		t.Errorf("TotalRows=%d, want 3", q.TotalRows())
	}
}

// TestQwpIntegrationQueryError runs a SELECT against a nonexistent
// table and verifies the server's QUERY_ERROR surfaces as a
// *QwpQueryError with a useful message.
func TestQwpIntegrationQueryError(t *testing.T) {
	c := newTestQueryClient(t)
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	defer c.Close(ctx)

	q := c.Query(ctx, "SELECT * FROM qwp_integ_does_not_exist_xyz")
	defer q.Close()

	// Drain the iterator, remembering the last non-nil error it yields.
	var lastErr error
	for _, err := range q.Batches() {
		if err != nil {
			lastErr = err
		}
	}
	if lastErr == nil {
		t.Fatal("expected query error, got nil")
	}
	// The failure must be the typed server-side error, not a transport
	// or context error.
	var qe *QwpQueryError
	if !errors.As(lastErr, &qe) {
		t.Fatalf("err type=%T, want *QwpQueryError: %v", lastErr, lastErr)
	}
	if qe.Message == "" {
		t.Errorf("QwpQueryError.Message is empty — expected a server description")
	}
}

// TestQwpIntegrationExecDDL runs a CREATE TABLE via Exec, verifies it
// returns cleanly, and then confirms through the qwpQuery helper that
// a SELECT count() against the new table succeeds. Cleanup is done by
// the deferred qwpDropTable helper; this test does not issue a DROP
// through Exec.
func TestQwpIntegrationExecDDL(t *testing.T) {
	const tableName = "qwp_integ_exec_ddl"
	qwpEnsureServer(t)
	qwpDropTable(t, tableName) // ensure clean slate
	defer qwpDropTable(t, tableName)

	c := newTestQueryClient(t)
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	defer c.Close(ctx)

	createSQL := fmt.Sprintf(
		"CREATE TABLE '%s' (ts TIMESTAMP, v LONG) TIMESTAMP(ts) PARTITION BY DAY WAL",
		tableName)
	if _, err := c.Exec(ctx, createSQL); err != nil {
		t.Fatalf("Exec(CREATE): %v", err)
	}
	// Verify via the HTTP exec endpoint that the table now exists.
	// count() over the empty table is expected to come back as exactly
	// one result row.
	res := qwpQuery(t, fmt.Sprintf("SELECT count() FROM '%s'", tableName))
	if res.Count != 1 || len(res.Dataset) == 0 {
		t.Errorf("CREATE TABLE did not produce a table: %+v", res)
	}
}

// TestQwpIntegrationQueryFromConf exercises the ws:: config-string
// entry point, proving QwpQueryClientFromConf dials the live server
// with the same behavior as the functional-options constructor.
+func TestQwpIntegrationQueryFromConf(t *testing.T) { + const tableName = "qwp_integ_query_fromconf" + qwpEnsureServer(t) + qwpDropTable(t, tableName) + defer qwpDropTable(t, tableName) + insertRows(t, tableName, 1) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + c, err := QwpQueryClientFromConf(ctx, "ws::addr="+qwpTestAddr+";") + if err != nil { + t.Fatalf("QwpQueryClientFromConf: %v", err) + } + defer c.Close(ctx) + + q := c.Query(ctx, fmt.Sprintf("SELECT v FROM '%s'", tableName)) + defer q.Close() + + var rows int + for batch, err := range q.Batches() { + if err != nil { + t.Fatalf("iter err: %v", err) + } + rows += batch.RowCount() + } + if rows != 1 { + t.Errorf("rows=%d, want 1", rows) + } +} + +// TestQwpIntegrationQueryMultipleBatches asks the server to stream a +// larger result set (enough rows to cross a batch boundary at the +// server-default batch cap). Uses WithQwpQueryMaxBatchRows to force +// multiple batches even when row counts are modest, and verifies the +// iterator yields them all in order. 
func TestQwpIntegrationQueryMultipleBatches(t *testing.T) {
	const tableName = "qwp_integ_query_multibatch"
	qwpEnsureServer(t)
	qwpDropTable(t, tableName)
	defer qwpDropTable(t, tableName)
	const totalRows = 50
	insertRows(t, tableName, totalRows)

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	c, err := NewQwpQueryClient(ctx,
		WithQwpQueryAddress(qwpTestAddr),
		WithQwpQueryMaxBatchRows(10),
	)
	if err != nil {
		t.Fatalf("NewQwpQueryClient: %v", err)
	}
	defer c.Close(ctx)

	q := c.Query(ctx, fmt.Sprintf("SELECT v FROM '%s' ORDER BY v", tableName))
	defer q.Close()

	// rows doubles as the expected value: insertRows wrote v=0..49 and
	// the query orders by v, so the k-th row overall must carry v=k.
	var rows, batches int
	for batch, err := range q.Batches() {
		if err != nil {
			t.Fatalf("iter err: %v", err)
		}
		batches++
		n := batch.RowCount()
		for r := 0; r < n; r++ {
			want := int64(rows)
			if got := batch.Int64(0, r); got != want {
				t.Errorf("row %d (batch %d): got %d, want %d", rows, batches, got, want)
			}
			rows++
		}
	}
	if rows != totalRows {
		t.Errorf("rows=%d, want %d", rows, totalRows)
	}
	if batches < 2 {
		t.Errorf("batches=%d, want >=2 (max_batch_rows=10 with %d rows)", batches, totalRows)
	}
	if q.TotalRows() != int64(totalRows) {
		t.Errorf("TotalRows=%d, want %d", q.TotalRows(), totalRows)
	}
}

// TestQwpIntegrationCompressedBatches round-trips a SELECT with
// compression=zstd against the live server. Verifies the accept-
// encoding handshake negotiates zstd (if the server supports it),
// every RESULT_BATCH's FLAG_ZSTD bit drives the decompression path,
// and the decoded values match what we ingested. Enough rows (50) to
// cross a batch boundary so at least one compressed batch is
// guaranteed to be non-trivial.
//
// When the server does not support zstd, the handshake falls back to
// raw per the accept-encoding semantics ("zstd;level=3,raw" lists raw
// as an acceptable alternative). The client still succeeds; this test
// just won't exercise the decompression path in that case. A log line
// calls out which branch ran so test output makes the coverage
// obvious.
func TestQwpIntegrationCompressedBatches(t *testing.T) {
	const tableName = "qwp_integ_query_zstd"
	qwpEnsureServer(t)
	qwpDropTable(t, tableName)
	defer qwpDropTable(t, tableName)
	const totalRows = 50
	insertRows(t, tableName, totalRows)

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	c, err := NewQwpQueryClient(ctx,
		WithQwpQueryAddress(qwpTestAddr),
		WithQwpQueryCompression(qwpCompressionZstd),
		WithQwpQueryMaxBatchRows(10),
	)
	if err != nil {
		t.Fatalf("NewQwpQueryClient: %v", err)
	}
	defer c.Close(ctx)

	q := c.Query(ctx, fmt.Sprintf("SELECT v FROM '%s' ORDER BY v", tableName))
	defer q.Close()

	var rows, batches, compressedBatches int
	for batch, err := range q.Batches() {
		if err != nil {
			t.Fatalf("iter err: %v", err)
		}
		batches++
		// A non-empty zstdScratch is used here as the signal that this
		// batch went through the decompression path.
		if len(batch.zstdScratch) > 0 {
			compressedBatches++
		}
		n := batch.RowCount()
		for r := 0; r < n; r++ {
			want := int64(rows)
			if got := batch.Int64(0, r); got != want {
				t.Errorf("row %d (batch %d): got %d, want %d", rows, batches, got, want)
			}
			rows++
		}
	}
	if rows != totalRows {
		t.Errorf("rows=%d, want %d", rows, totalRows)
	}
	if q.TotalRows() != int64(totalRows) {
		t.Errorf("TotalRows=%d, want %d", q.TotalRows(), totalRows)
	}
	// Informational only: the raw fallback is not a failure.
	if compressedBatches == 0 {
		t.Logf("server accepted compression=zstd advertisement but sent no compressed batches (fell back to raw)")
	} else {
		t.Logf("%d of %d batches arrived zstd-compressed", compressedBatches, batches)
	}
}

// TestQwpIntegrationCancelLongRunningQuery submits a query that runs
// long enough to be interrupted, invokes Cancel from inside the
// iteration loop right after the first batch arrives, and verifies
// iteration ends cleanly (the server's CANCELLED echo is swallowed by
// the cursor's cancel-aware error path). Unlike older revisions that
// only checked `saw >= 1`, this test also verifies the post-cancel
// invariant that actually matters in production: the client's
// dispatcher returned to idle so a follow-up Query can round-trip
// without stranding.
//
// We deliberately do NOT assert that Cancel short-circuited the
// server: long_sequence streams tens of millions of rows per second
// on localhost and races past Cancel() before the cancel frame
// reaches the server. What we guarantee is (a) the iterator does not
// panic or hang, and (b) the client is reusable after the iteration
// ends — whichever side (cancel or natural RESULT_END) won the race.
func TestQwpIntegrationCancelLongRunningQuery(t *testing.T) {
	qwpEnsureServer(t)
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	// Small batches so the iterator enters the yield body before the
	// server has finished streaming — otherwise saw stays 0 for fast
	// queries.
	c, err := NewQwpQueryClient(ctx,
		WithQwpQueryAddress(qwpTestAddr),
		WithQwpQueryMaxBatchRows(500),
	)
	if err != nil {
		t.Fatalf("NewQwpQueryClient: %v", err)
	}
	defer c.Close(ctx)

	q := c.Query(ctx, "SELECT x FROM long_sequence(10000000)")

	start := time.Now()
	var saw int
	for _, err := range q.Batches() {
		if err != nil {
			q.Close()
			t.Fatalf("unexpected iter err: %v", err)
		}
		saw++
		// Cancel exactly once, on the first batch, then keep iterating
		// until the stream ends on its own.
		if saw == 1 {
			q.Cancel()
		}
	}
	elapsed := time.Since(start)
	q.Close()

	if saw < 1 {
		t.Errorf("saw %d batches, want >= 1", saw)
	}
	// Cancel must not deadlock the iterator — 15s is generous for
	// 10M rows on a local server whether the cancel short-circuits
	// or the server finishes streaming naturally.
	if elapsed > 15*time.Second {
		t.Errorf("iteration took %v — suggests cancel-drain hung", elapsed)
	}

	// Client must stay usable: a follow-up Query should round-trip
	// cleanly whether the cancel or the natural RESULT_END won. This
	// is the real production-visible property — a broken cancel-drain
	// would leave the dispatcher stranded and the next Query would
	// block forever on the single-slot requests channel.
	q2 := c.Query(ctx, "SELECT 1")
	var rows int
	for batch, err := range q2.Batches() {
		if err != nil {
			q2.Close()
			t.Fatalf("follow-up query err: %v", err)
		}
		rows += batch.RowCount()
	}
	q2.Close()
	if rows != 1 {
		t.Errorf("follow-up query rows=%d, want 1", rows)
	}
}

// TestQwpIntegrationCtxDeadlineMidStream exercises the other shutdown
// path through Batches(): the query's ctx expires while the iterator
// is blocked in takeEvent. The iterator must yield the ctx error once,
// then kick the dispatcher (cancel + drain on a fresh cleanup ctx) so
// the client stays usable. Complements the explicit-Cancel test above
// which exits via the !keepGoing break-out.
func TestQwpIntegrationCtxDeadlineMidStream(t *testing.T) {
	c := newTestQueryClient(t)
	clientCtx, clientCancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer clientCancel()
	defer c.Close(clientCtx)

	// A short ctx on the Query itself; long enough to establish the
	// stream but short enough to expire mid-flight. The row count must
	// give the deadline a wide window to land in: 100M int64 rows stream
	// in ~1.2s (and linearly longer on slower CI), giving the 200ms deadline
	// headroom on either end.
	queryCtx, queryCancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
	defer queryCancel()
	q := c.Query(queryCtx, "SELECT x FROM long_sequence(100000000)")

	start := time.Now()
	var iterErr error
	var saw int
	for batch, err := range q.Batches() {
		if err != nil {
			iterErr = err
			break
		}
		saw++
		_ = batch
	}
	elapsed := time.Since(start)
	q.Close()

	if iterErr == nil {
		t.Fatal("expected ctx-deadline error from the iterator, got nil")
	}
	if !errors.Is(iterErr, context.DeadlineExceeded) {
		t.Errorf("iter err = %v, want context.DeadlineExceeded", iterErr)
	}
	if elapsed > 15*time.Second {
		t.Errorf("iteration took %v — ctx expiry did not unblock the iterator", elapsed)
	}
	// saw is counted but deliberately not asserted on; the blank
	// assignment only marks it as used.
	_ = saw

	// Client-level ctx is still live; the dispatcher should be back to
	// idle thanks to cancelAndDrainOnCleanupCtx. A follow-up query
	// confirms we did not strand the connection.
	q2 := c.Query(clientCtx, "SELECT 1")
	var rows int
	for batch, err := range q2.Batches() {
		if err != nil {
			q2.Close()
			t.Fatalf("follow-up query err after ctx-expiry teardown: %v", err)
		}
		rows += batch.RowCount()
	}
	q2.Close()
	if rows != 1 {
		t.Errorf("follow-up rows=%d, want 1", rows)
	}
}

// TestQwpIntegrationClientCloseDuringLongQuery exercises the
// transport-teardown path: while a long-running SELECT is mid-stream,
// another goroutine closes the QwpQueryClient. The iterator must see a
// transport error (the read side fails once the WebSocket close frame
// lands) and exit without hanging. This is the closest we can get, in
// an integration test, to a server-initiated connection close — the
// local close also tears down the read direction and surfaces through
// the same code path.
//
// Does NOT read the batch's aliased slices after Close is called — the
// public contract explicitly flags that as undefined (the transport
// may free the underlying buffer). RowCount is safe because it reads
// an integer field, not a payload-backed slice.
func TestQwpIntegrationClientCloseDuringLongQuery(t *testing.T) {
	c := newTestQueryClient(t)
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	q := c.Query(ctx, "SELECT x FROM long_sequence(10000000)")

	start := time.Now()
	// closed is closed by the goroutine below once c.Close has
	// returned.
	closed := make(chan struct{})
	var saw int
	var iterErr error
	for batch, err := range q.Batches() {
		if err != nil {
			iterErr = err
			break
		}
		saw++
		_ = batch.RowCount()
		// On the first batch, close the client from another goroutine
		// while this one keeps iterating.
		if saw == 1 {
			go func() {
				closeCtx, closeCancel := context.WithTimeout(
					context.Background(), 5*time.Second)
				defer closeCancel()
				_ = c.Close(closeCtx)
				close(closed)
			}()
		}
	}
	elapsed := time.Since(start)
	q.Close()

	// Wait for the closing goroutine so it never outlives the test.
	select {
	case <-closed:
	case <-time.After(10 * time.Second):
		t.Fatal("client Close did not return within 10s of starting")
	}

	if saw < 1 {
		t.Errorf("saw %d batches before close, want >= 1", saw)
	}
	if iterErr == nil {
		t.Error("expected the iterator to surface a transport error after client Close")
	}
	if elapsed > 15*time.Second {
		t.Errorf("iteration took %v — client Close did not unblock the iterator", elapsed)
	}
}

// TestQwpIntegrationQueryWithBinds exercises the bind-variable path
// against the live server. Inserts a handful of rows, then runs the
// same filtered SELECT three times with different bind values and
// verifies the server returns the expected result for each set. Two
// goals: (a) confirm the bind wire payload is accepted by the server
// (no protocol mismatch with the Java / C client encoders), and (b)
// confirm repeated calls with the same SQL text produce the expected
// per-call result sets.
func TestQwpIntegrationQueryWithBinds(t *testing.T) {
	const tableName = "qwp_integ_binds"
	qwpEnsureServer(t)
	qwpDropTable(t, tableName)
	defer qwpDropTable(t, tableName)

	insertRows(t, tableName, 9) // host cycles through server0 / server1 / server2

	c := newTestQueryClient(t)
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	defer c.Close(ctx)

	// One SQL text reused for every case; only the bind values change.
	sql := fmt.Sprintf(
		"SELECT v FROM '%s' WHERE host = $1 AND v >= $2 ORDER BY v", tableName)

	type tc struct {
		host   string
		minV   int64
		wantVs []int64
	}
	cases := []tc{
		// insertRows writes v=i with host="server{i%3}":
		//   server0: 0, 3, 6
		//   server1: 1, 4, 7
		//   server2: 2, 5, 8
		{host: "server0", minV: 0, wantVs: []int64{0, 3, 6}},
		{host: "server1", minV: 4, wantVs: []int64{4, 7}},
		{host: "server2", minV: 10, wantVs: nil},
	}

	for _, tc := range cases {
		t.Run(tc.host, func(t *testing.T) {
			// Bind indexes passed to the builder are 0-based: index 0
			// fills $1 and index 1 fills $2.
			q := c.Query(ctx, sql, WithQwpQueryBinds(func(b *QwpBinds) {
				b.VarcharBind(0, tc.host).LongBind(1, tc.minV)
			}))
			defer q.Close()

			var got []int64
			for batch, err := range q.Batches() {
				if err != nil {
					t.Fatalf("iter err: %v", err)
				}
				for r := 0; r < batch.RowCount(); r++ {
					got = append(got, batch.Int64(0, r))
				}
			}
			// Compare lengths first so the index into wantVs below
			// cannot go out of range.
			if len(got) != len(tc.wantVs) {
				t.Fatalf("got %d rows, want %d (values %v want %v)",
					len(got), len(tc.wantVs), got, tc.wantVs)
			}
			for i, v := range got {
				if v != tc.wantVs[i] {
					t.Errorf("row %d: got v=%d, want %d", i, v, tc.wantVs[i])
				}
			}
		})
	}
}
diff --git a/qwp_query_io.go b/qwp_query_io.go new file mode 100644 index 00000000..743daa2d --- /dev/null +++ b/qwp_query_io.go @@ -0,0 +1,1113 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + *
Copyright (c) 2019-2026 QuestDB
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 ******************************************************************************/

package questdb

import (
	"context"
	"errors"
	"fmt"
	"io"
	"sync"
	"sync/atomic"

	"github.com/coder/websocket"
)

// qwpEventKind tags a qwpEvent. Every kind except
// qwpEventKindFailoverReset is produced by the egress I/O goroutine;
// see the per-constant comments below.
type qwpEventKind byte

const (
	qwpEventKindBatch    qwpEventKind = iota + 1 // RESULT_BATCH: batch field valid
	qwpEventKindEnd                              // RESULT_END: totalRows valid
	qwpEventKindExecDone                         // EXEC_DONE: execResult valid
	// qwpEventKindError is the server's QUERY_ERROR frame. The
	// connection is still healthy; the next query may submit on the
	// same I/O goroutine. Surfaced to the user as *QwpQueryError.
	qwpEventKindError
	// qwpEventKindTransportError is a synthesized client-side terminal
	// failure: reader closed, server closed, decoder out of sync, send
	// failure, or unknown msg_kind. The connection's per-connection
	// state is no longer trustworthy and the I/O goroutine has
	// poisoned ioErr — every emission of this kind goes through
	// poisonAndEmitError, so a follow-up submitQuery returns the
	// original cause synchronously. Routed by the failover orchestrator
	// to the reconnect-and-replay path; surfaced to the user as a
	// plain error when failover is disabled or exhausted.
	qwpEventKindTransportError
	// qwpEventKindFailoverReset is emitted by the session orchestrator
	// (not the I/O goroutine) on the consumer side after a successful
	// reconnect and resubmit. Carries the new generation's
	// QwpServerInfo so the user can discard accumulated rows from the
	// prior connection. Internal to qwp_query_failover.go;
	// qwp_query_io.go never produces this kind directly.
	qwpEventKindFailoverReset
)

// qwpEvent is the discriminated-union event carried on qwpEgressIO.events
// from the I/O goroutine to the user. Fields are valid only for the
// matching kind — see the constants above. The zero kind (0) is not a
// valid event: the constants start at iota + 1.
type qwpEvent struct {
	kind      qwpEventKind
	requestId int64

	// Batch kind
	batch *qwpBatchBuffer

	// End kind
	totalRows int64

	// ExecDone kind
	execResult ExecResult

	// Error kind — carries the server-reported QUERY_ERROR status +
	// message. TransportError kind reuses errMessage; status is
	// always 0 on the synthesized variant since it does not
	// correspond to a server status byte. FailoverReset kind reuses
	// failoverReset.
	errStatus  QwpStatusCode
	errMessage string

	// TransportError kind — optional typed cause. When set, consumers
	// wrap with %w so callers can errors.As against the underlying
	// type (e.g. *QwpRoleMismatchError raised by a failed reconnect).
	// Nil for I/O-goroutine-emitted transport errors that only carry
	// a string message via poisonAndEmitError.
	transportErr error

	// FailoverReset kind — populated by qwp_query_failover.go after a
	// successful reconnect; carries through to the user as
	// *QwpFailoverReset. Nil for any other kind.
	failoverReset *QwpFailoverReset
}

// qwpBatchBuffer is a pool-owned container for one decoded
// RESULT_BATCH. The I/O goroutine borrows a buffer from the pool before
// calling qwpQueryDecoder.decode into buf.batch; the user's consumer
// returns it to the pool via release() after processing.
//
// Lifetime: while the user holds the buffer, io.buffers is missing one
// slot. The I/O goroutine stops reading new frames once the pool is
// empty, providing natural backpressure against slow consumers.
type qwpBatchBuffer struct {
	batch QwpColumnBatch
	// payloadLen is the number of bytes the server spent on this batch
	// (== len(payload)). Captured at decode time so release() can feed
	// it to the credit-replenish counter when flow control is enabled.
	payloadLen int
	// requestId is the query this batch belongs to, stamped at decode
	// time. release() compares it against io.creditRequestId so a late
	// release cannot credit a different query's window.
	requestId int64
	// io is the back-reference used by release() to return the buffer
	// to its owning pool.
	io *qwpEgressIO
	// frameBuf is the recycled WS read buffer this batch's columns
	// alias on the zero-copy raw path. Non-nil only while a raw batch
	// is outstanding; releaseBuffer returns it to io.readBufPool and
	// clears it. nil on the zstd path (columns alias zstdScratch) and
	// on every error/orphan path (those let GC reclaim).
	frameBuf *[]byte
}

// release hands the buffer back to the I/O goroutine's free pool. Safe
// to call at most once per batch event; further calls have undefined
// buffer-ownership semantics (the decoder may already be writing into
// the batch). Non-blocking — the pool is sized so a live I/O goroutine
// always has exactly one free slot for each buffer currently held
// outside.
func (b *qwpBatchBuffer) release() {
	// Thin forwarder: all pool, credit and notify logic lives in
	// qwpEgressIO.releaseBuffer.
	b.io.releaseBuffer(b)
}

// qwpRequest is a pending query submission handed from the user
// goroutine to the I/O goroutine via qwpEgressIO.requests.
type qwpRequest struct {
	sql string
	// requestId is the client-assigned 64-bit id echoed back on every
	// frame for this query. The user assigns monotonically from a
	// per-client counter (see step 8).
	requestId int64
	// initialCredit is the server's send-ahead byte budget. 0 means
	// "unbounded" — no CREDIT frames exchanged. A positive value
	// opts the query into flow control: the server streams at most
	// initialCredit bytes before parking, and the I/O goroutine
	// replenishes by each batch's byte length after the consumer
	// releases its buffer.
	initialCredit int64
	// bindCount is the number of typed bind parameters encoded in
	// bindPayload, or 0 when the query has no binds.
	bindCount int
	// bindPayload is the pre-encoded typed bind-parameter block for
	// this query, or nil when bindCount == 0. Owned per request —
	// buildRequest copies from QwpQueryClient's reusable bind scratch
	// into this fresh slice before submitQuery, so a follow-up
	// query's reset + re-encode cannot race the dispatcher.
	bindPayload []byte
}

// qwpEgressIO owns the WebSocket transport plus the per-connection
// decoder state, runs the dedicated receive + dispatch goroutines,
// and shuttles events to the consumer.
//
// Goroutine topology (two worker goroutines; start() also launches a
// third, trivial one that only waits for both workers and then closes
// doneCh):
//
//   - reader: blocks reading the next WebSocket message and pushes each
//     incoming frame to frameCh. Never sees cancel / credit / user
//     state. Exits when the server closes the connection or shutdown
//     cancels ioCtx.
//
//   - dispatcher (aka the "main" I/O goroutine): selects on frameCh /
//     notifyCh / shutdownCh, drains the cancel + credit atomics,
//     dispatches frames to the decoder, and emits events to the user.
//
// This split is deliberate: coder/websocket closes the underlying TCP
// connection when a Read's context is cancelled mid-frame, so we can
// NOT use ctx cancellation as a "kick" signal to drain pending
// cancels. Instead, the dispatcher listens on notifyCh alongside
// frameCh, reacts to user-initiated state changes without touching
// the Read, and only cancels ioCtx on final shutdown (when
// destroying the connection is acceptable).
//
// Lifecycle: newQwpEgressIO → start → (submitQuery → takeEvent... →
// release... [+ requestCancel])* → shutdown.
//
// Threading contract:
//   - submitQuery, takeEvent, releaseBuffer: single user goroutine.
//     Concurrent submitQueries / takeEvents are not guaranteed to be
//     safe; Phase-1 supports one query in flight at a time.
//   - requestCancel: any goroutine.
//   - shutdown: any goroutine; idempotent.
//
// Not a public type — wrapped by QwpQueryClient in step 8.
type qwpEgressIO struct {
	transport *qwpTransport
	decoder   qwpQueryDecoder

	// buffers is the free-buffer pool. The dispatcher takes one
	// before decoding a RESULT_BATCH; the user returns it via
	// release() after processing. Capacity == bufferPoolSize.
	buffers chan *qwpBatchBuffer

	// readBufPool recycles the raw WS frame byte buffers the reader
	// reads each message into. Replaces a per-frame io.ReadAll
	// allocation inside coder/websocket.Conn.Read (the dominant
	// egress allocation source). Holds *[]byte so the grown capacity
	// survives reuse. sync.Pool (not a sized chan) keeps the
	// prototype's ownership surface small: only the steady-state raw
	// RESULT_BATCH path recycles; every error/orphan path simply
	// drops its buffer and lets GC reclaim, which sync.Pool tolerates.
	readBufPool sync.Pool

	// events carries all outbound events to the consumer. Capacity ==
	// bufferPoolSize+2 so a trailing End/Error after every buffered
	// batch fits without blocking the producer. Closed by the
	// dispatcher on exit so a consumer parked on takeEvent wakes with
	// ok=false (rather than on a best-effort sentinel that could be
	// dropped when the channel is full).
	events chan qwpEvent

	// requests is the submission slot. Single-entry: Phase-1 assumes
	// one query at a time.
	requests chan qwpRequest

	// frameCh carries received frames from the reader to the
	// dispatcher. Unbuffered: the reader blocks until the dispatcher
	// is ready, which naturally backpressures the server via the
	// TCP window. Closed by the reader when it exits.
	frameCh chan qwpReaderEvent

	// notifyCh wakes the dispatcher when the user changes state
	// (requestCancel, releaseBuffer). Buffered size 1 with a non-
	// blocking send semantic (concurrent notifies coalesce): the
	// dispatcher drains the atomic on the next loop iteration, so
	// one pending notify always suffices to re-check.
	notifyCh chan struct{}

	// cancelRequestId is the pending-cancel latch. requestCancel
	// stores the to-be-cancelled requestId here; the dispatcher
	// swaps it back to -1 at every loop boundary and sends a CANCEL
	// frame if non-negative.
	cancelRequestId atomic.Int64

	// pendingCredit accumulates bytes to CREDIT-replenish on the next
	// loop iteration. release() Adds; the dispatcher Swaps(0). Only
	// consulted when creditEnabled.
	pendingCredit atomic.Int64

	// creditRequestId is the request_id whose CREDIT window pendingCredit
	// is currently feeding — the dispatcher publishes it (atomically,
	// since release() runs on the user goroutine) when it begins serving
	// a query. releaseBuffer credits only a buffer whose own requestId
	// still matches this, so a buffer released after its query ended and
	// the next one started cannot pour stale bytes into the new query's
	// window.
	creditRequestId atomic.Int64

	// ioCtx / ioCancel gate every conn-level I/O this struct owns —
	// the reader's conn.Read and the dispatcher's conn.Write calls
	// (sendQueryRequest / sendCancel / sendCredit). Cancelled on
	// shutdown() to unblock both sides: cancelling tears down the
	// underlying conn via coder/websocket's ctx-driven AfterFunc,
	// which is fine at shutdown.
	//
	// Reusing the same ctx for both directions is deliberate. If only
	// the reader's Read ctx is cancelled and the dispatcher is parked
	// in Write on a peer that has stopped draining, Read's AfterFunc
	// tears down rwc only while Read is active; between Reads (e.g.
	// after the reader has consumed a frame and is parked on
	// frameCh), the AfterFunc is unregistered and shutdown can't
	// reach the dispatcher. The Write ctx closes that gap.
	ioCtx    context.Context
	ioCancel context.CancelFunc

	// shutdownCh closes when shutdown() is called for the first time.
	// doneCh closes when BOTH dispatcher and reader goroutines have
	// exited — shutdown() blocks on doneCh, so a shutdown() that returns
	// nil has fully joined both goroutines. A short-ctx shutdown() may
	// instead return early via ctx.Done() with the goroutines still
	// winding down; the transport teardown that follows stays race-free
	// regardless because the conn field is immutable after connect (see
	// qwpTransport.conn).
	shutdownCh   chan struct{}
	doneCh       chan struct{}
	shutdownOnce sync.Once
	shutdownWG   sync.WaitGroup
	// closed is set true right before the dispatcher returns so
	// releaseBuffer can early-exit instead of attempting a send on a
	// pool nobody reads from.
	closed atomic.Bool

	// sendBuf is scratch for QUERY_REQUEST / CANCEL / CREDIT frames.
	// Owned by the dispatcher; never aliased outside.
	sendBuf qwpWireBuffer

	// Per-query state, accessed only from the dispatcher.
	currentRequestId int64
	creditEnabled    bool
	currentQueryDone bool

	// ioErrMu guards ioErr. Set on the dispatcher goroutine from any
	// decoder- or framing-level error path; read on the user goroutine
	// from submitQuery.
	ioErrMu sync.Mutex
	// ioErr latches the first transport-class error for the life of
	// this connection: any reader-error / server-close, send failure,
	// decoder/framing desync, or unknown msg_kind. Once set, every
	// subsequent submitQuery returns this error synchronously so a
	// fresh query is never decoded against a desynced
	// qwpConnDict / zstd stream — an undetectable
	// subset of out-of-range reads could leave the dict accidentally
	// in sync with the server (offsets match) while values are wrong,
	// producing silently corrupted results — and never sent on a dead
	// conn either. Mirrors the ingress send loop's latched terminal
	// error (recordFatal / sendLoopCheckError in qwp_sf_send_loop.go).
	ioErr error
}

// qwpReaderEvent is what the reader goroutine hands to the dispatcher:
// either a successfully received binary frame (payload != nil, err ==
// nil) or a read error (payload == nil, err != nil). Non-binary frames
// are dropped inside the reader.
type qwpReaderEvent struct {
	payload []byte
	// bufRef is the pooled buffer that backs payload, or nil for an
	// error event / a payload not drawn from io.readBufPool. The
	// dispatcher either hands it to the batch buffer (raw path, freed
	// at releaseBuffer) or returns it to the pool immediately.
	bufRef *[]byte
	err    error
}

// newQwpEgressIO constructs an I/O controller attached to an already-
// connected transport. bufferPoolSize is the depth of the decode pool;
// must be >= 1 (the constructor panics otherwise).
+func newQwpEgressIO(tr *qwpTransport, bufferPoolSize int) *qwpEgressIO { + if bufferPoolSize < 1 { + panic("qwp: bufferPoolSize must be >= 1") + } + ioCtx, ioCancel := context.WithCancel(context.Background()) + io := &qwpEgressIO{ + transport: tr, + buffers: make(chan *qwpBatchBuffer, bufferPoolSize), + events: make(chan qwpEvent, bufferPoolSize+2), + requests: make(chan qwpRequest, 1), + frameCh: make(chan qwpReaderEvent), + notifyCh: make(chan struct{}, 1), + ioCtx: ioCtx, + ioCancel: ioCancel, + shutdownCh: make(chan struct{}), + doneCh: make(chan struct{}), + } + io.readBufPool.New = func() any { + b := make([]byte, 0, 64*1024) + return &b + } + io.cancelRequestId.Store(-1) + io.currentRequestId = -1 + io.creditRequestId.Store(-1) + for i := 0; i < bufferPoolSize; i++ { + io.buffers <- &qwpBatchBuffer{io: io} + } + return io +} + +// start launches the dispatcher + reader goroutines. Must be called +// exactly once, before the first submitQuery. +// +// doneCh is closed by the WaitGroup-tracked wrapper once both +// goroutines have returned — not by the dispatcher alone — so a +// shutdown() that observes doneCh has joined the reader and dispatcher, +// not just the dispatcher. +func (io *qwpEgressIO) start() { + // Pin the decoder to the version the transport negotiated so + // parseFrameHeader rejects any server frame whose header version + // byte doesn't match (spec §3 strict-equality requirement). + io.decoder.negotiatedVersion = io.transport.negotiatedVersion + io.shutdownWG.Add(2) + go func() { + defer io.shutdownWG.Done() + io.dispatcherRun() + }() + go func() { + defer io.shutdownWG.Done() + io.readerRun() + }() + go func() { + io.shutdownWG.Wait() + close(io.doneCh) + }() +} + +// submitQuery hands the request to the I/O goroutine. Blocks if a +// prior query's submission has not yet been picked up (single-slot +// queue). 
Returns ctx.Err() on user cancellation, a sentinel error +// if the I/O goroutine has shut down, or the latched ioErr if a +// prior decoder/framing failure has poisoned the connection (a fresh +// submit would be decoded against desynced state). +func (io *qwpEgressIO) submitQuery(ctx context.Context, req qwpRequest) error { + if err := io.loadIoErr(); err != nil { + return err + } + // Non-blocking shutdown check first: if shutdownCh is already + // closed, Go's select would otherwise non-deterministically pick + // the buffered requests slot, leaving the request to rot after + // the dispatcher has already returned. + select { + case <-io.shutdownCh: + return errors.New("qwp: I/O goroutine shut down") + default: + } + select { + case io.requests <- req: + return nil + case <-io.shutdownCh: + return errors.New("qwp: I/O goroutine shut down") + case <-ctx.Done(): + return ctx.Err() + } +} + +// setIoErr latches err as the connection's terminal ioErr — first +// writer wins. Called by the dispatcher (always via +// poisonAndEmitError) on any transport-class fault so subsequent +// submitQuery calls fail immediately rather than running a fresh +// query against a dead conn or desynced decoder. +func (io *qwpEgressIO) setIoErr(err error) { + io.ioErrMu.Lock() + defer io.ioErrMu.Unlock() + if io.ioErr == nil { + io.ioErr = err + } +} + +// loadIoErr returns the latched terminal error, or nil if none. +func (io *qwpEgressIO) loadIoErr() error { + io.ioErrMu.Lock() + defer io.ioErrMu.Unlock() + return io.ioErr +} + +// takeEvent pops the next event. Blocks until one arrives or ctx is +// cancelled. Returns a terminal error once the dispatcher has exited +// and its events channel is both drained and closed — so a consumer +// with a long-lived ctx always wakes after shutdown without having to +// rely on a best-effort sentinel. 
+func (io *qwpEgressIO) takeEvent(ctx context.Context) (qwpEvent, error) { + select { + case ev, ok := <-io.events: + if !ok { + return qwpEvent{}, errors.New("qwp: I/O goroutine terminated") + } + return ev, nil + case <-ctx.Done(): + return qwpEvent{}, ctx.Err() + } +} + +// requestCancel marks a CANCEL frame as pending for requestId. Safe +// to call from any goroutine; coalesces with prior pending cancels +// (the newer id wins). Wakes the dispatcher so the cancel reaches +// the wire without waiting for the next server frame. +func (io *qwpEgressIO) requestCancel(requestId int64) { + io.cancelRequestId.Store(requestId) + io.notify() +} + +// releaseBuffer returns a batch buffer to the free pool after the user +// handler is done with it. Must be called exactly once per KIND_BATCH +// event. Non-blocking. +func (io *qwpEgressIO) releaseBuffer(buf *qwpBatchBuffer) { + // Recycle the raw frame buffer this batch aliased (raw path only; + // nil on zstd / error paths). Safe here: the io.events send/recv + // that delivered buf, and the io.buffers handoff that precedes the + // next decode into it, serialize all access to buf.frameBuf, so + // this never races the dispatcher. Done before the closed check so + // the buffer is reclaimed even on a late release after shutdown. + if fb := buf.frameBuf; fb != nil { + buf.frameBuf = nil + io.readBufPool.Put(fb) + } + if io.closed.Load() { + // I/O goroutine is gone; the buffer's backing []byte will be + // reclaimed by Go's GC once the user drops their reference. + return + } + // Queue the bytes for credit replenish before returning the buffer + // so the next dispatcher loop iteration's drainPendingCredit sees + // the latest counter. When creditEnabled is false, the dispatcher + // discards the counter; when true, it sends a CREDIT frame for + // the accumulated bytes. + // + // Only credit when this buffer still belongs to the query the + // dispatcher is serving. 
A buffer released after its query ended — + // and a new query already started — would otherwise add its bytes to + // the new query's window. Crediting a finished query is itself moot + // (the dispatcher zeroes pendingCredit when it starts the next one), + // so skipping the stale add is always correct. + if buf.requestId == io.creditRequestId.Load() { + io.pendingCredit.Add(int64(buf.payloadLen)) + } + select { + case io.buffers <- buf: + default: + // Unreachable in practice: io.buffers has capacity + // bufferPoolSize and at most bufferPoolSize buffers exist, so + // a release can never overflow it. Non-blocking defensively — + // if a double-release or similar accounting bug ever fills the + // pool, we'd rather drop the extra buffer than deadlock here. + } + // Wake the dispatcher so the credit replenish (if flow control is + // on) reaches the server without waiting for the next server- + // initiated frame. Harmless when credit is disabled — the + // dispatcher just re-enters its select. + io.notify() +} + +// recycleReadBuf returns a reader-owned pooled frame buffer to the +// pool. nil-safe: error events and payloads not drawn from +// io.readBufPool carry a nil bufRef. Called on the dispatcher +// goroutine for every frame whose decode does not transfer buffer +// ownership to a batch buffer (i.e. everything but a raw RESULT_BATCH). +func (io *qwpEgressIO) recycleReadBuf(bufRef *[]byte) { + if bufRef != nil { + io.readBufPool.Put(bufRef) + } +} + +// shutdown signals both goroutines to exit and blocks until the +// dispatcher returns or ctx expires. Idempotent — repeated calls +// return immediately once the dispatcher has joined. +func (io *qwpEgressIO) shutdown(ctx context.Context) error { + io.shutdownOnce.Do(func() { + close(io.shutdownCh) + // Cancel the shared I/O ctx. coder/websocket tears down the + // underlying TCP when an active Read or Write's ctx is + // cancelled — acceptable here because we are destroying the + // connection anyway. 
Cancelling both directions matters: if + // the dispatcher is parked inside conn.Write on a peer that + // has stopped draining and the reader is not currently inside + // conn.Read, only the Write's own ctx can unstick it. + io.ioCancel() + }) + select { + case <-io.doneCh: + return nil + case <-ctx.Done(): + return ctx.Err() + } +} + +// notify signals the dispatcher that user state (cancel atomic, +// credit atomic, pool) has changed. Non-blocking: if a notify is +// already pending, the dispatcher will still re-check both atomics +// in its next iteration, so we coalesce. +func (io *qwpEgressIO) notify() { + select { + case io.notifyCh <- struct{}{}: + default: + } +} + +// qwpReadFrameInto reads one complete WebSocket message from r into the +// recycled buffer *pb, reusing its capacity across frames (the whole +// point of the pool — it replaces coder/websocket.Conn.Read's per-frame +// io.ReadAll). It doubles *pb when a frame exceeds the current capacity +// and writes the grown slice back so the larger capacity persists for +// the next reuse. coder/websocket requires the message reader be drained +// to io.EOF. +// +// Growth is hard-capped at qwpMaxFrameReadLimit. The shared conn's +// SetReadLimit already cuts a runaway frame off mid-stream, but this +// independent ceiling keeps the function self-protecting regardless of +// connection config: it never allocates past the cap, and a frame that +// fills it without ending is rejected rather than grown further. 
+func qwpReadFrameInto(r io.Reader, pb *[]byte) ([]byte, error) { + b := (*pb)[:0] + for { + if len(b) == cap(b) { + if cap(b) >= qwpMaxFrameReadLimit { + *pb = b + return nil, fmt.Errorf( + "qwp: inbound frame exceeds %d-byte read limit", + qwpMaxFrameReadLimit) + } + nc := cap(b) * 2 + if nc < 64*1024 { + nc = 64 * 1024 + } + if nc > qwpMaxFrameReadLimit { + nc = qwpMaxFrameReadLimit + } + nb := make([]byte, len(b), nc) + copy(nb, b) + b = nb + } + n, err := r.Read(b[len(b):cap(b)]) + b = b[:len(b)+n] + if errors.Is(err, io.EOF) { + *pb = b + return b, nil + } + if err != nil { + *pb = b + return nil, err + } + } +} + +// qwpSameBacking reports whether a and b share a backing array. Used to +// distinguish the zero-copy raw decode path (batch columns alias the +// frame buffer) from the zstd path (they alias the batch's own +// zstdScratch). Robust across buffer reuse, unlike a zstdScratch-length +// probe, since zstdScratch persists on a recycled qwpBatchBuffer. +func qwpSameBacking(a, b []byte) bool { + return len(a) > 0 && len(b) > 0 && &a[0] == &b[0] +} + +// readerRun is the reader goroutine's top-level loop. It does nothing +// but pull binary frames off the WebSocket and hand them to the +// dispatcher via frameCh. Never looks at cancel / credit / user state +// — kept minimal so a blocked Read stays out of the dispatch-side +// fast path. +// +// Exits when either (a) conn.Reader returns an error (server close, +// malformed frame, or shutdown-cancelled readCtx), or (b) the +// dispatcher is shut down. Closes frameCh on the way out so the +// dispatcher's select sees EOF. +func (io *qwpEgressIO) readerRun() { + defer close(io.frameCh) + // Capture the conn once. The transport assigns it before start() + // launches this goroutine and never mutates it again, so reading it + // a single time here keeps this loop off the transport's fields — a + // concurrent close() tearing the connection down cannot race it. 
+ conn := io.transport.conn + for { + msgType, r, err := conn.Reader(io.ioCtx) + if err != nil { + select { + case io.frameCh <- qwpReaderEvent{err: err}: + case <-io.shutdownCh: + } + return + } + pb := io.readBufPool.Get().(*[]byte) + payload, rerr := qwpReadFrameInto(r, pb) + if rerr != nil { + io.readBufPool.Put(pb) + select { + case io.frameCh <- qwpReaderEvent{err: rerr}: + case <-io.shutdownCh: + } + return + } + if msgType != websocket.MessageBinary { + // Tolerate stray text frames (keep-alives from misbehaving + // proxies) — same policy as readAck. + io.readBufPool.Put(pb) + continue + } + select { + case io.frameCh <- qwpReaderEvent{payload: payload, bufRef: pb}: + case <-io.shutdownCh: + io.readBufPool.Put(pb) + return + } + } +} + +// dispatcherRun is the dispatch goroutine's top-level loop. Exiting +// just decrements the shutdown WaitGroup — doneCh is closed by the +// start() wrapper only after the reader also exits, so a shutdown() +// that observes doneCh has joined both goroutines. +func (io *qwpEgressIO) dispatcherRun() { + // Defers run LIFO: close(events) first, then closed.Store(true). + // Either order is safe because a consumer that wakes on the + // closed channel and immediately calls releaseBuffer will + // observe closed=true momentarily — releaseBuffer's fallback + // path (non-blocking send + coalesced notify) is harmless even + // on a drained, dead pool. Keeping close first also keeps the + // reader/dispatcher invariant that events is closed before the + // waitgroup-gated doneCh fires in start(). + defer io.closed.Store(true) + defer close(io.events) + // Release decoder-owned resources (zstd decompression goroutines + // in particular) before the dispatcher itself exits. Runs LIFO + // relative to the defers above, which is the order we want: the + // last consumer that may wake on the closed events channel has + // already seen its terminal signal by the time decoder.close() + // tears down zstd state. 
+ defer io.decoder.close() + + for { + var req qwpRequest + select { + case <-io.shutdownCh: + return + case req = <-io.requests: + } + + io.currentRequestId = req.requestId + // Publish the credit-attribution id before any buffer for this + // query can be released, so a release() on the user goroutine + // compares against the right query. + io.creditRequestId.Store(req.requestId) + io.creditEnabled = req.initialCredit > 0 + io.currentQueryDone = false + // Drop any schema held for a prior query. The egress schema + // rides only the first batch (batch_seq == 0) of each query + // response; resetting here guarantees this query parses its own + // schema before any continuation batch reuses it. + io.decoder.resetQuerySchema() + // Clear a lingering prior-query cancel without clobbering a + // user-thread Cancel(req.requestId) that raced the dispatcher + // picking up this request off the single-slot queue. The user + // can call QwpQuery.Cancel() as soon as Query() returns — + // submitQuery is non-blocking, so the user's Cancel can reach + // the atomic before the dispatcher even starts processing. + // CAS loop: only clear if the stored id is a prior-query id + // (not -1, not req.requestId). Any user Store that races the + // CAS either commits first (we see req.requestId and bail) or + // overwrites our -1 afterwards (drainPendingCancel picks it + // up on the next loop iteration either way). + for { + cur := io.cancelRequestId.Load() + if cur == -1 || cur == req.requestId { + break + } + if io.cancelRequestId.CompareAndSwap(cur, -1) { + break + } + } + io.pendingCredit.Store(0) + + if err := io.sendQueryRequest(req); err != nil { + io.poisonAndEmitError(fmt.Sprintf("qwp: send QUERY_REQUEST: %v", err)) + continue + } + + io.receiveLoop() + } +} + +// receiveLoop dispatches frames until currentQueryDone or shutdown. 
+// Drains the cancel + credit latches at every iteration so user- +// initiated signals reach the server at loop boundaries; notifyCh +// wakes the select when those atomics change while we are waiting +// for a server frame. +func (io *qwpEgressIO) receiveLoop() { + for !io.currentQueryDone { + select { + case <-io.shutdownCh: + return + default: + } + + if !io.drainPendingCancel() { + return + } + if !io.drainPendingCredit() { + return + } + + select { + case <-io.shutdownCh: + return + case <-io.notifyCh: + // State change — loop back to drain. This is how a + // user-initiated cancel or release reaches the wire + // without waiting for a server frame. + case ev, ok := <-io.frameCh: + if !ok { + // Reader goroutine exited without emitting an error + // — unusual, but treat as a clean close of an + // in-flight query. + io.poisonAndEmitError("qwp: reader closed without error") + io.currentQueryDone = true + return + } + if ev.err != nil { + io.poisonAndEmitError(fmt.Sprintf("qwp: server closed connection: %v", ev.err)) + io.currentQueryDone = true + return + } + io.dispatchFrame(ev) + } + } +} + +// dispatchFrame routes a received frame to the matching decoder method +// and emits the resulting event. Sets currentQueryDone on terminal +// frames (End / ExecDone / Error) so the receive loop exits. +func (io *qwpEgressIO) dispatchFrame(ev qwpReaderEvent) { + payload := ev.payload + kind, err := qwpPeekMsgKind(payload) + if err != nil { + // Header parse failure — we have no trustworthy framing, so + // poison the connection before emitting. + io.poisonAndEmitError(fmt.Sprintf("qwp: %v", err)) + io.currentQueryDone = true + io.recycleReadBuf(ev.bufRef) + return + } + if kind == qwpMsgKindResultBatch { + // RESULT_BATCH columns may alias the pooled frame buffer on the + // raw path, so ownership of ev.bufRef passes to handleResultBatch, + // which hands it to the batch buffer and recycles it in + // releaseBuffer once the consumer is done. 
+ io.handleResultBatch(payload, ev.bufRef) + return + } + // Every other frame kind is parsed synchronously below and copies + // out whatever it retains (decodeQueryError copies its message; the + // rest return scalars), so the pooled frame buffer is dead the + // moment the handler returns — recycle it rather than dropping it to + // GC, which on query-heavy workloads would undo the read-buffer pool. + defer io.recycleReadBuf(ev.bufRef) + switch kind { + case qwpMsgKindResultEnd: + io.handleResultEnd(payload) + case qwpMsgKindQueryError: + io.handleQueryError(payload) + case qwpMsgKindExecDone: + io.handleExecDone(payload) + case qwpMsgKindCacheReset: + io.handleCacheReset(payload) + default: + // Unknown msg_kind means we are talking to a server whose + // protocol we do not understand — treat as terminal so we do + // not parade a desynced stream to the next query. + io.poisonAndEmitError(fmt.Sprintf("qwp: unknown msg_kind 0x%02X", byte(kind))) + io.currentQueryDone = true + } +} + +// handleResultBatch takes a buffer from the pool, decodes in place, +// and emits a batch event. Blocks on the pool when full. The select +// also watches shutdown + notify so a user-initiated cancel still +// reaches the wire while we wait for the handler to free up a buffer. +func (io *qwpEgressIO) handleResultBatch(payload []byte, bufRef *[]byte) { + var buf *qwpBatchBuffer + for buf == nil { + select { + case <-io.shutdownCh: + io.currentQueryDone = true + return + case buf = <-io.buffers: + case <-io.notifyCh: + // Handler moved the cancel / credit state forward — + // flush whatever is pending before continuing the wait. + if !io.drainPendingCancel() { + return + } + if !io.drainPendingCredit() { + return + } + } + } + + if err := io.decoder.decode(payload, &buf.batch); err != nil { + // Decoder failed mid-frame: dict/registry state may be out + // of sync with the server. 
Return the buffer, poison the + // connection so the next submitQuery fails immediately + // (self-correction via the delta-dict sync check is + // probabilistic — a mis-advanced reader can leave the dict + // *accidentally* in sync at the offset level while values + // are wrong, producing silently corrupt rows), surface the + // error, and stop the query. + io.buffers <- buf + io.poisonAndEmitError(fmt.Sprintf("qwp: decode: %v", err)) + io.currentQueryDone = true + return + } + buf.payloadLen = len(payload) + buf.requestId = io.currentRequestId + if bufRef != nil && qwpSameBacking(payload, buf.batch.payload) { + // Raw (non-zstd) path: decode() left the batch's column slices + // aliasing our pooled frame buffer, so it must stay intact + // until the user is done. Hand ownership to the batch buffer; + // releaseBuffer returns it to readBufPool. + buf.frameBuf = bufRef + } else { + // zstd path: columns alias buf.batch.zstdScratch, so the frame + // buffer is dead the moment decode() returns — recycle now. + // (Also the defensive no-ref case: nothing to recycle.) + if bufRef != nil { + io.readBufPool.Put(bufRef) + } + buf.frameBuf = nil + } + + select { + case <-io.shutdownCh: + // Buffer is orphaned to GC here rather than returned to the + // pool: shutdown is racing the events send, the dispatcher is + // about to exit, and nobody will drain io.buffers anyway. The + // always-balanced bookkeeping the pool comment describes + // applies to the steady state, not to this terminal path. + io.currentQueryDone = true + return + case io.events <- qwpEvent{ + kind: qwpEventKindBatch, + requestId: io.currentRequestId, + batch: buf, + }: + } +} + +// handleResultEnd parses RESULT_END, emits an End event, and marks the +// current query done. Parse failure is emitted as a synthesized error. 
+func (io *qwpEgressIO) handleResultEnd(payload []byte) { + reqId, total, err := io.decoder.decodeResultEnd(payload) + if err != nil { + io.poisonAndEmitError(fmt.Sprintf("qwp: %v", err)) + } else { + io.emit(qwpEvent{ + kind: qwpEventKindEnd, + requestId: reqId, + totalRows: total, + }) + } + io.currentQueryDone = true +} + +// handleQueryError parses QUERY_ERROR, emits an Error event with the +// server's status + message, and marks the query done. +func (io *qwpEgressIO) handleQueryError(payload []byte) { + qe, err := io.decoder.decodeQueryError(payload) + if err != nil { + io.poisonAndEmitError(fmt.Sprintf("qwp: %v", err)) + } else { + io.emit(qwpEvent{ + kind: qwpEventKindError, + requestId: qe.RequestId, + errStatus: qe.Status, + errMessage: qe.Message, + }) + } + io.currentQueryDone = true +} + +// handleCacheReset parses CACHE_RESET and applies the requested reset +// to the decoder's connection-scoped caches. No user-visible event is +// emitted and the current query is NOT marked done — the server emits +// CACHE_RESET between queries (after the prior query's terminal +// frame, before the next query's RESULT_BATCH), so handling it is +// invisible from the user's perspective. A truncated or otherwise +// malformed frame is terminal: the decoder's per-connection state +// cannot be trusted, so we poison the connection. +func (io *qwpEgressIO) handleCacheReset(payload []byte) { + mask, err := io.decoder.decodeCacheReset(payload) + if err != nil { + io.poisonAndEmitError(fmt.Sprintf("qwp: %v", err)) + io.currentQueryDone = true + return + } + io.decoder.applyCacheReset(mask) +} + +// handleExecDone parses EXEC_DONE, emits an ExecDone event, and marks +// the query done. 
+func (io *qwpEgressIO) handleExecDone(payload []byte) { + reqId, result, err := io.decoder.decodeExecDone(payload) + if err != nil { + io.poisonAndEmitError(fmt.Sprintf("qwp: %v", err)) + } else { + io.emit(qwpEvent{ + kind: qwpEventKindExecDone, + requestId: reqId, + execResult: result, + }) + } + io.currentQueryDone = true +} + +// drainPendingCancel flushes a pending CANCEL to the wire, if any. +// Returns false on send failure (emits the error and marks query +// done so the caller can exit the recv loop). +func (io *qwpEgressIO) drainPendingCancel() bool { + id := io.cancelRequestId.Swap(-1) + if id < 0 { + return true + } + if err := io.sendCancel(id); err != nil { + io.poisonAndEmitError(fmt.Sprintf("qwp: send CANCEL: %v", err)) + io.currentQueryDone = true + return false + } + return true +} + +// drainPendingCredit flushes queued credit bytes to the server, if any +// and flow control is enabled. When creditEnabled is false, the counter +// is simply reset — user code may still call release() on an +// unbounded-credit query; the accumulation is harmless but we don't +// want a stale non-zero count to leak into the next (possibly +// flow-controlled) query. +func (io *qwpEgressIO) drainPendingCredit() bool { + if !io.creditEnabled { + io.pendingCredit.Store(0) + return true + } + bytes := io.pendingCredit.Swap(0) + if bytes <= 0 { + return true + } + if err := io.sendCredit(io.currentRequestId, bytes); err != nil { + io.poisonAndEmitError(fmt.Sprintf("qwp: send CREDIT: %v", err)) + io.currentQueryDone = true + return false + } + return true +} + +// sendQueryRequest builds and sends the QUERY_REQUEST frame. +// +// Wire layout: msg_kind(0x10) + request_id(int64 LE) + sql_len(varint) +// + sql(utf8) + initial_credit(varint) + bind_count(varint) + +// bind_payload(bindPayloadLen bytes, pre-encoded by QwpBinds). 
+func (io *qwpEgressIO) sendQueryRequest(req qwpRequest) error { + io.sendBuf.reset() + io.sendBuf.putByte(byte(qwpMsgKindQueryRequest)) + io.sendBuf.putInt64LE(req.requestId) + io.sendBuf.putString(req.sql) + io.sendBuf.putVarint(uint64(req.initialCredit)) + io.sendBuf.putVarint(uint64(req.bindCount)) + if req.bindCount > 0 && len(req.bindPayload) > 0 { + io.sendBuf.putBytes(req.bindPayload) + } + return io.transport.sendMessage(io.ioCtx, io.sendBuf.bytes()) +} + +// sendCancel builds and sends a CANCEL frame. Wire layout: +// msg_kind(0x14) + request_id(int64 LE). +func (io *qwpEgressIO) sendCancel(requestId int64) error { + io.sendBuf.reset() + io.sendBuf.putByte(byte(qwpMsgKindCancel)) + io.sendBuf.putInt64LE(requestId) + return io.transport.sendMessage(io.ioCtx, io.sendBuf.bytes()) +} + +// sendCredit builds and sends a CREDIT frame. Wire layout: +// msg_kind(0x15) + request_id(int64 LE) + additional_bytes(varint). +func (io *qwpEgressIO) sendCredit(requestId, additionalBytes int64) error { + io.sendBuf.reset() + io.sendBuf.putByte(byte(qwpMsgKindCredit)) + io.sendBuf.putInt64LE(requestId) + io.sendBuf.putVarint(uint64(additionalBytes)) + return io.transport.sendMessage(io.ioCtx, io.sendBuf.bytes()) +} + +// emit pushes an event to the consumer, aborting on shutdown to avoid +// stranding the I/O goroutine on an unresponsive consumer. The events +// channel's bufferPoolSize+2 capacity guarantees non-batch events always +// fit in the steady state, so the select hits the fast path. +// +// If shutdown wins the race, the event is silently dropped. This is +// acceptable because shutdown is always user-initiated (Close / +// QwpQuery.Close): any QUERY_ERROR or synthesized error that arrives in +// the same instant is for a query the caller is no longer waiting on, +// and after Close returns takeEvent reports "I/O goroutine terminated" +// rather than the lost event. 
Connection-state poisoning (via +// poisonAndEmitError → setIoErr) is independent of the emit and is +// preserved across the drop, so a follow-up submitQuery on the same +// client still surfaces the underlying failure. +func (io *qwpEgressIO) emit(ev qwpEvent) { + select { + case io.events <- ev: + case <-io.shutdownCh: + } +} + +// poisonAndEmitError latches msg as the connection's terminal ioErr +// AND emits it as the current query's TransportError event. The single +// entry point for every transport-class fault on the dispatcher path: +// reader-error / server-close, send failures (QUERY_REQUEST / CANCEL / +// CREDIT), decoder or framing failures that desync the per-connection +// state (symbol dict, current-query schema, zstd stream), and unknown +// msg_kinds. After any of those, the connection is unusable — the +// decoder may be silently out of sync (a mis-advanced reader can leave +// the dict accidentally aligned at the offset level while values are +// wrong, producing silently corrupt rows), or the conn itself is dead. +// The latched ioErr causes every subsequent submitQuery to return +// immediately with the original cause, matching the documented +// "I/O goroutine has poisoned ioErr" contract on +// qwpEventKindTransportError and Java's notifyTerminalFailure pattern. +// Does NOT flip currentQueryDone — callers that also need to terminate +// the current query set it where it belongs. +// +// The transport-error kind makes the failover orchestrator route this +// to the reconnect-and-replay path; without the kind split a server +// QUERY_ERROR would be indistinguishable from a decoder desync and the +// orchestrator would either retry SQL errors (wrong) or never retry +// transport faults (also wrong). 
+func (io *qwpEgressIO) poisonAndEmitError(msg string) { + io.setIoErr(errors.New(msg)) + io.emit(qwpEvent{ + kind: qwpEventKindTransportError, + requestId: io.currentRequestId, + errStatus: 0, + errMessage: msg, + }) +} + diff --git a/qwp_query_io_test.go b/qwp_query_io_test.go new file mode 100644 index 00000000..868c7dd9 --- /dev/null +++ b/qwp_query_io_test.go @@ -0,0 +1,1857 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "bufio" + "context" + "crypto/sha1" + "encoding/base64" + "encoding/binary" + "fmt" + "io" + "net" + "net/http" + "net/http/httptest" + "strings" + "sync" + "testing" + "time" + + "github.com/coder/websocket" +) + +// --- Mock server harness --- + +// qwpMockEgressConn is the test-side view of a client's WebSocket. +// Tests drive it imperatively: read a frame (typically QUERY_REQUEST / +// CANCEL / CREDIT), send a scripted response (RESULT_BATCH, +// RESULT_END, QUERY_ERROR, EXEC_DONE), close cleanly. 
+// +// version, when non-zero, is the QWP wire-protocol version the mock +// claims to have negotiated in X-QWP-Version. sendBinary rewrites the +// header version byte of every frame to this value before writing — +// the shared frame builders (writeQwpFrame, buildOneRowInt64Batch) +// stamp qwpVersion unconditionally, but the strict-equality check in +// qwpQueryDecoder.parseFrameHeader requires server frames to match +// the negotiated version. Tests leave version=0 to skip the rewrite +// (frames are already stamped qwpVersion); cluster mocks that stamp +// frames explicitly set it to qwpVersion. +type qwpMockEgressConn struct { + t *testing.T + conn *websocket.Conn + version byte +} + +// readBinary reads one binary frame from the client. Skips non-binary +// frames; fails the test on read error. +func (m *qwpMockEgressConn) readBinary(ctx context.Context) []byte { + m.t.Helper() + for { + typ, data, err := m.conn.Read(ctx) + if err != nil { + m.t.Fatalf("mock: read: %v", err) + } + if typ == websocket.MessageBinary { + return data + } + } +} + +// sendBinary sends one binary frame to the client. When m.version is +// non-zero, the frame's QWP header version byte (offset 4) is rewritten +// to that value first — see the type comment for the rationale. +func (m *qwpMockEgressConn) sendBinary(ctx context.Context, data []byte) { + m.t.Helper() + if m.version != 0 && len(data) > 4 { + data[4] = m.version + } + if err := m.conn.Write(ctx, websocket.MessageBinary, data); err != nil { + m.t.Fatalf("mock: write: %v", err) + } +} + +// newQwpMockEgressServer stands up an httptest WebSocket server that +// hands control to `handler` once upgraded. handler is expected to +// perform the test-side request/response choreography, then return. +// The server stamps X-QWP-Version=1 so transport.connect accepts the +// upgrade. 
+func newQwpMockEgressServer(t *testing.T, handler func(*qwpMockEgressConn)) *httptest.Server { + t.Helper() + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set(qwpHeaderVersion, "1") + conn, err := websocket.Accept(w, r, nil) + if err != nil { + t.Logf("mock: accept: %v", err) + return + } + defer conn.CloseNow() + // The real server emits SERVER_INFO as the first post-upgrade + // frame and the egress client reads it during connect (both + // connectEgress and NewQwpQueryClient set serverInfoTimeout > 0). + // Mirror that here so connect() does not block waiting for it. + info := buildServerInfoFrame(qwpVersion, 0, qwpRolePrimary, 1, 0, + 1_700_000_000_000_000_000, "test-cluster", "mock-node") + if err := conn.Write(r.Context(), websocket.MessageBinary, info); err != nil { + t.Logf("mock: SERVER_INFO write: %v", err) + return + } + handler(&qwpMockEgressConn{t: t, conn: conn}) + })) +} + +// connectEgress dials the mock server with qwpReadPath. It sets a +// SERVER_INFO read timeout so the transport consumes the frame the mock +// emits post-upgrade, matching the production egress connect path. +func connectEgress(t *testing.T, url string) *qwpTransport { + t.Helper() + var tr qwpTransport + wsURL := "ws" + strings.TrimPrefix(url, "http") + if err := tr.connect(context.Background(), wsURL, qwpTransportOpts{ + endpointPath: qwpReadPath, + serverInfoTimeout: 2 * time.Second, + }); err != nil { + t.Fatalf("connect: %v", err) + } + return &tr +} + +// --- Frame builders (reuse decoder_test.go helpers where possible) --- + +// buildOneRowInt64Batch produces a RESULT_BATCH frame with a single +// column (wireType=LONG), one row, value=val. Uses the real encoder +// so the decoder exercises the positive path. 
+func buildOneRowInt64Batch(t *testing.T, requestId int64, batchSeq uint64, colName string, val int64) []byte { + t.Helper() + tb := newQwpTableBuffer("t") + col, err := tb.getOrCreateColumn(colName, qwpTypeLong, false) + if err != nil { + t.Fatalf("getOrCreateColumn: %v", err) + } + col.addLong(val) + tb.commitRow() + var enc qwpEncoder + return wrapAsResultBatch(enc.encodeTable(tb), requestId, batchSeq) +} + +// buildOneRowVarcharBatch produces a RESULT_BATCH frame with a single +// column (wireType=VARCHAR), one row, value=val. Used by the aliasing +// test, which needs a column type whose accessor returns bytes that +// alias directly into the per-frame payload. +func buildOneRowVarcharBatch(t *testing.T, requestId int64, batchSeq uint64, colName string, val string) []byte { + t.Helper() + tb := newQwpTableBuffer("t") + col, err := tb.getOrCreateColumn(colName, qwpTypeVarchar, false) + if err != nil { + t.Fatalf("getOrCreateColumn: %v", err) + } + col.addString(val) + tb.commitRow() + var enc qwpEncoder + return wrapAsResultBatch(enc.encodeTable(tb), requestId, batchSeq) +} + +// --- Parsers for frames sent by the client to the mock server --- + +// parseQueryRequest decodes a client-sent QUERY_REQUEST frame. Egress +// control frames (QUERY_REQUEST / CANCEL / CREDIT) sent by the client +// carry no 12-byte QWP header — they begin with the msg_kind byte +// directly. Returns (requestId, sql, initialCredit). 
+func parseQueryRequest(t *testing.T, frame []byte) (int64, string, int64) { + t.Helper() + if len(frame) < 1+8 { + t.Fatalf("QUERY_REQUEST frame too short: %d", len(frame)) + } + if kind := frame[0]; kind != byte(qwpMsgKindQueryRequest) { + t.Fatalf("expected msg_kind 0x10, got 0x%02X", kind) + } + p := 1 + requestId := int64(binary.LittleEndian.Uint64(frame[p:])) + p += 8 + sqlLen, n, err := qwpReadVarint(frame[p:]) + if err != nil { + t.Fatalf("bad sql_len varint: %v", err) + } + p += n + sql := string(frame[p : p+int(sqlLen)]) + p += int(sqlLen) + credit, n, err := qwpReadVarint(frame[p:]) + if err != nil { + t.Fatalf("bad credit varint: %v", err) + } + p += n + if _, _, err := qwpReadVarint(frame[p:]); err != nil { + t.Fatalf("bad bind_count varint: %v", err) + } + return requestId, sql, int64(credit) +} + +// --- Tests --- + +// TestQwpEgressIOHappyPathSelect drives a SELECT-style sequence: the +// mock sends RESULT_BATCH + RESULT_BATCH + RESULT_END; the I/O loop +// decodes and surfaces Batch, Batch, End in order. 
+func TestQwpEgressIOHappyPathSelect(t *testing.T) { + const wantSQL = "SELECT * FROM trades" + const wantReqID = int64(42) + + srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + req := m.readBinary(ctx) + reqID, sql, credit := parseQueryRequest(t, req) + if reqID != wantReqID { + t.Errorf("server saw requestId=%d, want %d", reqID, wantReqID) + } + if sql != wantSQL { + t.Errorf("server saw sql=%q, want %q", sql, wantSQL) + } + if credit != 0 { + t.Errorf("server saw credit=%d, want 0", credit) + } + + m.sendBinary(ctx, buildOneRowInt64Batch(t, wantReqID, 0, "v", 100)) + m.sendBinary(ctx, buildOneRowInt64Batch(t, wantReqID, 1, "v", 200)) + m.sendBinary(ctx, writeQwpFrame(0, buildResultEndBody(wantReqID, 1, 2))) + }) + defer srv.Close() + + tr := connectEgress(t, srv.URL) + defer tr.close() + + io := newQwpEgressIO(tr, 4) + io.start() + defer shutdownIO(t, io) + + submitCtx, submitCancel := context.WithTimeout(context.Background(), 2*time.Second) + defer submitCancel() + if err := io.submitQuery(submitCtx, qwpRequest{sql: wantSQL, requestId: wantReqID}); err != nil { + t.Fatalf("submitQuery: %v", err) + } + + values := drainBatchesToEnd(t, io, 2 /* expect 2 batches */) + if len(values) != 2 || values[0] != 100 || values[1] != 200 { + t.Fatalf("batch values = %v, want [100 200]", values) + } +} + +// TestQwpEgressIOExecDone verifies the non-SELECT path: the server +// replies with EXEC_DONE and the I/O loop emits an ExecDone event. 
+func TestQwpEgressIOExecDone(t *testing.T) { + const wantReqID = int64(7) + + srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + m.readBinary(ctx) + m.sendBinary(ctx, writeQwpFrame(0, buildExecDoneBody(wantReqID, 0x04, 99))) + }) + defer srv.Close() + + tr := connectEgress(t, srv.URL) + defer tr.close() + + io := newQwpEgressIO(tr, 2) + io.start() + defer shutdownIO(t, io) + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + if err := io.submitQuery(ctx, qwpRequest{sql: "INSERT INTO t VALUES (1)", requestId: wantReqID}); err != nil { + t.Fatalf("submitQuery: %v", err) + } + + ev := takeEventOrFail(t, io, 2*time.Second) + if ev.kind != qwpEventKindExecDone { + t.Fatalf("event kind = %v, want ExecDone (errMsg=%q)", ev.kind, ev.errMessage) + } + if ev.execResult.OpType != 0x04 { + t.Errorf("OpType = 0x%02X, want 0x04", ev.execResult.OpType) + } + if ev.execResult.RowsAffected != 99 { + t.Errorf("RowsAffected = %d, want 99", ev.execResult.RowsAffected) + } + if ev.requestId != wantReqID { + t.Errorf("requestId = %d, want %d", ev.requestId, wantReqID) + } +} + +// TestQwpEgressIOQueryError exercises the server-side-error path. 
+func TestQwpEgressIOQueryError(t *testing.T) { + srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + m.readBinary(ctx) + m.sendBinary(ctx, writeQwpFrame(0, buildQueryErrorBody(1, byte(QwpStatusParseError), "bad sql", -1))) + }) + defer srv.Close() + + tr := connectEgress(t, srv.URL) + defer tr.close() + + io := newQwpEgressIO(tr, 2) + io.start() + defer shutdownIO(t, io) + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + if err := io.submitQuery(ctx, qwpRequest{sql: "BAD", requestId: 1}); err != nil { + t.Fatalf("submitQuery: %v", err) + } + + ev := takeEventOrFail(t, io, 2*time.Second) + if ev.kind != qwpEventKindError { + t.Fatalf("event kind = %v, want Error", ev.kind) + } + if ev.errStatus != QwpStatusParseError { + t.Errorf("errStatus = 0x%02X, want 0x%02X", byte(ev.errStatus), byte(QwpStatusParseError)) + } + if ev.errMessage != "bad sql" { + t.Errorf("errMessage = %q, want %q", ev.errMessage, "bad sql") + } + if ev.requestId != 1 { + t.Errorf("requestId = %d, want 1", ev.requestId) + } +} + +// TestQwpEgressIOCancel checks that requestCancel from a second +// goroutine produces a CANCEL frame on the wire before the query +// terminates. The mock pretends to be a streaming server: it sends one +// batch, waits for the client's CANCEL, then ends with QUERY_ERROR +// CANCELLED so the I/O loop exits cleanly. +func TestQwpEgressIOCancel(t *testing.T) { + const wantReqID = int64(5) + cancelSeen := make(chan int64, 1) + + srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + m.readBinary(ctx) + m.sendBinary(ctx, buildOneRowInt64Batch(t, wantReqID, 0, "v", 7)) + + // Wait for CANCEL. Client control frames have no QWP header — + // they are just msg_kind + body. 
+ frame := m.readBinary(ctx) + if kind := frame[0]; kind != byte(qwpMsgKindCancel) { + t.Errorf("server expected CANCEL, got msg_kind=0x%02X", kind) + } + cid := int64(binary.LittleEndian.Uint64(frame[1:])) + cancelSeen <- cid + + m.sendBinary(ctx, writeQwpFrame(0, buildQueryErrorBody(wantReqID, byte(qwpStatusCancelled), "cancelled", -1))) + }) + defer srv.Close() + + tr := connectEgress(t, srv.URL) + defer tr.close() + + io := newQwpEgressIO(tr, 2) + io.start() + defer shutdownIO(t, io) + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + if err := io.submitQuery(ctx, qwpRequest{sql: "SELECT 1", requestId: wantReqID}); err != nil { + t.Fatalf("submitQuery: %v", err) + } + + // Receive the first batch, release it. + ev := takeEventOrFail(t, io, 2*time.Second) + if ev.kind != qwpEventKindBatch { + t.Fatalf("event kind = %v, want Batch", ev.kind) + } + ev.batch.release() + + // Cancel from a separate goroutine; the I/O loop should flush + // CANCEL on the next loop iteration. + go io.requestCancel(wantReqID) + + select { + case gotID := <-cancelSeen: + if gotID != wantReqID { + t.Errorf("server saw cancel id=%d, want %d", gotID, wantReqID) + } + case <-time.After(2 * time.Second): + t.Fatal("server never saw CANCEL frame") + } + + // Server follows up with QUERY_ERROR/CANCELLED to close out. + ev = takeEventOrFail(t, io, 2*time.Second) + if ev.kind != qwpEventKindError { + t.Fatalf("event kind = %v, want Error", ev.kind) + } + if ev.errStatus != qwpStatusCancelled { + t.Errorf("errStatus = 0x%02X, want 0x%02X (CANCELLED)", byte(ev.errStatus), byte(qwpStatusCancelled)) + } +} + +// TestQwpEgressIOShutdownUnblocksRead forces shutdown while the I/O +// goroutine is parked on a Read with no traffic. The goroutine must +// exit within a short grace period — demonstrating the ctx-cancel +// kick wakes the Read. 
+func TestQwpEgressIOShutdownUnblocksRead(t *testing.T) { + ready := make(chan struct{}) + srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + m.readBinary(ctx) + close(ready) + // Sleep — don't reply. Client will shutdown. + time.Sleep(500 * time.Millisecond) + }) + defer srv.Close() + + tr := connectEgress(t, srv.URL) + defer tr.close() + + io := newQwpEgressIO(tr, 2) + io.start() + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + if err := io.submitQuery(ctx, qwpRequest{sql: "x", requestId: 1}); err != nil { + t.Fatalf("submitQuery: %v", err) + } + <-ready // I/O loop is now inside readBinaryFrame. + + // Shutdown must unblock the Read promptly. + shutCtx, shutCancel := context.WithTimeout(context.Background(), 1*time.Second) + defer shutCancel() + start := time.Now() + if err := io.shutdown(shutCtx); err != nil { + t.Fatalf("shutdown: %v", err) + } + if dt := time.Since(start); dt > 500*time.Millisecond { + t.Errorf("shutdown took %v (expected <500ms)", dt) + } +} + +// TestQwpEgressIOPoolBackpressure sizes the buffer pool to 1 and has +// the server emit two batches back-to-back. The I/O loop must not +// emit the second batch event until the user releases the first — +// the classic pool-exhaustion case. 
+func TestQwpEgressIOPoolBackpressure(t *testing.T) { + const wantReqID = int64(3) + + srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + m.readBinary(ctx) + m.sendBinary(ctx, buildOneRowInt64Batch(t, wantReqID, 0, "v", 10)) + m.sendBinary(ctx, buildOneRowInt64Batch(t, wantReqID, 1, "v", 20)) + m.sendBinary(ctx, writeQwpFrame(0, buildResultEndBody(wantReqID, 1, 2))) + }) + defer srv.Close() + + tr := connectEgress(t, srv.URL) + defer tr.close() + + io := newQwpEgressIO(tr, 1) // pool of size 1 + io.start() + defer shutdownIO(t, io) + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + if err := io.submitQuery(ctx, qwpRequest{sql: "x", requestId: wantReqID}); err != nil { + t.Fatalf("submitQuery: %v", err) + } + + // First batch arrives promptly. + ev1 := takeEventOrFail(t, io, 2*time.Second) + if ev1.kind != qwpEventKindBatch { + t.Fatalf("ev1 kind = %v", ev1.kind) + } + + // Second batch must NOT arrive until we release the first — the + // I/O goroutine is parked in handleResultBatch waiting on the + // pool. A short poll of takeEvent confirms nothing pending. + shortCtx, shortCancel := context.WithTimeout(context.Background(), 200*time.Millisecond) + if _, err := io.takeEvent(shortCtx); err == nil { + shortCancel() + t.Fatal("event arrived while pool was exhausted") + } + shortCancel() + + // Release and then the second batch + end should follow. 
+ val1 := ev1.batch.batch.Int64(0, 0) + ev1.batch.release() + + ev2 := takeEventOrFail(t, io, 2*time.Second) + if ev2.kind != qwpEventKindBatch { + t.Fatalf("ev2 kind = %v", ev2.kind) + } + val2 := ev2.batch.batch.Int64(0, 0) + ev2.batch.release() + + ev3 := takeEventOrFail(t, io, 2*time.Second) + if ev3.kind != qwpEventKindEnd { + t.Fatalf("ev3 kind = %v, errMsg=%q", ev3.kind, ev3.errMessage) + } + if val1 != 10 || val2 != 20 { + t.Fatalf("batch values = %d, %d; want 10, 20", val1, val2) + } +} + +// TestQwpEgressIOInPlaceDecodeAliasing pins the cross-batch isolation +// invariant: while the user holds batch N, the dispatcher decoding +// batch N+1 into a DIFFERENT pool buffer must not corrupt batch N's +// view. VARCHAR makes the property visible — its accessor returns a +// byte slice aliased into the frame's payload, so any cross-buffer +// clobber would surface as wrong bytes on a re-read. +// +// In the Go architecture each qwpBatchBuffer holds its own +// QwpColumnBatch with per-batch layouts, and coder/websocket hands +// the dispatcher a fresh []byte per binary frame; holding a buffer +// pins that frame's payload via the layout's aliased slices. This +// test is the negative case the existing CopyAll-survives-pool-reuse +// tests don't cover: there we explicitly snapshot before reuse, here +// we read the live aliased view across reuse. 
+func TestQwpEgressIOInPlaceDecodeAliasing(t *testing.T) { + const wantReqID = int64(7) + + srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + m.readBinary(ctx) + m.sendBinary(ctx, buildOneRowVarcharBatch(t, wantReqID, 0, "v", "ALPHA")) + m.sendBinary(ctx, buildOneRowVarcharBatch(t, wantReqID, 1, "v", "BRAVO")) + m.sendBinary(ctx, writeQwpFrame(0, buildResultEndBody(wantReqID, 1, 2))) + }) + defer srv.Close() + + tr := connectEgress(t, srv.URL) + defer tr.close() + + // Pool size 2: the dispatcher can decode batch 1 into a + // different buffer while batch 0 is still held by the user. + io := newQwpEgressIO(tr, 2) + io.start() + defer shutdownIO(t, io) + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + if err := io.submitQuery(ctx, qwpRequest{sql: "x", requestId: wantReqID}); err != nil { + t.Fatalf("submitQuery: %v", err) + } + + ev0 := takeEventOrFail(t, io, 2*time.Second) + if ev0.kind != qwpEventKindBatch { + t.Fatalf("ev0 kind = %v, errMsg=%q", ev0.kind, ev0.errMessage) + } + if got := ev0.batch.batch.String(0, 0); got != "ALPHA" { + t.Fatalf("batch0 first read = %q, want ALPHA", got) + } + // Capture the aliased byte view too — the bytes themselves must + // stay stable, not just an accessor that happens to recopy them. + str0Before := ev0.batch.batch.Str(0, 0) + if string(str0Before) != "ALPHA" { + t.Fatalf("Str(0,0) = %q, want ALPHA", str0Before) + } + + // Pull batch 1 WITHOUT releasing batch 0. The dispatcher must + // take the second buffer from the pool and decode payload 1 into + // it; batch 0's view must remain untouched. 
+ ev1 := takeEventOrFail(t, io, 2*time.Second) + if ev1.kind != qwpEventKindBatch { + t.Fatalf("ev1 kind = %v, errMsg=%q", ev1.kind, ev1.errMessage) + } + if got := ev1.batch.batch.String(0, 0); got != "BRAVO" { + t.Fatalf("batch1 read = %q, want BRAVO", got) + } + if ev1.batch == ev0.batch { + t.Fatal("dispatcher reused the still-held batch buffer; pool isolation broken") + } + + // Re-read batch 0 AFTER batch 1 has been decoded. Without + // cross-batch isolation the alias would now resolve to BRAVO. + if got := ev0.batch.batch.String(0, 0); got != "ALPHA" { + t.Fatalf("batch0 re-read after batch1 decode = %q, want ALPHA", got) + } + // The aliased byte view captured before batch 1 arrived must + // also still resolve to the same bytes — a stale slice header + // pointing into a clobbered buffer would surface here. + if string(str0Before) != "ALPHA" { + t.Fatalf("aliased Str(0,0) drifted to %q after batch1 decode, want ALPHA", str0Before) + } + + ev0.batch.release() + ev1.batch.release() + + end := takeEventOrFail(t, io, 2*time.Second) + if end.kind != qwpEventKindEnd { + t.Fatalf("end kind = %v, errMsg=%q", end.kind, end.errMessage) + } +} + +// TestQwpEgressIOCreditReplenish confirms that a query opted into flow +// control emits a CREDIT frame on the wire after each batch release, +// carrying the exact payload-byte count. +func TestQwpEgressIOCreditReplenish(t *testing.T) { + const wantReqID = int64(11) + const initialCredit = int64(64 * 1024) + + creditFrames := make(chan []byte, 4) + + srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + req := m.readBinary(ctx) + _, _, credit := parseQueryRequest(t, req) + if credit != initialCredit { + t.Errorf("server saw credit=%d, want %d", credit, initialCredit) + } + m.sendBinary(ctx, buildOneRowInt64Batch(t, wantReqID, 0, "v", 1)) + + // Block until the client sends CREDIT. 
Client control frames + // have no QWP header — they are just msg_kind + body. + for { + f := m.readBinary(ctx) + if f[0] == byte(qwpMsgKindCredit) { + creditFrames <- f + break + } + } + m.sendBinary(ctx, writeQwpFrame(0, buildResultEndBody(wantReqID, 0, 1))) + }) + defer srv.Close() + + tr := connectEgress(t, srv.URL) + defer tr.close() + + io := newQwpEgressIO(tr, 2) + io.start() + defer shutdownIO(t, io) + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + if err := io.submitQuery(ctx, qwpRequest{ + sql: "SELECT 1", + requestId: wantReqID, + initialCredit: initialCredit, + }); err != nil { + t.Fatalf("submitQuery: %v", err) + } + + ev := takeEventOrFail(t, io, 2*time.Second) + if ev.kind != qwpEventKindBatch { + t.Fatalf("ev kind = %v", ev.kind) + } + wantBytes := ev.batch.payloadLen + ev.batch.release() + + // Credit frame should arrive at the server; check the byte count + // on it matches the batch size. CREDIT layout: msg_kind(1) + + // request_id(8) + additional_bytes(varint). + select { + case frame := <-creditFrames: + p := 1 + 8 + got, _, err := qwpReadVarint(frame[p:]) + if err != nil { + t.Fatalf("bad CREDIT varint: %v", err) + } + if int64(got) != int64(wantBytes) { + t.Errorf("CREDIT bytes = %d, want %d", got, wantBytes) + } + case <-time.After(2 * time.Second): + t.Fatal("no CREDIT frame seen") + } + + endEv := takeEventOrFail(t, io, 2*time.Second) + if endEv.kind != qwpEventKindEnd { + t.Fatalf("final event kind = %v, want End", endEv.kind) + } +} + +// TestQwpEgressIOUnknownMsgKind has the server send a bogus msg_kind +// and verifies the I/O loop emits a synthesized error and terminates +// the query. +func TestQwpEgressIOUnknownMsgKind(t *testing.T) { + srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + m.readBinary(ctx) + // Frame with an unknown msg_kind byte (0x7F). 
+ m.sendBinary(ctx, writeQwpFrame(0, []byte{0x7F})) + }) + defer srv.Close() + + tr := connectEgress(t, srv.URL) + defer tr.close() + + io := newQwpEgressIO(tr, 1) + io.start() + defer shutdownIO(t, io) + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + if err := io.submitQuery(ctx, qwpRequest{sql: "x", requestId: 1}); err != nil { + t.Fatalf("submitQuery: %v", err) + } + + ev := takeEventOrFail(t, io, 2*time.Second) + if ev.kind != qwpEventKindTransportError { + t.Fatalf("event kind = %v, want TransportError", ev.kind) + } + if !strings.Contains(ev.errMessage, "unknown msg_kind") { + t.Errorf("errMessage = %q, want unknown-msg-kind", ev.errMessage) + } +} + +// TestQwpEgressIOCacheResetBetweenQueries drives the server-emitted +// CACHE_RESET path end-to-end: query 1's response seeds the +// connection-scoped SYMBOL dict; the server then emits CACHE_RESET +// with mask=DICT; query 2 runs afterwards. Validates three invariants: +// - the dispatcher does not surface CACHE_RESET to the user (the +// event stream is {Batch, End} for Q1 and {ExecDone} for Q2); +// - the decoder's dict is cleared by the time Q2's terminal event +// is delivered; +// - nothing about Q2's normal completion is disturbed. +func TestQwpEgressIOCacheResetBetweenQueries(t *testing.T) { + const q1ReqID = int64(11) + const q2ReqID = int64(12) + + // Build Q1's RESULT_BATCH with a SYMBOL column so the delta dict + // section feeds qwpConnDict.entries. 
+ globalDict := []string{"AAPL", "MSFT"} + tb := newQwpTableBuffer("t") + col, err := tb.getOrCreateColumn("s", qwpTypeSymbol, false) + if err != nil { + t.Fatalf("getOrCreateColumn: %v", err) + } + col.addSymbolID(0) + tb.commitRow() + col.addSymbolID(1) + tb.commitRow() + var enc qwpEncoder + q1Batch := wrapAsResultBatch( + enc.encodeTableWithDeltaDict(tb, globalDict, -1, 1), + q1ReqID, 0) + + srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + // Query 1: batch with symbols + RESULT_END, then CACHE_RESET. + m.readBinary(ctx) + m.sendBinary(ctx, q1Batch) + m.sendBinary(ctx, writeQwpFrame(0, buildResultEndBody(q1ReqID, 0, 2))) + m.sendBinary(ctx, writeQwpFrame(0, buildCacheResetBody(qwpResetMaskDict))) + + // Query 2: a plain EXEC_DONE. If the dispatcher were to leak + // CACHE_RESET as an event, the test's event sequence would pick + // that up before the ExecDone. + m.readBinary(ctx) + m.sendBinary(ctx, writeQwpFrame(0, buildExecDoneBody(q2ReqID, 0x01, 0))) + }) + defer srv.Close() + + tr := connectEgress(t, srv.URL) + defer tr.close() + + io := newQwpEgressIO(tr, 2) + io.start() + defer shutdownIO(t, io) + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + + // Query 1 → expect Batch, End; decoder state populated afterwards. 
+ if err := io.submitQuery(ctx, qwpRequest{sql: "SELECT s FROM t", requestId: q1ReqID}); err != nil { + t.Fatalf("submitQuery q1: %v", err) + } + batchEv := takeEventOrFail(t, io, 2*time.Second) + if batchEv.kind != qwpEventKindBatch { + t.Fatalf("q1 first event = %v, want Batch (errMsg=%q)", batchEv.kind, batchEv.errMessage) + } + if got := batchEv.batch.batch.String(0, 0); got != "AAPL" { + t.Errorf("q1 batch row 0 = %q, want AAPL", got) + } + batchEv.batch.release() + endEv := takeEventOrFail(t, io, 2*time.Second) + if endEv.kind != qwpEventKindEnd { + t.Fatalf("q1 second event = %v, want End (errMsg=%q)", endEv.kind, endEv.errMessage) + } + + // Query 2 → expect ExecDone only (no CACHE_RESET event surfaces). + if err := io.submitQuery(ctx, qwpRequest{sql: "INSERT INTO t VALUES ('x')", requestId: q2ReqID}); err != nil { + t.Fatalf("submitQuery q2: %v", err) + } + ev := takeEventOrFail(t, io, 2*time.Second) + if ev.kind != qwpEventKindExecDone { + t.Fatalf("q2 first event kind = %v, want ExecDone (errMsg=%q)", + ev.kind, ev.errMessage) + } + if ev.requestId != q2ReqID { + t.Errorf("q2 ExecDone requestId = %d, want %d", ev.requestId, q2ReqID) + } + + // Shut the dispatcher down so it cannot touch the decoder while we + // inspect — the happens-before via events channel already covers + // correctness; the shutdown makes the intent explicit for readers. + shutdownIO(t, io) + + if io.decoder.dict.size() != 0 { + t.Errorf("dict not cleared after CACHE_RESET: size=%d", io.decoder.dict.size()) + } +} + +// TestQwpEgressIOCacheResetTruncatedPoisons feeds a CACHE_RESET frame +// that ends right after the msg_kind byte (no reset_mask). The +// dispatcher must surface the decode error, poison the connection, +// and reject the next submitQuery immediately. 
+func TestQwpEgressIOCacheResetTruncatedPoisons(t *testing.T) { + srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + m.readBinary(ctx) + m.sendBinary(ctx, writeQwpFrame(0, []byte{byte(qwpMsgKindCacheReset)})) + }) + defer srv.Close() + + tr := connectEgress(t, srv.URL) + defer tr.close() + + io := newQwpEgressIO(tr, 1) + io.start() + defer shutdownIO(t, io) + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + if err := io.submitQuery(ctx, qwpRequest{sql: "x", requestId: 1}); err != nil { + t.Fatalf("submitQuery: %v", err) + } + + ev := takeEventOrFail(t, io, 2*time.Second) + if ev.kind != qwpEventKindTransportError { + t.Fatalf("event kind = %v, want TransportError", ev.kind) + } + if !strings.Contains(ev.errMessage, "truncated before reset_mask") { + t.Errorf("errMessage = %q, want truncated-reset_mask", ev.errMessage) + } + + // A fresh submitQuery must now fail synchronously because the + // decoder state is untrustworthy. + if err := io.submitQuery(ctx, qwpRequest{sql: "x", requestId: 2}); err == nil { + t.Fatal("submitQuery after poison returned nil; expected latched ioErr") + } +} + +// TestQwpEgressIOConcurrentCancelAndShutdown stress-tests the cancel / +// shutdown races: a test-runner goroutine fires requestCancel while +// the test's main goroutine fires shutdown. Both should complete +// without a deadlock or a goroutine leak. +func TestQwpEgressIOConcurrentCancelAndShutdown(t *testing.T) { + ready := make(chan struct{}) + srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + m.readBinary(ctx) + close(ready) + // Stall. 
+ time.Sleep(500 * time.Millisecond) + }) + defer srv.Close() + + tr := connectEgress(t, srv.URL) + defer tr.close() + + io := newQwpEgressIO(tr, 2) + io.start() + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + if err := io.submitQuery(ctx, qwpRequest{sql: "x", requestId: 99}); err != nil { + t.Fatalf("submitQuery: %v", err) + } + <-ready + + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + io.requestCancel(99) + }() + + shutCtx, shutCancel := context.WithTimeout(context.Background(), 2*time.Second) + defer shutCancel() + if err := io.shutdown(shutCtx); err != nil { + t.Fatalf("shutdown: %v", err) + } + wg.Wait() +} + +// TestQwpEgressIODecodeFailure feeds a RESULT_BATCH frame whose header +// is valid but body is truncated (just the msg_kind byte with nothing +// after it). handleResultBatch must return the borrowed buffer to the +// pool — stranding it would permanently leak a slot — surface a +// synthesized decode-error event, and terminate the query cleanly. +// Connection-level poisoning behavior after this path is covered by +// TestQwpEgressIODecodeFailurePoisons. +func TestQwpEgressIODecodeFailure(t *testing.T) { + const wantReqID = int64(17) + + srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + m.readBinary(ctx) + // Valid header + RESULT_BATCH kind + zero-length body. decode() + // dispatches into parseFrameHeader (accepts), then tries to + // read the requestId int64 and fails with truncation. 
+ m.sendBinary(ctx, writeQwpFrame(0, []byte{byte(qwpMsgKindResultBatch)})) + }) + defer srv.Close() + + tr := connectEgress(t, srv.URL) + defer tr.close() + + const poolSize = 2 + io := newQwpEgressIO(tr, poolSize) + io.start() + defer shutdownIO(t, io) + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + if err := io.submitQuery(ctx, qwpRequest{sql: "SELECT 1", requestId: wantReqID}); err != nil { + t.Fatalf("submitQuery: %v", err) + } + + ev := takeEventOrFail(t, io, 2*time.Second) + if ev.kind != qwpEventKindTransportError { + t.Fatalf("event kind = %v, want TransportError", ev.kind) + } + if !strings.Contains(ev.errMessage, "decode") { + t.Errorf("errMessage = %q, expected to contain \"decode\"", ev.errMessage) + } + + // The borrowed buffer must be back in the pool — the error branch + // of handleResultBatch explicitly returns it before emitting the + // event. Poll briefly because the event emit and the pool return + // happen on the dispatcher but we read from a different goroutine. + if !waitForPoolSize(io, poolSize, 500*time.Millisecond) { + t.Fatalf("buffer pool size = %d, want %d — decode-error path stranded a buffer", + len(io.buffers), poolSize) + } +} + +// TestQwpEgressIODecodeFailurePoisons verifies the terminal-flag +// contract: once a decode error desyncs the per-connection decoder +// state, ioErr is latched and every subsequent submitQuery returns +// it immediately — a fresh query must never be decoded against +// stale dict/schema state. Mirrors the ingest send loop's latched +// terminal error (recordFatal / sendLoopCheckError). +func TestQwpEgressIODecodeFailurePoisons(t *testing.T) { + const wantReqID = int64(31) + + srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + m.readBinary(ctx) + // Truncated RESULT_BATCH — same shape as TestQwpEgressIODecodeFailure. 
+ m.sendBinary(ctx, writeQwpFrame(0, []byte{byte(qwpMsgKindResultBatch)})) + // Hold the connection open so the reader does not synthesize + // its own "server closed" event that would race the decode + // error we're trying to observe as the terminal event. + time.Sleep(500 * time.Millisecond) + }) + defer srv.Close() + + tr := connectEgress(t, srv.URL) + defer tr.close() + + io := newQwpEgressIO(tr, 2) + io.start() + defer shutdownIO(t, io) + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + if err := io.submitQuery(ctx, qwpRequest{sql: "SELECT 1", requestId: wantReqID}); err != nil { + t.Fatalf("submitQuery first: %v", err) + } + + ev := takeEventOrFail(t, io, 2*time.Second) + if ev.kind != qwpEventKindTransportError { + t.Fatalf("event kind = %v, want TransportError", ev.kind) + } + + // The latch is set on the dispatcher goroutine right before the + // error event hits the channel; by the time the user has observed + // the event, loadIoErr must also be populated. + gotLoad := io.loadIoErr() + if gotLoad == nil { + t.Fatalf("loadIoErr() = nil after decode failure, want latched error") + } + if !strings.Contains(gotLoad.Error(), "decode") { + t.Errorf("loadIoErr() = %q, expected to contain \"decode\"", gotLoad.Error()) + } + + // A follow-up submitQuery must fail synchronously with the latched + // error — not block, not succeed, not return a different error. + // Using a generous ctx timeout ensures we are not accidentally + // observing ctx expiry. 
+ gotSubmit := io.submitQuery(ctx, qwpRequest{sql: "SELECT 2", requestId: wantReqID + 1}) + if gotSubmit == nil { + t.Fatalf("submitQuery after decode failure: got nil error, want latched decode error") + } + if gotSubmit != gotLoad { + t.Errorf("submitQuery returned %q, want identity with latched %q", + gotSubmit.Error(), gotLoad.Error()) + } +} + +// TestQwpEgressIOReleaseAfterShutdown exercises the closed.Load() +// early-exit in releaseBuffer: a user that holds onto a batch across +// shutdown must be able to call release() without panicking, +// blocking, or corrupting the already-drained pool. +func TestQwpEgressIOReleaseAfterShutdown(t *testing.T) { + const wantReqID = int64(23) + + srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + m.readBinary(ctx) + m.sendBinary(ctx, buildOneRowInt64Batch(t, wantReqID, 0, "v", 1)) + // Keep the connection open so the client's shutdown drives + // the teardown (rather than the server closing first and the + // reader emitting its own synthetic error). + time.Sleep(500 * time.Millisecond) + }) + defer srv.Close() + + tr := connectEgress(t, srv.URL) + defer tr.close() + + io := newQwpEgressIO(tr, 2) + io.start() + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + if err := io.submitQuery(ctx, qwpRequest{sql: "x", requestId: wantReqID}); err != nil { + t.Fatalf("submitQuery: %v", err) + } + + ev := takeEventOrFail(t, io, 2*time.Second) + if ev.kind != qwpEventKindBatch { + t.Fatalf("event kind = %v, want Batch", ev.kind) + } + heldBuf := ev.batch + + // Shutdown WITHOUT releasing the buffer the user still holds. 
+ shutCtx, shutCancel := context.WithTimeout(context.Background(), 1*time.Second) + defer shutCancel() + if err := io.shutdown(shutCtx); err != nil { + t.Fatalf("shutdown: %v", err) + } + // Post-shutdown invariant: the dispatcher sets closed=true in its + // defer before doneCh fires (which is what unblocks shutdown). + if !io.closed.Load() { + t.Fatal("dispatcher didn't set closed=true before exiting") + } + + poolBefore := len(io.buffers) + creditBefore := io.pendingCredit.Load() + + // release after shutdown must return promptly: the early-exit + // path skips the pool send and the notify. Runs in a goroutine + // with a timeout so a hypothetical deadlock surfaces as a test + // failure rather than hanging the suite. + done := make(chan struct{}) + go func() { + defer close(done) + heldBuf.release() + }() + select { + case <-done: + case <-time.After(500 * time.Millisecond): + t.Fatal("releaseBuffer after shutdown blocked") + } + + // The early-exit skips pendingCredit.Add and the pool send — the + // observable state should be unchanged. Without the closed check, + // a post-shutdown release would leave buf dangling on io.buffers + // with no consumer to drain it. + if got := len(io.buffers); got != poolBefore { + t.Errorf("pool size changed after post-shutdown release: before=%d after=%d", + poolBefore, got) + } + if got := io.pendingCredit.Load(); got != creditBefore { + t.Errorf("pendingCredit changed after post-shutdown release: before=%d after=%d", + creditBefore, got) + } + + // A second release on the same buffer must also stay harmless. 
+ done2 := make(chan struct{}) + go func() { + defer close(done2) + heldBuf.release() + }() + select { + case <-done2: + case <-time.After(500 * time.Millisecond): + t.Fatal("second releaseBuffer after shutdown blocked") + } +} + +// TestQwpEgressIOReleaseClosePoolRace races releaseBuffer against the +// dispatcher's exit-defer (closed.Store(true) + close(events)) across +// 200 iterations to surface any TOCTOU bug in the closed.Load() guard +// in releaseBuffer. Mirrors Java's QwpEgressIoThreadCloseRaceTest. +// +// In the Java client the concern is a leaked native scratch buffer: +// a user thread reads closed==false, pauses, lets closePool drain +// freeBuffers, then offers its buffer into the now-emptied queue and +// the buffer's native memory leaks. Go's qwpBatchBuffer holds only +// GC-managed slices, so the failure mode here is narrower — what we +// pin is that the release/exit pair never panics, never blocks, and +// has no data race detectable under -race. The existing single-shot +// TestQwpEgressIOReleaseAfterShutdown only covers the post-shutdown +// case; the close-during-release window needs the loop. +func TestQwpEgressIOReleaseClosePoolRace(t *testing.T) { + const iterations = 50 + for iter := 0; iter < iterations; iter++ { + runReleaseClosePoolRaceOnce(t, iter) + } +} + +func runReleaseClosePoolRaceOnce(t *testing.T, iter int) { + // A real, started egress IO so the race runs against the REAL + // dispatcher teardown driven by shutdown() — not a hand-rolled copy + // of its exit defers, which would silently go stale if the teardown + // sequence ever changed. The mock just idles; no query is needed — + // shutdown() alone makes the dispatcher return and run its exit + // defers (decoder.close, close(events), closed.Store(true)). 
+ srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + for { + if _, _, err := m.conn.Read(ctx); err != nil { + return + } + } + }) + defer srv.Close() + + tr := connectEgress(t, srv.URL) + defer tr.close() + + io := newQwpEgressIO(tr, 2) + io.start() + + // Pull both pool buffers out so we can release them — what the + // dispatcher would have handed to the user as batches. + b0 := <-io.buffers + b1 := <-io.buffers + + start := make(chan struct{}) + var wg sync.WaitGroup + wg.Add(2) + go func() { + defer wg.Done() + <-start + io.releaseBuffer(b0) + io.releaseBuffer(b1) + }() + go func() { + defer wg.Done() + <-start + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _ = io.shutdown(ctx) + }() + + // Release the start gate so both goroutines hit the racing section + // as close to simultaneously as the runtime allows. + close(start) + + done := make(chan struct{}) + go func() { + wg.Wait() + close(done) + }() + select { + case <-done: + case <-time.After(3 * time.Second): + t.Fatalf("iteration %d: race between releaseBuffer and shutdown deadlocked", iter) + } +} + +// TestQwpEgressIOTakeEventWakesOnShutdown parks a consumer on +// takeEvent with nothing queued, then shuts the dispatcher down. The +// consumer must wake with a terminal error rather than blocking on an +// open-but-silent channel until its own ctx expires. This is the +// guarantee that replaced the old best-effort postShutdownSentinel — +// closing the events channel means a parked consumer always wakes. +func TestQwpEgressIOTakeEventWakesOnShutdown(t *testing.T) { + srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + m.readBinary(ctx) + // Never reply — the consumer will be parked waiting. 
+ time.Sleep(500 * time.Millisecond) + }) + defer srv.Close() + + tr := connectEgress(t, srv.URL) + defer tr.close() + + io := newQwpEgressIO(tr, 2) + io.start() + + submitCtx, submitCancel := context.WithTimeout(context.Background(), 2*time.Second) + defer submitCancel() + if err := io.submitQuery(submitCtx, qwpRequest{sql: "x", requestId: 1}); err != nil { + t.Fatalf("submitQuery: %v", err) + } + + // Park a goroutine inside takeEvent with a ctx that won't fire + // before our shutdown does — if the channel-close signal doesn't + // wake takeEvent, this assertion would have to wait for the ctx. + done := make(chan error, 1) + go func() { + waitCtx, waitCancel := context.WithTimeout(context.Background(), 5*time.Second) + defer waitCancel() + _, err := io.takeEvent(waitCtx) + done <- err + }() + + // Small sleep to raise the probability that the goroutine is + // actually parked inside the takeEvent select when shutdown + // fires. Not a correctness requirement — even if the goroutine + // hasn't reached the select yet, close(events) happens-before the + // receive, so takeEvent still returns the terminal error. + time.Sleep(50 * time.Millisecond) + + shutCtx, shutCancel := context.WithTimeout(context.Background(), 1*time.Second) + defer shutCancel() + if err := io.shutdown(shutCtx); err != nil { + t.Fatalf("shutdown: %v", err) + } + + select { + case err := <-done: + if err == nil { + t.Fatal("takeEvent returned nil after shutdown; expected terminal error") + } + if !strings.Contains(err.Error(), "terminated") { + t.Errorf("takeEvent error = %q, want substring \"terminated\"", err) + } + case <-time.After(500 * time.Millisecond): + t.Fatal("takeEvent did not wake within 500ms of shutdown") + } +} + +// TestQwpEgressIOShutdownPreservesQueuedEvents verifies that events +// already buffered on io.events at shutdown aren't dropped: the +// consumer drains them normally and only afterwards sees the +// closed-channel signal. 
Regression guard against an over-eager +// postShutdownSentinel design that would have had to discard queued +// events to make room for its own terminal message. +func TestQwpEgressIOShutdownPreservesQueuedEvents(t *testing.T) { + const wantReqID = int64(29) + + srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + m.readBinary(ctx) + m.sendBinary(ctx, buildOneRowInt64Batch(t, wantReqID, 0, "v", 42)) + // Stay connected so the client's reader doesn't see a close + // and synthesize a transport error before the test's own + // shutdown fires — we want the batch event to be the only + // thing on io.events when we tear down. + time.Sleep(500 * time.Millisecond) + }) + defer srv.Close() + + tr := connectEgress(t, srv.URL) + defer tr.close() + + io := newQwpEgressIO(tr, 2) + io.start() + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + if err := io.submitQuery(ctx, qwpRequest{sql: "x", requestId: wantReqID}); err != nil { + t.Fatalf("submitQuery: %v", err) + } + + // Wait for the dispatcher to actually deliver the batch event + // onto io.events. <-serverSide is not enough — the client's + // reader + dispatcher may not have processed the frame yet. + // len(chan) is a safe atomic read at runtime. + if !waitForEventsCount(io, 1, 500*time.Millisecond) { + t.Fatalf("batch event never queued: len(events)=%d", len(io.events)) + } + + // Shut down WITHOUT draining. The batch event stays queued. + shutCtx, shutCancel := context.WithTimeout(context.Background(), 1*time.Second) + defer shutCancel() + if err := io.shutdown(shutCtx); err != nil { + t.Fatalf("shutdown: %v", err) + } + + // Drain — the batch must still be recoverable despite the + // channel having been closed by the dispatcher's defer. 
+ ev := takeEventOrFail(t, io, 500*time.Millisecond) + if ev.kind != qwpEventKindBatch { + t.Fatalf("first event kind = %v, want Batch (errMsg=%q)", ev.kind, ev.errMessage) + } + if got := ev.batch.batch.Int64(0, 0); got != 42 { + t.Errorf("queued batch value = %d, want 42", got) + } + ev.batch.release() + + // Next take must see the terminal signal now that the queue is + // drained — from the channel close, not a synthesized event. + takeCtx, takeCancel := context.WithTimeout(context.Background(), 500*time.Millisecond) + defer takeCancel() + if _, err := io.takeEvent(takeCtx); err == nil { + t.Fatal("post-drain takeEvent returned no error; expected terminal error") + } else if !strings.Contains(err.Error(), "terminated") { + t.Errorf("post-drain takeEvent error = %q, want substring \"terminated\"", err) + } +} + +// --- shared helpers --- + +func takeEventOrFail(t *testing.T, io *qwpEgressIO, timeout time.Duration) qwpEvent { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + ev, err := io.takeEvent(ctx) + if err != nil { + t.Fatalf("takeEvent: %v", err) + } + return ev +} + +// drainBatchesToEnd reads events until an End event is seen, asserting +// the expected number of batches arrives first. Returns the Int64(0,0) +// value of each batch for caller-side sanity checks. 
+func drainBatchesToEnd(t *testing.T, io *qwpEgressIO, wantBatches int) []int64 { + t.Helper() + var values []int64 + for i := 0; i < wantBatches; i++ { + ev := takeEventOrFail(t, io, 2*time.Second) + if ev.kind != qwpEventKindBatch { + t.Fatalf("event %d: kind = %v, errMsg=%q", i, ev.kind, ev.errMessage) + } + values = append(values, ev.batch.batch.Int64(0, 0)) + ev.batch.release() + } + ev := takeEventOrFail(t, io, 2*time.Second) + if ev.kind != qwpEventKindEnd { + t.Fatalf("final event: kind = %v, errMsg=%q", ev.kind, ev.errMessage) + } + return values +} + +// shutdownIO wraps qwpEgressIO.shutdown with a bounded context for +// deferred cleanup in tests. Not fatal on error — the goroutine may +// already have exited on its own after a server error, in which case +// shutdown is a no-op. +func shutdownIO(t *testing.T, io *qwpEgressIO) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + if err := io.shutdown(ctx); err != nil { + t.Logf("shutdown: %v", err) + } +} + +// waitForPoolSize polls len(io.buffers) until it reaches want or the +// timeout expires. Used where the assertion races with the dispatcher +// wrapping up — e.g. after a decode error, where the pool-return and +// the event emit happen on the dispatcher but the test reads the +// event on a different goroutine. +func waitForPoolSize(io *qwpEgressIO, want int, timeout time.Duration) bool { + deadline := time.Now().Add(timeout) + for { + if len(io.buffers) == want { + return true + } + if time.Now().After(deadline) { + return len(io.buffers) == want + } + time.Sleep(10 * time.Millisecond) + } +} + +// waitForEventsCount polls len(io.events) until it reaches at least +// want or the timeout expires. Used by the shutdown-preserves-queued +// test to synchronize on the dispatcher having actually delivered an +// event to the consumer-visible channel (rather than just read it +// from the wire). 
+func waitForEventsCount(io *qwpEgressIO, want int, timeout time.Duration) bool { + deadline := time.Now().Add(timeout) + for { + if len(io.events) >= want { + return true + } + if time.Now().After(deadline) { + return len(io.events) >= want + } + time.Sleep(10 * time.Millisecond) + } +} + +// newStalledTransport returns a qwpTransport whose WebSocket conn is +// wired to an in-process net.Pipe. The server side completes the HTTP +// upgrade, optionally emits preSend bytes right after the upgrade +// (for seeding a valid inbound WebSocket frame before the stall), and +// then stops reading. Because net.Pipe is synchronous and unbuffered, +// any subsequent client-side Write blocks until the pipe is closed. +// Use this to simulate a hung peer (TCP zero-window, stuck +// application) without relying on OS socket buffer sizes. +// +// The caller must arrange for the returned clientConn to be closed at +// test end so a blocked Write unwinds and goroutines don't leak. +func newStalledTransport(t *testing.T, preSend []byte) (tr *qwpTransport, clientConn net.Conn) { + t.Helper() + clientConn, serverConn := net.Pipe() + + stallDone := make(chan struct{}) + t.Cleanup(func() { + close(stallDone) + }) + + go func() { + defer serverConn.Close() + br := bufio.NewReader(serverConn) + var wsKey string + for { + line, err := br.ReadString('\n') + if err != nil { + return + } + line = strings.TrimRight(line, "\r\n") + if line == "" { + break + } + if len(line) > 20 && strings.EqualFold(line[:19], "Sec-WebSocket-Key: ") { + wsKey = strings.TrimSpace(line[19:]) + } + } + h := sha1.New() + h.Write([]byte(wsKey + wsAcceptGUID)) + accept := base64.StdEncoding.EncodeToString(h.Sum(nil)) + resp := "HTTP/1.1 101 Switching Protocols\r\n" + + "Upgrade: websocket\r\n" + + "Connection: Upgrade\r\n" + + "Sec-WebSocket-Accept: " + accept + "\r\n" + + qwpHeaderVersion + ": " + fmt.Sprintf("%d", qwpVersion) + "\r\n" + + "\r\n" + if _, err := serverConn.Write([]byte(resp)); err != nil { + return 
+ } + if len(preSend) > 0 { + if _, err := serverConn.Write(preSend); err != nil { + return + } + } + // Stall: never read again. The client's next Write blocks + // because net.Pipe has no buffer. + <-stallDone + }() + + dialCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + conn, resp, err := websocket.Dial(dialCtx, "ws://stall.local"+qwpReadPath, &websocket.DialOptions{ + HTTPHeader: http.Header{ + qwpHeaderMaxVersion: []string{fmt.Sprintf("%d", qwpVersion)}, + qwpHeaderClientId: []string{qwpClientId}, + }, + HTTPClient: &http.Client{ + Transport: &http.Transport{ + DialContext: func(_ context.Context, _, _ string) (net.Conn, error) { + return clientConn, nil + }, + }, + }, + }) + if err != nil { + t.Fatalf("dial: %v", err) + } + if resp != nil && resp.Body != nil { + _ = resp.Body.Close() + } + conn.SetReadLimit(-1) + + return &qwpTransport{conn: conn}, clientConn +} + +// TestQwpEgressIOShutdownUnblocksStuckWrite checks that qwpEgressIO's +// shutdown returns promptly even when the dispatcher is parked inside +// a conn.Write that the peer has stopped draining AND the reader is +// not currently inside conn.Read. Regression guard for sendMessage +// passing context.Background(): with that bug the Write has no ctx +// to observe shutdown, so it stays parked until the underlying +// transport is torn down externally. Cancelling readCtx does NOT +// help here — coder/websocket only tears down the underlying net.Conn +// via the AfterFunc registered during an active Read, and that +// AfterFunc has been unregistered by the time the reader parks on +// frameCh. +// +// Scenario setup: +// +// 1. Server upgrades, emits one valid binary WS frame, then stalls. +// 2. Reader receives the frame, returns from conn.Read (Read timeout +// AfterFunc cleared), and parks on the frameCh/shutdownCh select. +// 3. User submits a query. 
Dispatcher picks it up and enters +// sendQueryRequest → conn.Write; net.Pipe blocks the Write because +// the server is no longer reading. +// 4. shutdown is called. Reader wakes via shutdownCh and exits. The +// dispatcher must also wind down within the timeout — only a +// shutdown-aware Write ctx can guarantee that. +func TestQwpEgressIOShutdownUnblocksStuckWrite(t *testing.T) { + // One valid server-to-client binary WS frame: FIN+binary opcode, + // 1-byte payload (content is irrelevant — the dispatcher never + // decodes it, because it's stuck in Write before reaching + // receiveLoop). + preSend := []byte{0x82, 0x01, 0x00} + tr, clientConn := newStalledTransport(t, preSend) + t.Cleanup(func() { _ = clientConn.Close() }) + + io := newQwpEgressIO(tr, 2) + io.start() + + // Let the reader pull the pre-sent frame off the wire and park on + // the frameCh send — at which point it is no longer inside + // conn.Read and readCtx cancellation can no longer tear down the + // underlying net.Conn via coder/websocket's read AfterFunc. + time.Sleep(100 * time.Millisecond) + + submitCtx, submitCancel := context.WithTimeout(context.Background(), 500*time.Millisecond) + defer submitCancel() + if err := io.submitQuery(submitCtx, qwpRequest{sql: "SELECT 1", requestId: 1}); err != nil { + t.Fatalf("submitQuery: %v", err) + } + + // Give the dispatcher time to pick up the request and park inside + // conn.Write on the stalled pipe. 
+ time.Sleep(100 * time.Millisecond) + + shutCtx, shutCancel := context.WithTimeout(context.Background(), 1*time.Second) + defer shutCancel() + start := time.Now() + err := io.shutdown(shutCtx) + elapsed := time.Since(start) + if err != nil { + t.Fatalf("shutdown returned %v after %v; want clean return — sendMessage ctx must participate in shutdown", err, elapsed) + } + if elapsed > 500*time.Millisecond { + t.Fatalf("shutdown took %v; want well under 500ms — dispatcher was stuck in Write past shutdown signal", elapsed) + } +} + +// fillFrameReader fills every Read fully and never reports io.EOF, +// modelling a hostile or buggy server streaming an unbounded frame. +type fillFrameReader struct{} + +func (fillFrameReader) Read(p []byte) (int, error) { return len(p), nil } + +// TestQwpReadFrameIntoCeiling pins the defense-in-depth ceiling: an +// unbounded inbound frame must be rejected without growing the buffer +// past qwpMaxFrameReadLimit (host-OOM hardening), while a legitimate +// frame of exactly qwpMaxBatchSize — the egress decoder's own accept +// boundary — must still be read in full. The latter is what +// qwpReadLimitSlack buys: coder/websocket's limitReader and this +// function would otherwise false-reject an exactly-cap frame whose +// terminal io.EOF arrives on a separate Read. 
+func TestQwpReadFrameIntoCeiling(t *testing.T) {
+	buf := make([]byte, 0)
+	pb := &buf
+	out, err := qwpReadFrameInto(fillFrameReader{}, pb) // case 1: the reader never reports io.EOF, i.e. an unbounded frame
+	if err == nil {
+		t.Fatalf("unbounded frame: expected error, got nil (len=%d)", len(out))
+	}
+	if !strings.Contains(err.Error(), "exceeds") {
+		t.Fatalf("unbounded frame: unexpected error: %v", err)
+	}
+	if cap(*pb) > qwpMaxFrameReadLimit { // checks capacity, not length: the backing array itself must stay under the ceiling
+		t.Fatalf("buffer grew to cap %d, exceeds ceiling %d", cap(*pb), qwpMaxFrameReadLimit)
+	}
+
+	buf2 := make([]byte, 0)
+	pb2 := &buf2
+	out2, err := qwpReadFrameInto(io.LimitReader(fillFrameReader{}, qwpMaxBatchSize), pb2) // case 2: LimitReader yields exactly qwpMaxBatchSize bytes, then io.EOF
+	if err != nil {
+		t.Fatalf("exact-qwpMaxBatchSize frame rejected: %v", err)
+	}
+	if len(out2) != qwpMaxBatchSize {
+		t.Fatalf("exact-qwpMaxBatchSize frame: got %d bytes, want %d", len(out2), qwpMaxBatchSize)
+	}
+}
+
+// TestQwpEgressIOCacheResetMidQuery drives a CACHE_RESET interleaved
+// between two RESULT_BATCH frames of the SAME query. The server contract
+// is that CACHE_RESET arrives between queries, but the dispatcher must
+// not be tripped up if one lands mid-query: it consumes the frame
+// silently (no user-visible event, the query is not terminated) and
+// clears the connection dict, after which the continuation batch
+// re-seeds the dict from id 0 and decodes normally.
+//
+// The continuation's delta carries deltaStart=0, which qwpConnDict
+// accepts only when the dict was actually cleared (otherwise appendDelta
+// rejects it as out of sync) — so a regression that dropped or
+// mis-ordered the mid-query reset surfaces here as a decode error on
+// batch 1 rather than a silent pass.
+func TestQwpEgressIOCacheResetMidQuery(t *testing.T) {
+	const reqID = int64(21)
+	globalDict := []string{"AAPL", "MSFT"}
+
+	// batch_seq 0: rows AAPL, MSFT (ids 0,1); seeds dict ids 0..1.
+	tb0 := newQwpTableBuffer("t")
+	for _, id := range []int32{0, 1} {
+		col, _ := tb0.getOrCreateColumn("s", qwpTypeSymbol, false)
+		col.addSymbolID(id)
+		tb0.commitRow()
+	}
+	var enc qwpEncoder
+	batch0 := wrapAsResultBatch(enc.encodeTableWithDeltaDict(tb0, globalDict, -1, 1), reqID, 0)
+
+	// batch_seq 1 (continuation): rows MSFT, AAPL (ids 1,0). Re-advertises
+	// ids 0..1 from deltaStart=0 — valid only because the mid-query
+	// CACHE_RESET cleared the dict first.
+	tb1 := newQwpTableBuffer("t")
+	for _, id := range []int32{1, 0} {
+		col, _ := tb1.getOrCreateColumn("s", qwpTypeSymbol, false)
+		col.addSymbolID(id)
+		tb1.commitRow()
+	}
+	batch1 := wrapAsResultBatch(enc.encodeTableWithDeltaDict(tb1, globalDict, -1, 1), reqID, 1)
+
+	srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) {
+		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		defer cancel()
+		m.readBinary(ctx) // wait for the client's first binary frame; its payload is not inspected
+		m.sendBinary(ctx, batch0)
+		m.sendBinary(ctx, writeQwpFrame(0, buildCacheResetBody(qwpResetMaskDict))) // the mid-query reset under test, sent between batch_seq 0 and 1
+		m.sendBinary(ctx, batch1)
+		m.sendBinary(ctx, writeQwpFrame(0, buildResultEndBody(reqID, 1, 4)))
+	})
+	defer srv.Close()
+
+	tr := connectEgress(t, srv.URL)
+	defer tr.close()
+	io := newQwpEgressIO(tr, 2)
+	io.start()
+	defer shutdownIO(t, io)
+
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+	defer cancel()
+	if err := io.submitQuery(ctx, qwpRequest{sql: "SELECT s FROM t", requestId: reqID}); err != nil {
+		t.Fatalf("submitQuery: %v", err)
+	}
+
+	// The mid-query CACHE_RESET is consumed silently: the event stream is
+	// exactly {Batch, Batch, End}.
+	ev0 := takeEventOrFail(t, io, 2*time.Second)
+	if ev0.kind != qwpEventKindBatch {
+		t.Fatalf("event 0 = %v, want Batch (errMsg=%q)", ev0.kind, ev0.errMessage)
+	}
+	if a, b := ev0.batch.batch.String(0, 0), ev0.batch.batch.String(0, 1); a != "AAPL" || b != "MSFT" {
+		t.Errorf("batch 0 rows = %q,%q, want AAPL,MSFT", a, b)
+	}
+	ev0.batch.release()
+
+	ev1 := takeEventOrFail(t, io, 2*time.Second)
+	if ev1.kind != qwpEventKindBatch {
+		t.Fatalf("event 1 = %v, want Batch (errMsg=%q)", ev1.kind, ev1.errMessage)
+	}
+	if a, b := ev1.batch.batch.String(0, 0), ev1.batch.batch.String(0, 1); a != "MSFT" || b != "AAPL" {
+		t.Errorf("batch 1 rows = %q,%q, want MSFT,AAPL", a, b)
+	}
+	ev1.batch.release()
+
+	end := takeEventOrFail(t, io, 2*time.Second)
+	if end.kind != qwpEventKindEnd {
+		t.Fatalf("event 2 = %v, want End (errMsg=%q)", end.kind, end.errMessage)
+	}
+
+	// The continuation re-seeded the dict from id 0 after the reset.
+	shutdownIO(t, io) // NOTE(review): explicit shutdown before reading io.decoder.dict; the deferred shutdownIO above then runs a second time — assumes it tolerates that, confirm
+	if got := io.decoder.dict.size(); got != 2 {
+		t.Errorf("dict size after reset+reseed = %d, want 2", got)
+	}
+}
+
+// TestQwpEgressIOCreditStarvationNeverReleases pins the behavior when a
+// flow-controlled query's consumer reads a batch and then never releases
+// it: with the buffer pool exhausted, the dispatcher parks (no busy-spin,
+// no further events) and — because CREDIT is only emitted on release —
+// the server is starved of credit (no CREDIT frame is sent). shutdown
+// must still unblock the parked dispatcher and return cleanly, proving
+// no deadlock or goroutine leak.
+func TestQwpEgressIOCreditStarvationNeverReleases(t *testing.T) {
+	const reqID = int64(31)
+	const initialCredit = int64(64 * 1024)
+
+	sawCredit := make(chan struct{}, 1) // capacity 1: the handler's non-blocking send below is kept even if the test is not yet receiving
+	srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) {
+		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		defer cancel()
+		req := m.readBinary(ctx)
+		if _, _, credit := parseQueryRequest(t, req); credit != initialCredit {
+			t.Errorf("server saw credit=%d, want %d", credit, initialCredit)
+		}
+		// Two batches: with pool size 1 the client decodes the first and
+		// parks acquiring a buffer for the second.
+		m.sendBinary(ctx, buildOneRowInt64Batch(t, reqID, 0, "v", 10))
+		m.sendBinary(ctx, buildOneRowInt64Batch(t, reqID, 1, "v", 20))
+		// Watch for a CREDIT frame until the client disconnects. A
+		// never-releasing consumer sends none. Read directly (not
+		// readBinary) so the expected close/cancel is not fatal.
+		for {
+			typ, data, err := m.conn.Read(ctx)
+			if err != nil {
+				return
+			}
+			if typ == websocket.MessageBinary && len(data) > 0 && data[0] == byte(qwpMsgKindCredit) { // first payload byte is compared against the CREDIT message kind
+				select {
+				case sawCredit <- struct{}{}:
+				default:
+				}
+				return
+			}
+		}
+	})
+	defer srv.Close()
+
+	tr := connectEgress(t, srv.URL)
+	defer tr.close()
+	io := newQwpEgressIO(tr, 1) // pool of size 1
+	io.start()
+	defer shutdownIO(t, io)
+
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+	defer cancel()
+	if err := io.submitQuery(ctx, qwpRequest{
+		sql:           "SELECT v FROM t",
+		requestId:     reqID,
+		initialCredit: initialCredit,
+	}); err != nil {
+		t.Fatalf("submitQuery: %v", err)
+	}
+
+	// Read the first batch and HOLD it — never release.
+	ev := takeEventOrFail(t, io, 2*time.Second)
+	if ev.kind != qwpEventKindBatch {
+		t.Fatalf("first event = %v, want Batch (errMsg=%q)", ev.kind, ev.errMessage)
+	}
+
+	// The dispatcher parks on the exhausted pool: no second event arrives.
+	shortCtx, shortCancel := context.WithTimeout(context.Background(), 300*time.Millisecond)
+	if _, err := io.takeEvent(shortCtx); err == nil { // NOTE(review): any error counts as "no event" — presumably the 300ms ctx expiry; the error kind is not asserted
+		shortCancel()
+		t.Fatal("event arrived while the consumer starved the pool")
+	}
+	shortCancel()
+
+	// No CREDIT is emitted while the batch is held.
+	select {
+	case <-sawCredit:
+		t.Fatal("client emitted CREDIT despite the consumer never releasing")
+	case <-time.After(800 * time.Millisecond):
+	}
+
+	// shutdown must unblock the parked dispatcher and return cleanly,
+	// even though the held batch is never released.
+	shutCtx, shutCancel := context.WithTimeout(context.Background(), 2*time.Second)
+	defer shutCancel()
+	start := time.Now()
+	if err := io.shutdown(shutCtx); err != nil {
+		t.Fatalf("shutdown returned %v; want clean return despite a never-releasing consumer", err)
+	}
+	if elapsed := time.Since(start); elapsed > time.Second {
+		t.Fatalf("shutdown took %v; dispatcher did not unblock promptly", elapsed)
+	}
+}
+
+// TestQwpEgressIOBindPath verifies the egress bind path end-to-end at the
+// I/O layer: typed binds encoded via QwpBinds are carried verbatim in the
+// QUERY_REQUEST after the bind_count field, and the query then completes
+// normally. Unit-level coverage for bind transmission, which is otherwise
+// exercised only by the server-fixture fuzz tests.
+func TestQwpEgressIOBindPath(t *testing.T) {
+	const reqID = int64(41)
+	const wantSQL = "SELECT * FROM t WHERE a = $1 AND b = $2"
+
+	// Encode two typed binds the way QwpQueryClient.buildRequest does.
+	var binds QwpBinds
+	binds.reset()
+	binds.LongBind(0, 0x0123456789ABCDEF).VarcharBind(1, "needle")
+	if err := binds.Err(); err != nil {
+		t.Fatalf("encode binds: %v", err)
+	}
+	wantBindPayload := append([]byte(nil), binds.bufferBytes()...) // copy, so the expectation does not alias the encoder's buffer
+	wantBindCount := binds.Count()
+	if wantBindCount != 2 {
+		t.Fatalf("bind count = %d, want 2", wantBindCount)
+	}
+
+	srv := newQwpMockEgressServer(t, func(m *qwpMockEgressConn) {
+		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		defer cancel()
+		req := m.readBinary(ctx)
+		gotID, gotSQL, _ := parseQueryRequest(t, req)
+		if gotID != reqID {
+			t.Errorf("server saw requestId=%d, want %d", gotID, reqID)
+		}
+		if gotSQL != wantSQL {
+			t.Errorf("server saw sql=%q, want %q", gotSQL, wantSQL)
+		}
+		// The typed bind block is the tail of QUERY_REQUEST after the
+		// bind_count varint; verify it byte-for-byte against the client
+		// encoding.
+		if !strings.HasSuffix(string(req), string(wantBindPayload)) { // NOTE(review): checks the payload tail only; the bind_count value itself is not asserted by this handler
+			t.Errorf("QUERY_REQUEST missing expected %d-byte bind payload suffix", len(wantBindPayload))
+		}
+		m.sendBinary(ctx, buildOneRowInt64Batch(t, reqID, 0, "v", 777))
+		m.sendBinary(ctx, writeQwpFrame(0, buildResultEndBody(reqID, 0, 1)))
+	})
+	defer srv.Close()
+
+	tr := connectEgress(t, srv.URL)
+	defer tr.close()
+	io := newQwpEgressIO(tr, 2)
+	io.start()
+	defer shutdownIO(t, io)
+
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+	defer cancel()
+	if err := io.submitQuery(ctx, qwpRequest{
+		sql:         wantSQL,
+		requestId:   reqID,
+		bindCount:   wantBindCount,
+		bindPayload: wantBindPayload,
+	}); err != nil {
+		t.Fatalf("submitQuery: %v", err)
+	}
+
+	values := drainBatchesToEnd(t, io, 1)
+	if len(values) != 1 || values[0] != 777 {
+		t.Fatalf("batch values = %v, want [777]", values)
+	}
+}
diff --git a/qwp_race_off_test.go b/qwp_race_off_test.go
new file mode 100644
index 00000000..175ba31b
--- /dev/null
+++ b/qwp_race_off_test.go
@@ -0,0 +1,29 @@
+/*+*****************************************************************************
+ * ___ _ ____ ____
+ * / _ \ _ _ ___ ___| |_| _ \| __ )
+ * | | | | | | |/ _ \/ __| __| | | | _ \
+ * | |_| | |_| | __/\__ \ |_| |_| | |_) |
+ * \__\_\\__,_|\___||___/\__|____/|____/
+ *
+ * Copyright (c) 2014-2019 Appsicle
+ * Copyright (c) 2019-2026 QuestDB
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ ******************************************************************************/
+
+//go:build !race
+
+package questdb
+
+const raceEnabled = false // compiled only without the race detector (build tag !race)
diff --git a/qwp_race_on_test.go b/qwp_race_on_test.go
new file mode 100644
index 00000000..ffdeaeb8
--- /dev/null
+++ b/qwp_race_on_test.go
@@ -0,0 +1,29 @@
+/*+*****************************************************************************
+ * ___ _ ____ ____
+ * / _ \ _ _ ___ ___| |_| _ \| __ )
+ * | | | | | | |/ _ \/ __| __| | | | _ \
+ * | |_| | |_| | __/\__ \ |_| |_| | |_) |
+ * \__\_\\__,_|\___||___/\__|____/|____/
+ *
+ * Copyright (c) 2014-2019 Appsicle
+ * Copyright (c) 2019-2026 QuestDB
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ ******************************************************************************/
+
+//go:build race
+
+package questdb
+
+const raceEnabled = true // compiled only under the race detector (build tag race)
diff --git a/qwp_segment_cap_guard_test.go b/qwp_segment_cap_guard_test.go
new file mode 100644
index 00000000..e14e31f9
--- /dev/null
+++ b/qwp_segment_cap_guard_test.go
@@ -0,0 +1,224 @@
+/*+*****************************************************************************
+ * ___ _ ____ ____
+ * / _ \ _ _ ___ ___| |_| _ \| __ )
+ * | | | | | | |/ _ \/ __| __| | | | _ \
+ * | |_| | |_| | __/\__ \ |_| |_| | |_) |
+ * \__\_\\__,_|\___||___/\__|____/|____/
+ *
+ * Copyright (c) 2014-2019 Appsicle
+ * Copyright (c) 2019-2026 QuestDB
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ ******************************************************************************/
+
+package questdb
+
+import (
+	"context"
+	"fmt"
+	"strconv"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+)
+
+// TestQwpSegmentCapGuardDropsOversizeBatch is the regression test for
+// the self-wedging cursor sender on the irreducible single-table case: a
+// flush whose only table encodes to a frame larger than the per-segment
+// byte cap must be DROPPED with a typed error, not retained forever.
+//
+// The per-table split can rescue a multi-table batch that overruns the
+// cap only by aggregation (TestQwpSplitFlush* covers that), but a lone
+// table over the cap is irreducible: the segment cap never grows, so
+// re-encoding it on every subsequent Flush — and on Close — would fail
+// identically forever and lose the batch anyway. This pins the
+// recoverable behavior: the over-cap batch is dropped in place and the
+// sender stays usable. Segment-cap analogue of TestQwpFlushTimeGuardFires.
+func TestQwpSegmentCapGuardDropsOversizeBatch(t *testing.T) {
+	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{})
+	defer srv.Close()
+
+	// Memory-mode cursor with a 4096-byte segment and no auto-flush
+	// (autoFlushRows=0; the constructor wires autoFlushBytes=0 and
+	// maxBufSize=0). Every row stays pending until we explicitly Flush,
+	// so the whole batch lands in a single frame.
+	s, _, _, cleanup := newCursorSenderForTest(t, srv, 0)
+	defer cleanup()
+
+	// The SF test server advertises no X-QWP-Max-Batch-Size, so the
+	// server-cap guards are inert and the 4 KiB segment is the only
+	// binding limit.
+	require.Zero(t, s.serverMaxBatchSize.Load(),
+		"test precondition: no server cap, so the segment is the binding limit")
+
+	ctx := context.Background()
+
+	// ~20 KiB of column data — far past anything a 4 KiB segment can
+	// hold even after rotation into a fresh spare.
+	const rows = 100
+	big := strings.Repeat("x", 200) // 100 rows x 200 bytes of string payload = the "~20 KiB" above
+	for i := 0; i < rows; i++ {
+		require.NoError(t, s.Table("t").
+			StringColumn("s", big).
+			Int64Column("i", int64(i)).
+			AtNow(ctx), "row %d", i)
+	}
+	require.Equal(t, rows, s.pendingRowCount)
+
+	// Flush must surface a typed error AND drop the batch.
+	err := s.Flush(ctx)
+	require.Error(t, err, "an over-segment batch must surface an error") // NOTE(review): the "typed" part is only checked through the message substring below, not via errors.As
+	require.Contains(t, err.Error(), fmt.Sprintf("droppedRows=%d", rows),
+		"error must name the dropped row count")
+
+	// Keystone: the batch was DROPPED, not retained. Pre-fix this stays
+	// at `rows` and the sender is wedged.
+	require.Zero(t, s.pendingRowCount, "over-segment batch must be dropped, not retained")
+	require.Zero(t, s.pendingBytes, "pendingBytes must reset alongside the dropped batch")
+
+	// A second Flush is a clean no-op — proving the wedge is gone (pre-fix
+	// it re-failed identically forever).
+	require.NoError(t, s.Flush(ctx), "second Flush must be a clean no-op after the drop")
+
+	// The sender remains usable: a small batch flushes through the same
+	// 4 KiB segment without error.
+	require.NoError(t, s.Table("t").Int64Column("i", 1).AtNow(ctx))
+	require.NoError(t, s.Flush(ctx))
+	require.Zero(t, s.pendingRowCount)
+}
+
+// TestQwpSegmentCapGuardSurfacesOnClose pins the "data loss on Close"
+// half of the report: an over-segment batch left pending at Close must
+// be surfaced as a typed error (not silently lost) and Close must not
+// hang or re-fail forever. closeCursor drops the batch via the same
+// guard and returns the error as its first fault.
+func TestQwpSegmentCapGuardSurfacesOnClose(t *testing.T) {
+	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{})
+	defer srv.Close()
+
+	s, _, _, _ := newCursorSenderForTest(t, srv, 0) // NOTE(review): the cleanup func is discarded here while the sibling tests defer it — confirm Close alone releases everything cleanup would
+
+	ctx := context.Background()
+	big := strings.Repeat("x", 200)
+	for i := 0; i < 100; i++ {
+		require.NoError(t, s.Table("t").StringColumn("s", big).AtNow(ctx), "row %d", i)
+	}
+
+	// Close returns promptly with the drop error — nothing was ever
+	// published, so the drain wait is a no-op; the batch is dropped
+	// rather than retained-and-re-failed.
+	done := make(chan error, 1) // capacity 1 lets the Close goroutine finish its send even if the timeout branch wins
+	go func() { done <- s.Close(ctx) }()
+	select {
+	case err := <-done:
+		require.Error(t, err, "Close must surface the dropped batch, not swallow it")
+		require.Contains(t, err.Error(), "cursor segment")
+	case <-time.After(10 * time.Second):
+		t.Fatal("Close hung on an over-segment batch (wedge not fixed)")
+	}
+}
+
+// TestQwpSegmentClampKeepsTriggerBelowSegmentCap pins the no-wedge
+// invariant behind the byte-trigger clamp: when the configured
+// auto_flush_bytes exceeds what a segment can hold (the shipped-default
+// shape: 8 MiB trigger over a 4 MiB segment), the effective trigger is
+// clamped strictly below the segment frame cap, so the soft auto-flush
+// always fires before a batch can grow into the drop guard.
+func TestQwpSegmentClampKeepsTriggerBelowSegmentCap(t *testing.T) {
+	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{})
+	defer srv.Close()
+
+	s, _, _, cleanup := newCursorSenderForTest(t, srv, 0)
+	defer cleanup()
+
+	// Configure a byte trigger larger than the 4 KiB segment — the
+	// self-wedging shape — then re-seed the effective trigger (no server
+	// cap advertised, so only the segment clamp applies).
+	s.autoFlushBytes = 2 * 4096
+	s.applyServerBatchSizeLimit(nil)
+
+	require.Equal(t, int64(4096)-qwpSfHeaderSize-qwpSfFrameHeaderSize, s.maxFrameBytes)
+	require.Equal(t, s.maxFrameBytes*9/10, s.effectiveAutoFlushBytes.Load(),
+		"trigger must clamp to 90%% of the segment frame cap") // NOTE(review): testify prints a lone message string verbatim, so "%%" would show literally — confirm intended
+	require.Less(t, s.effectiveAutoFlushBytes.Load(), s.maxFrameBytes,
+		"clamped trigger must sit below the segment cap so auto-flush fires first")
+}
+
+// TestQwpMaxFrameBytesMatchesSegmentBoundary pins the no-drift
+// invariant the flush-time drop guard and the clamp both rely on:
+// engineMaxFrameBytes() is exactly the largest payload a fresh segment
+// accepts. A frame of that size fits; one byte more does not.
If the +// segment header layout ever changes without engineMaxFrameBytes +// tracking it, this fails loudly instead of silently re-opening the +// wedge. +func TestQwpMaxFrameBytesMatchesSegmentBoundary(t *testing.T) { + const segSize int64 = 4096 + maxFrame := segSize - qwpSfHeaderSize - qwpSfFrameHeaderSize + + eng, err := qwpSfNewCursorEngine("", segSize, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + defer func() { _ = eng.engineClose() }() + require.Equal(t, maxFrame, eng.engineMaxFrameBytes()) + + fits, err := qwpSfCreateInMemorySegment(0, segSize) + require.NoError(t, err) + defer func() { _ = fits.close() }() + _, err = fits.tryAppend(make([]byte, maxFrame)) + require.NoError(t, err, "a payload of exactly engineMaxFrameBytes must fit a fresh segment") + + overflows, err := qwpSfCreateInMemorySegment(0, segSize) + require.NoError(t, err) + defer func() { _ = overflows.close() }() + _, err = overflows.tryAppend(make([]byte, maxFrame+1)) + require.ErrorIs(t, err, qwpSfErrSegmentFull, "one byte past engineMaxFrameBytes must not fit") +} + +// TestQwpTooManyTablesDropsBatchInsteadOfWedging covers the other +// member of the retain-forever family the report named: a batch with +// more than qwpMaxTablesPerBatch (65535, the uint16 table-count limit) +// distinct tables can never be encoded, so it is dropped with a typed +// error instead of being retained and re-failing on every flush. +func TestQwpTooManyTablesDropsBatchInsteadOfWedging(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer srv.Close() + + s, _, _, cleanup := newCursorSenderForTest(t, srv, 0) + defer cleanup() + + ctx := context.Background() + // One row into each of (cap + 1) distinct tables — one past the + // uint16 table-count limit. + const tables = qwpMaxTablesPerBatch + 1 + for i := 0; i < tables; i++ { + require.NoError(t, s.Table("t"+strconv.Itoa(i)). + Int64Column("v", int64(i)). 
+ AtNow(ctx)) + } + require.Equal(t, tables, s.pendingRowCount) + + err := s.Flush(ctx) + require.Error(t, err) + require.Contains(t, err.Error(), "too many tables") + require.Contains(t, err.Error(), "droppedRows=") + + // Dropped, not wedged. + require.Zero(t, s.pendingRowCount) + + // Sender stays usable. + require.NoError(t, s.Table("ok").Int64Column("v", 1).AtNow(ctx)) + require.NoError(t, s.Flush(ctx)) + require.Zero(t, s.pendingRowCount) +} diff --git a/qwp_sender.go b/qwp_sender.go index b4b60b89..eeb999c9 100644 --- a/qwp_sender.go +++ b/qwp_sender.go @@ -30,6 +30,7 @@ import ( "fmt" "io" "math/big" + "sync/atomic" "time" ) @@ -98,66 +99,174 @@ type QwpSender interface { // row: mixing At and AtNano on rows of the same table within one // flush returns a type-conflict error. AtNano(ctx context.Context, ts time.Time) error + + // AckedFsn returns the highest server-acknowledged frame + // sequence number, or -1 if no batch has been ACK'd yet. + // Snapshot accessor — for a bounded wait, use AwaitAckedFsn. + AckedFsn() int64 + + // AwaitAckedFsn blocks until AckedFsn() >= target, ctx is + // cancelled / deadlines, or the I/O loop latches a terminal + // error. Returns nil on success; ctx.Err() on cancellation / + // deadline; *SenderError on a terminal server rejection. + // + // Useful for tests and user code that need to confirm a specific + // publish has been server-acknowledged. Wrap with + // context.WithTimeout for a bounded wait. Pair AwaitAckedFsn with + // the FSN returned by FlushAndGetSequence — none of the flush + // paths (explicit Flush, FlushAndGetSequence, auto-flush) wait + // for ACK, so AwaitAckedFsn is the only API that blocks on server + // acknowledgement. + AwaitAckedFsn(ctx context.Context, target int64) error + + // FlushAndGetSequence behaves identically to Flush but returns + // the published FSN (highest committed-to-disk-and-queued-for- + // wire frame sequence) post-flush. 
Distinct from AckedFsn(), + // which is the highest *server-acknowledged* sequence — the + // returned FSN is the upper bound of any SenderError.ToFsn that + // could surface for this batch. Use AwaitAckedFsn for ack + // confirmation. + FlushAndGetSequence(ctx context.Context) (int64, error) + + // LastTerminalError returns a snapshot of the most recent + // terminal SenderError the I/O loop latched (server rejection, + // WS protocol violation, auth failure, reconnect-budget + // exhaustion). Returns nil if the sender has not gone terminal + // yet, or if it failed for a non-server reason (transport + // error before classification). + LastTerminalError() *SenderError + + // TotalServerErrors returns the cumulative count of SenderError + // payloads the I/O loop has built (DROP and HALT combined). + // Includes batches where the user handler dropped the + // notification due to inbox overflow. + TotalServerErrors() int64 + + // DroppedErrorNotifications returns the cumulative count of + // SenderError payloads that did not reach the user-supplied + // handler because the bounded inbox was full at offer time. + // Non-zero means the handler is too slow for the error rate; + // raise WithErrorInboxCapacity or speed up the handler. + DroppedErrorNotifications() int64 + + // TotalErrorNotificationsDelivered returns the cumulative count + // of SenderError payloads delivered to the user-supplied + // handler. Includes deliveries where the handler panicked + // (caught by the dispatcher). + TotalErrorNotificationsDelivered() int64 + + // TotalReconnectAttempts returns the cumulative count of + // reconnect attempts the I/O loop has issued — succeeded plus + // failed. Diverges from TotalReconnectsSucceeded when the server + // is flapping. Always 0 when the sender is configured without + // reconnect. + TotalReconnectAttempts() int64 + + // TotalReconnectsSucceeded returns the cumulative count of + // successful reconnects. 
Useful as a heartbeat for outage + // recovery. + TotalReconnectsSucceeded() int64 + + // TotalFramesReplayed returns the cumulative count of frames + // re-emitted on a post-reconnect catch-up — i.e. frames whose + // FSN was already on the wire before the drop. Useful for + // verifying replay actually re-issued the unacked tail. + TotalFramesReplayed() int64 + + // TotalBackpressureStalls returns the cumulative count of times + // engineAppendBlocking had to wait for the manager to free + // buffer space. One increment per blocking call, not per spin- + // park. Non-zero values mean the producer is outpacing the wire. + TotalBackpressureStalls() int64 + + // BackgroundDrainers returns a snapshot of the drainers the + // foreground sender has dispatched for orphan slot adoption. + // Returns nil when the sender was not configured with + // drain_orphans (or when no orphans were found at startup). + // Snapshots are point-in-time copies; the underlying drainer + // goroutines keep running. + BackgroundDrainers() []QwpBackgroundDrainer +} + +// QwpBackgroundDrainer is a point-in-time snapshot of one +// background-drainer goroutine, surfaced via +// QwpSender.BackgroundDrainers for ops dashboards. The fields +// mirror the Java client's BackgroundDrainer accessors. +type QwpBackgroundDrainer struct { + // Dir is the absolute path of the orphan slot directory the + // drainer adopted. + Dir string + // FramesPending is the snapshot of the slot's published FSN + // the drainer captured at startup — the upper bound the drain + // must reach before the slot is fully empty. -1 before the + // drainer has opened its engine. + FramesPending int64 + // FramesAcked is the latest server-acknowledged FSN the + // drainer has observed. -1 before the drainer's first poll. + FramesAcked int64 + // LastError is the most recent error message the drainer + // recorded, or "" if no error has been recorded. 
+ LastError string + // Failed is true if the drainer ended in the FAILED outcome + // (exhausted reconnect budget, auth failure, recovery error) + // and dropped a .failed sentinel in the slot. + Failed bool } // Compile-time check that qwpLineSender implements QwpSender. var _ QwpSender = (*qwpLineSender)(nil) -// qwpLineSender implements LineSender for the QWP WebSocket protocol. -// In sync mode (in-flight window = 1), each Flush() encodes and -// sends one batch at a time, blocking until the server ACKs. +// qwpLineSender implements LineSender for the QWP WebSocket +// protocol. All wire I/O goes through the cursor engine + send +// loop, regardless of whether store-and-forward (sf_dir) is set — +// sf_dir picks disk-backed segments, the empty value picks +// memory-backed segments. The producer encodes a batch into the +// engine; the I/O goroutine pair drains the engine to the wire and +// processes ACKs. type qwpLineSender struct { - // transport manages the WebSocket connection. - transport qwpTransport - // tableBuffers stores one columnar buffer per active table. tableBuffers map[string]*qwpTableBuffer // currentTable is the table buffer for the current in-progress row. currentTable *qwpTableBuffer - // encoders provides double-buffered QWP message encoders for async - // mode. In sync mode, only encoders[0] is used. In async mode, the - // two encoders alternate: while one encoder's output is being sent - // over the wire, the other can encode the next batch. - encoders [2]qwpEncoder - currentEncoderIdx int - // encoderReady signals when an encoder's buffer is safe to reuse. - // A token is placed after sendMessage completes for that buffer. - // In sync mode, these are nil (not used). - encoderReady [2]chan struct{} - - // encodeInfoBuf is a reusable scratch slice for buildTableEncodeInfo, - // avoiding allocation on every flush. - encodeInfoBuf []qwpTableEncodeInfo + // encoder builds the next QWP message. 
The cursor engine takes + // a copy of the encoded bytes via tryAppend, so a single slot + // is enough — no double-buffering needed. + encoder qwpEncoder + + // encodeInfoBuf is a reusable scratch slice for + // buildTableEncodeInfo, avoiding allocation on every flush. + encodeInfoBuf []*qwpTableBuffer + + // dirtyTables lists the table buffers selected by Table() since the + // last full flush — i.e. the only buffers that can hold pending + // rows. buildTableEncodeInfo, resetAfterFlush, and + // recomputePendingFromBuffers iterate this set instead of the whole + // tableBuffers map, so a sender juggling hundreds of tables pays per + // flush only for the handful actually written that cycle. Truncated + // to [:0] (capacity retained) by resetAfterFlush. Producer-owned, + // like tableBuffers and encodeInfoBuf. + dirtyTables []*qwpTableBuffer // globalSymbols maps symbol strings to global IDs. globalSymbols map[string]int32 // globalSymbolList maps IDs to symbol strings (for delta dict). globalSymbolList []string - // maxSentSymbolId is the highest symbol ID ACKed by the server. - // -1 means no symbols have been sent yet. + // maxSentSymbolId is the highest symbol ID included in a frame + // appended to the cursor engine — advanced at append time, not on + // server ACK. It is the cross-flush high-water mark that + // resetAfterFlush rewinds batchMaxSymbolId to. -1 means no symbols + // appended yet. maxSentSymbolId int // batchMaxSymbolId is the highest symbol ID used in the current batch. batchMaxSymbolId int - // Schema registry (per QWP spec §16). - // Schema IDs are small integers assigned sequentially by the - // client and scoped to the connection lifetime. They are global - // across all tables; the server indexes its registry by ID. - // nextSchemaId is the next unassigned ID. - // maxSentSchemaId is the highest ID ACKed by the server; a table - // whose schemaId <= maxSentSchemaId is safe to encode in - // reference mode. 
- // batchMaxSchemaId is the highest schemaId used in the pending - // batch — set by buildTableEncodeInfo, promoted to - // maxSentSchemaId on ACK. - nextSchemaId int - maxSentSchemaId int - batchMaxSchemaId int - // maxSchemasPerConnection caps nextSchemaId. 0 disables the cap. - // When the cap is hit, Flush returns an error and the caller must - // close and re-open the sender. - maxSchemasPerConnection int + // Schemas are intentionally NOT tracked on the cursor wire path. + // Every frame is self-sufficient: it carries the full inline column + // definitions and the full symbol dict from id 0. There is no + // per-connection schema registry on the client side and no + // schema-change detection; the server reads the inline column + // definitions on every frame regardless. // Row state. hasTable bool @@ -185,8 +294,44 @@ type qwpLineSender struct { autoFlushRows int autoFlushInterval time.Duration autoFlushBytes int // 0 disables the byte-size trigger - flushDeadline time.Time - pendingRowCount int + // effectiveAutoFlushBytes is the per-connection clamped variant + // of autoFlushBytes. Computed from the server-advertised + // X-QWP-Max-Batch-Size on every successful connect / reconnect + // via the send loop's onTransportSwap callback: + // - autoFlushBytes <= 0 (user opted out): store 0 + // - server cap <= 0 (header absent / older): store autoFlushBytes + // - otherwise: store min(autoFlushBytes, cap*9/10) + // Read by the producer in atWithTimestamp to drive the byte-size + // auto-flush trigger; atomic so a reconnect from the I/O + // goroutine cannot race the producer's per-row trigger check. + // Initialised to autoFlushBytes in the constructors so the + // trigger fires correctly even before the first transport-swap + // callback runs. 
+ effectiveAutoFlushBytes atomic.Int64 + // serverMaxBatchSize mirrors the just-bound transport's + // serverMaxBatchSize so the producer can apply the per-row hard + // guard (atWithTimestamp) and the flush-time defensive cap + // check (enqueueCursor) without dereferencing the loop's + // transport pointer on every call. Updated together with + // effectiveAutoFlushBytes from applyServerBatchSizeLimit; 0 + // means "no cap advertised" and both guards short-circuit. + // Mirrors Java's volatile-int serverMaxBatchSize field. + serverMaxBatchSize atomic.Int32 + // maxFrameBytes is the largest encoded frame the cursor engine's + // segments can hold (segment size minus header overhead, from + // engineMaxFrameBytes). A frame above this can never be appended, + // so it bounds two things, exactly like serverMaxBatchSize: + // - the effectiveAutoFlushBytes clamp, so the soft byte trigger + // fires before a batch can grow past what a segment holds; and + // - the flush-time hard guard in enqueueCursor, which drops an + // over-cap frame with a typed error instead of retaining it and + // re-failing forever. + // Constant for the sender's lifetime (the segment size never + // changes). 0 in the bench / hand-built test senders that have no + // engine, where both uses short-circuit. + maxFrameBytes int64 + flushDeadline time.Time + pendingRowCount int // pendingBytes tracks the approximate buffered byte total across // all table buffers. Maintained incrementally on each commitRow: @@ -204,32 +349,63 @@ type qwpLineSender struct { // Maximum length for table and column names. fileNameLimit int - // Connection and retry config. - retryTimeout time.Duration - - // syncSequence is the sequence of the next batch to send in sync - // mode (inFlightWindow == 1). First batch is 0. Incremented after - // each successful send so flushSync can recognise its own ACK and - // ignore stale ACKs for earlier batches on the same connection. 
- syncSequence int64 - - // Async mode (in-flight window > 1). - asyncState *qwpAsyncState + // inFlightWindow is retained as a config knob for backwards + // compat but is a no-op in cursor mode — the engine handles + // concurrency via its own backpressure model. inFlightWindow int - // closeTimeout is the time Close() waits for the async I/O - // goroutine to finish before force-cancelling. Defaults to 5s. + // cursorEngine + cursorSendLoop are set on every sender. The + // engine is memory-backed when sf_dir is empty and disk-backed + // otherwise. The send loop owns the WebSocket connection; + // reconnect is its responsibility. + cursorEngine *qwpSfCursorEngine + cursorSendLoop *qwpSfSendLoop + + // closeTimeout bounds Close()'s wait for the engine's + // ackedFsn to catch up to publishedFsn. <= 0 means fast close + // (skip the drain). Defaults to 5s. closeTimeout time.Duration - // Lifecycle. - closed bool + // drainerPool is non-nil only when the user opted into + // drain_orphans (SF mode only). Closed alongside the cursor + // engine in closeCursor. + drainerPool *qwpSfDrainerPool + + // Lifecycle. atomic so a contract-violating concurrent + // double-Close has a defined (idempotent) outcome rather than a + // data race that could double-close the engine's channels. The + // single-producer At/Flush reads are racy only under the same + // contract violation; the atomic load keeps them well-defined too. + closed atomic.Bool +} + +// newQwpLineSender creates a new QWP sender backed by an +// in-memory cursor engine. The send loop establishes the +// WebSocket connection synchronously; on failure, the constructor +// returns the dial / upgrade error directly. inFlightWindow is +// accepted for backwards compatibility but is a no-op (the cursor +// engine handles concurrency via its own backpressure model). 
If +// dumpWriter is non-nil, outgoing bytes are recorded across every +// transport instance the send loop creates (initial connect plus +// reconnects). +func newQwpLineSender(ctx context.Context, address string, opts qwpTransportOpts, autoFlushRows int, autoFlushInterval time.Duration, dumpWriter io.Writer, inFlightWindow ...int) (*qwpLineSender, error) { + s, err := newQwpLineSenderUnstarted(ctx, address, opts, + autoFlushRows, autoFlushInterval, dumpWriter, inFlightWindow...) + if err != nil { + return nil, err + } + s.cursorSendLoop.sendLoopStart() + return s, nil } -// newQwpLineSender creates a new QWP sender and establishes a -// WebSocket connection to the server. If inFlightWindow > 1, async -// mode is enabled with a dedicated I/O goroutine. If dumpWriter is -// non-nil, outgoing TCP bytes are recorded (see WithQwpDumpWriter). -func newQwpLineSender(ctx context.Context, address string, opts qwpTransportOpts, retryTimeout time.Duration, autoFlushRows int, autoFlushInterval time.Duration, dumpWriter io.Writer, inFlightWindow ...int) (*qwpLineSender, error) { +// newQwpLineSenderUnstarted builds the sender, engine, and loop but +// does NOT call sendLoopStart. Used by newQwpLineSenderFromConf so the +// resolver / handler / capacity from connect-string + builder options +// can be applied to the loop before it starts processing — otherwise +// the very first received frame races against the post-construction +// setters and could be classified with the default resolver / handled +// by the default handler instead of the user-configured ones. 
+func newQwpLineSenderUnstarted(ctx context.Context, address string, opts qwpTransportOpts, autoFlushRows int, autoFlushInterval time.Duration, dumpWriter io.Writer, inFlightWindow ...int) (*qwpLineSender, error) { window := 1 if len(inFlightWindow) > 0 && inFlightWindow[0] > 1 { window = inFlightWindow[0] @@ -240,44 +416,37 @@ func newQwpLineSender(ctx context.Context, address string, opts qwpTransportOpts globalSymbols: make(map[string]int32), maxSentSymbolId: -1, batchMaxSymbolId: -1, - nextSchemaId: 0, - maxSentSchemaId: -1, - batchMaxSchemaId: -1, - retryTimeout: retryTimeout, autoFlushRows: autoFlushRows, autoFlushInterval: autoFlushInterval, inFlightWindow: window, closeTimeout: 5 * time.Second, } - // Initial encoder buffer capacity. Sync mode uses the small 8 KB - // default. Async mode uses 1 MB: the user goroutine fills it while the - // I/O goroutine transmits the other one. The size can be further grown - // by newQwpLineSenderFromConf when autoFlushBytes is large enough to need - // max(1 MB, 2*autoFlushBytes). - initEncoderCap := qwpDefaultInitEncoderBufSize - if window > 1 { - initEncoderCap = qwpDefaultMicrobatchBufSize - } - s.encoders[0].wb.preallocate(initEncoderCap) - s.encoders[1].wb.preallocate(initEncoderCap) + s.encoder.wb.preallocate(qwpDefaultMicrobatchBufSize) - s.transport.dumpWriter = dumpWriter - if err := s.transport.connect(ctx, address, opts); err != nil { + // Build a memory-backed cursor engine. Same architecture as SF + // mode, just no disk involvement. + engine, err := qwpSfNewCursorEngine("", qwpSfDefaultMaxBytes, qwpSfDefaultMemoryMaxTotalBytes, qwpSfEngineDefaultAppendDeadline) + if err != nil { return nil, err } - - // Start async I/O goroutine if window > 1. - if window > 1 { - s.asyncState = newQwpAsyncState(window, &s.transport) - s.asyncState.start() - // Initialize double-buffered encoder ready channels. - // Both start with a token (both encoders available). 
- s.encoderReady[0] = make(chan struct{}, 1) - s.encoderReady[1] = make(chan struct{}, 1) - s.encoderReady[0] <- struct{}{} - s.encoderReady[1] <- struct{}{} + factory := qwpSfBuildReconnectFactory(address, opts, dumpWriter) + transport, err := factory(ctx, 0) + if err != nil { + _ = engine.engineClose() + return nil, err } - + loop := qwpSfNewSendLoop(engine, transport, factory, + qwpSfDefaultParkInterval, + qwpSfDefaultReconnectMaxDuration, + qwpSfDefaultReconnectInitialBackoff, + qwpSfDefaultReconnectMaxBackoff) + engine.engineSetReconnectStatusGetter(loop.sendLoopReconnectStatus) + s.cursorEngine = engine + s.cursorSendLoop = loop + // The memory-mode segment is the fixed qwpSfDefaultMaxBytes; record + // the largest frame it can hold so the byte-trigger clamp and the + // flush-time drop guard bound batches to it. + s.maxFrameBytes = engine.engineMaxFrameBytes() return s, nil } @@ -287,6 +456,19 @@ func (s *qwpLineSender) Table(name string) LineSender { if s.lastErr != nil { return s } + // Poll the I/O loop's terminal latch at the start of a new row so a + // HALT surfaces on the next At/AtNow without forcing the user to + // Flush first. Subsequent Symbol/*Column calls short-circuit on the + // latched s.lastErr, preserving the fluent buffer-latch pattern. + // The nil guard matches the accessor pattern in qwp_sender_cursor.go + // and keeps the bench harness (which hand-builds a sender without + // an I/O loop) working. 
+ if s.cursorSendLoop != nil { + if err := s.cursorSendLoop.sendLoopCheckError(); err != nil { + s.lastErr = err + return s + } + } if s.hasTable { s.lastErr = fmt.Errorf("qwp: table %q already set; call At() or AtNow() to finalize the row first", s.currentTable.tableName) return s @@ -311,9 +493,28 @@ func (s *qwpLineSender) Table(name string) LineSender { } s.currentTable = tb - if s.maxBufSize > 0 || s.autoFlushBytes > 0 { - s.currentTableBytesBefore = tb.approxDataSize() - } + // Track this table in the dirty set the first time it is selected + // in a flush cycle, so flush + reset visit only written tables. The + // dirty flag dedupes: repeated Table() calls for the same table (or + // the lastTable fast-path above) skip the append. resetAfterFlush + // clears the flag and empties the list. + if !tb.dirty { + tb.dirty = true + s.dirtyTables = append(s.dirtyTables, tb) + } + // Snapshot the table's buffered-byte count at row-start so both + // the auto-flush byte-size trigger (post-commit pendingBytes + // delta) and the per-row hard guard (pre-commit rowBytes delta + // vs serverMaxBatchSize) can read it. Always snapshot — the + // async-initial-connect path may flip serverMaxBatchSize from 0 + // to positive between Table() and At(), and a gated snapshot + // would leave currentTableBytesBefore stale (carrying over from + // a previous row, or 0 if never set) so the per-row guard reads + // (current size - 0) as the row's bytes and falsely rejects a + // valid row whose true delta fits. approxDataSize is O(1) and + // the int assignment doesn't allocate, so unconditional snapshot + // preserves the zero-alloc hot path. + s.currentTableBytesBefore = tb.approxDataSize() s.hasTable = true return s } @@ -629,16 +830,15 @@ func (s *qwpLineSender) Float64Array2DColumn(name string, values [][]float64) Li s.lastErr = err return s } - // Flatten. 
- flat := make([]float64, 0, dim0*dim1) + // Validate row regularity before reserving so the streamed write + // fills exactly the reserved payload — no intermediate flat copy. for _, row := range values { if len(row) != dim1 { s.lastErr = fmt.Errorf("qwp: irregular 2D array: row lengths differ") return s } - flat = append(flat, row...) } - col.addDoubleArray(2, []int32{int32(dim0), int32(dim1)}, flat) + col.addDoubleArray2D(dim0, dim1, values) return s } @@ -674,7 +874,8 @@ func (s *qwpLineSender) Float64Array3DColumn(name string, values [][][]float64) s.lastErr = err return s } - flat := make([]float64, 0, dim0*dim1*dim2) + // Validate shape regularity before reserving so the streamed write + // fills exactly the reserved payload — no intermediate flat copy. for _, plane := range values { if len(plane) != dim1 { s.lastErr = fmt.Errorf("qwp: irregular 3D array") @@ -685,10 +886,9 @@ func (s *qwpLineSender) Float64Array3DColumn(name string, values [][][]float64) s.lastErr = fmt.Errorf("qwp: irregular 3D array") return s } - flat = append(flat, row...) } } - col.addDoubleArray(3, []int32{int32(dim0), int32(dim1), int32(dim2)}, flat) + col.addDoubleArray3D(dim0, dim1, dim2, values) return s } @@ -751,7 +951,7 @@ func (s *qwpLineSender) AtNano(ctx context.Context, ts time.Time) error { // determines the unit used to convert ts: qwpTypeTimestamp → micros, // qwpTypeTimestampNano → nanos. func (s *qwpLineSender) atWithTimestamp(ctx context.Context, ts time.Time, typeCode qwpTypeCode) error { - if s.closed { + if s.closed.Load() { return errClosedSenderAt } @@ -761,6 +961,13 @@ func (s *qwpLineSender) atWithTimestamp(ctx context.Context, ts time.Time, typeC if s.currentTable != nil { s.currentTable.cancelRow() } + // cancelRow may have wiped the designated-TS column out + // of tb.columns / tb.columnIndex (first row of a fresh + // or just-flushed table). 
Drop the cache so the next row + // re-runs getOrCreateDesignatedTimestamp and re-inserts + // the column — otherwise the stale pointer satisfies the + // staleness check and the row commits without a "" column. + s.cachedDesignatedTs = nil s.hasTable = false s.currentTable = nil return err @@ -780,6 +987,7 @@ func (s *qwpLineSender) atWithTimestamp(ctx context.Context, ts time.Time, typeC col, err = s.currentTable.getOrCreateDesignatedTimestamp(typeCode) if err != nil { s.currentTable.cancelRow() + s.cachedDesignatedTs = nil s.hasTable = false s.currentTable = nil return err @@ -793,11 +1001,40 @@ func (s *qwpLineSender) atWithTimestamp(ctx context.Context, ts time.Time, typeC case qwpTypeTimestampNano: v = ts.UnixNano() default: + s.currentTable.cancelRow() + s.cachedDesignatedTs = nil + s.hasTable = false + s.currentTable = nil return fmt.Errorf("qwp: invalid designated timestamp type 0x%02X", typeCode) } col.addTimestamp(v) } + // Per-row hard guard: if THIS row's buffered bytes already + // exceed the server's wire cap, the flush would produce an + // oversize WS frame the server closes with ws-close[1009]. + // Catches the case where a single row is too big to ever ship, + // so the user sees a clear error instead of a delayed + // terminal-error from a downstream auto-flush. Checked BEFORE + // commitRow so the buffered column bytes can be discarded via + // cancelRow — prior committed rows in the batch stay intact + // and can still be flushed by the caller. The check ignores + // the null-padding bytes commitRow will add (bounded by + // numColumns * elemSize, far below any realistic cap). + // Mirrors Java QwpWebSocketSender.sendRow's pre-nextRow guard. 
+ if cap := s.serverMaxBatchSize.Load(); cap > 0 { + rowBytes := s.currentTable.approxDataSize() - s.currentTableBytesBefore + if int64(rowBytes) > int64(cap) { + s.currentTable.cancelRow() + s.cachedDesignatedTs = nil + s.hasTable = false + s.currentTable = nil + return fmt.Errorf( + "qwp: row too large for server batch cap [rowBytes=%d, serverMaxBatchSize=%d]", + rowBytes, cap) + } + } + // Commit the row (gap-fills missing columns). s.currentTable.commitRow() @@ -814,350 +1051,218 @@ func (s *qwpLineSender) atWithTimestamp(ctx context.Context, ts time.Time, typeC s.pendingRowCount++ if s.maxBufSize > 0 || s.autoFlushBytes > 0 { + // The byte-size trigger compares against effectiveAutoFlushBytes, + // not the raw configured autoFlushBytes: the send loop's + // onTransportSwap callback clamps the threshold down to 90% + // of the server-advertised X-QWP-Max-Batch-Size on every + // connect, so the soft auto-flush fires before the encoded + // batch can exceed the server's hard cap. effectiveAutoFlushBytes + // is seeded from autoFlushBytes in the constructor; it is + // always > 0 iff the user opted in, regardless of whether a + // transport-swap callback has fired yet, so the gate on + // s.autoFlushBytes > 0 above stays sound. + effective := int(s.effectiveAutoFlushBytes.Load()) triggered := (s.maxBufSize > 0 && s.pendingBytes > s.maxBufSize) || - (s.autoFlushBytes > 0 && s.pendingBytes >= s.autoFlushBytes) + (effective > 0 && s.pendingBytes >= effective) if triggered { - if s.asyncState != nil { - return s.enqueueFlush(ctx) - } - return s.Flush(ctx) + return s.autoFlush(ctx) } } - // Check auto-flush thresholds. + // Auto-flush thresholds use enqueueCursor — never wait for + // server ACKs from the user goroutine. Explicit Flush() follows + // the same publish-only path; the send loop drains and replays + // in the background. 
if s.autoFlushRows > 0 && s.pendingRowCount >= s.autoFlushRows { - // In async mode, enqueue without waiting for ACKs so the - // user goroutine isn't blocked on every auto-flush. - if s.asyncState != nil { - return s.enqueueFlush(ctx) - } - return s.Flush(ctx) + return s.autoFlush(ctx) } if s.autoFlushInterval > 0 { if s.flushDeadline.IsZero() { s.flushDeadline = time.Now().Add(s.autoFlushInterval) } else if time.Now().After(s.flushDeadline) { - if s.asyncState != nil { - return s.enqueueFlush(ctx) - } - return s.Flush(ctx) + return s.autoFlush(ctx) } } return nil } -func (s *qwpLineSender) AtNow(ctx context.Context) error { - return s.At(ctx, time.Time{}) -} - -// --- LineSender interface: Flush --- - -func (s *qwpLineSender) Flush(ctx context.Context) error { - if s.closed { - return errClosedSenderFlush - } - if s.hasTable { - return errFlushWithPendingMessage - } - if s.pendingRowCount == 0 { - // In async mode, wait for any in-flight batches from - // previous auto-flushes to complete. This lets the user - // call Flush() as a barrier to confirm all data was ACKed. - if s.asyncState != nil { - return s.asyncState.waitEmpty(ctx) - } - return nil - } - - defer s.resetAfterFlush() - - if s.asyncState != nil { - return s.flushAsync(ctx) - } - return s.flushSync(ctx) -} - -// flushSync encodes all non-empty tables into a single multi-table -// QWP message, sends it, and reads ACKs until one whose sequence is -// at least the just-sent batch's sequence arrives. Earlier sequences -// are absorbed the way the Java client does in waitForAck — a defensive -// measure against stale ACKs that can otherwise be mistaken for a -// response to the wrong batch. -func (s *qwpLineSender) flushSync(ctx context.Context) error { - tables, err := s.buildTableEncodeInfo() - if err != nil { +// autoFlush dispatches an auto-flush trigger from atWithTimestamp. +// Resets pending state on success so subsequent rows hit a clean +// trigger window. Errors propagate to the user. 
+func (s *qwpLineSender) autoFlush(ctx context.Context) error { + if err := s.enqueueCursor(ctx); err != nil { return err } - if len(tables) == 0 { - return nil - } - - msg := s.encoders[0].encodeMultiTableWithDeltaDict( - tables, - s.globalSymbolList, - s.maxSentSymbolId, - s.batchMaxSymbolId, - ) - if err := s.transport.sendMessage(ctx, msg); err != nil { - return err - } - expected := s.syncSequence - s.syncSequence++ - - for { - status, data, err := s.transport.readAck(ctx) - if err != nil { - return err - } - seq := parseAckSequence(data) - if status != qwpStatusOK { - qErr := newQwpErrorFromAck(data) - if qErr == nil { - qErr = &QwpError{Status: status, Sequence: seq, Message: "unknown error"} - } - return qErr - } - if seq >= expected { - break - } - // Stale ACK for an earlier batch on this connection — absorb - // and keep reading. Matches Java's waitForAck. - } - - // Advance ACKed state: all schema IDs in this batch are now - // known to the server; bump the highest-ACKed symbol ID too. - if s.batchMaxSchemaId > s.maxSentSchemaId { - s.maxSentSchemaId = s.batchMaxSchemaId - } - if s.batchMaxSymbolId > s.maxSentSymbolId { - s.maxSentSymbolId = s.batchMaxSymbolId - } - + s.resetAfterFlush() return nil } -// buildTableEncodeInfo collects non-empty tables, assigning fresh -// schema IDs to any that lack one and selecting full or reference -// mode based on whether the ID has already been ACKed by the -// server. Reuses s.encodeInfoBuf to avoid allocating per flush. -// Also sets s.batchMaxSchemaId to the highest schema ID in the batch. -// Returns an error if assigning a new schema ID would exceed -// maxSchemasPerConnection (when > 0). -func (s *qwpLineSender) buildTableEncodeInfo() ([]qwpTableEncodeInfo, error) { - s.encodeInfoBuf = s.encodeInfoBuf[:0] - batchMax := s.maxSentSchemaId - for _, tb := range s.tableBuffers { - if tb.rowCount == 0 { - continue - } - // QWP wire format encodes table count as uint16. 
- if len(s.encodeInfoBuf) == qwpMaxTablesPerBatch { - return nil, fmt.Errorf( - "qwp: too many tables in one batch: exceeded %d", - qwpMaxTablesPerBatch, - ) - } - if tb.schemaId < 0 { - if s.maxSchemasPerConnection > 0 && s.nextSchemaId >= s.maxSchemasPerConnection { - return nil, fmt.Errorf( - "qwp: schema registry exhausted (limit %d); close and re-open the sender to reset", - s.maxSchemasPerConnection, - ) - } - tb.schemaId = s.nextSchemaId - s.nextSchemaId++ - } - mode := qwpSchemaModeFull - if tb.schemaId <= s.maxSentSchemaId { - mode = qwpSchemaModeReference - } - if tb.schemaId > batchMax { - batchMax = tb.schemaId - } - s.encodeInfoBuf = append(s.encodeInfoBuf, qwpTableEncodeInfo{ - tb: tb, - schemaMode: mode, - schemaId: tb.schemaId, - }) - } - s.batchMaxSchemaId = batchMax - return s.encodeInfoBuf, nil -} - -// flushAsync encodes all tables into a single multi-table message, -// acquires a slot, enqueues the batch, and waits for all in-flight -// batches to drain before returning. Used by the public Flush() in -// async mode. +// applyServerBatchSizeLimit refreshes effectiveAutoFlushBytes and +// serverMaxBatchSize from the cap the just-bound transport advertised +// in X-QWP-Max-Batch-Size. Registered as the send loop's +// onTransportSwap callback, so it runs after every successful connect +// — initial bind and every reconnect. A rolling upgrade can leave +// neighbouring endpoints with different caps, so the clamp is +// re-evaluated on every swap; never increase past the configured +// autoFlushBytes, never override an explicit opt-out. // -// Matches the Java client's flushPendingRows() + awaitPendingAcks(): -// schema and symbol IDs are advanced immediately after a successful -// enqueue, not after the ACK. If a later batch fails, the I/O -// goroutine stores the error into asyncState.ioErr; every subsequent -// user-facing call hits checkError() at the top of the flush path -// and returns the error. 
Stale cache state can therefore never -// reach the wire on a live connection. -func (s *qwpLineSender) flushAsync(ctx context.Context) error { - // Check for I/O errors before encoding. - if err := s.asyncState.checkError(); err != nil { - return err - } - - tables, err := s.buildTableEncodeInfo() - if err != nil { - return err - } - if len(tables) == 0 { - return nil - } - - // Wait for the current encoder to be available (double-buffered). - // Honour ctx here too: if the I/O goroutine is stuck in sendMessage, - // the previous batch's readySignal never fires and an unguarded - // receive would silently extend the user's Flush deadline. - encIdx := s.currentEncoderIdx - select { - case <-s.encoderReady[encIdx]: - case <-ctx.Done(): - return ctx.Err() - } - - // Encode all tables into a single multi-table message. - encoded := s.encoders[encIdx].encodeMultiTableWithDeltaDict( - tables, - s.globalSymbolList, - s.maxSentSymbolId, - s.batchMaxSymbolId, - ) - - // Acquire a slot in the in-flight window. - if err := s.asyncState.acquireSlot(ctx); err != nil { - // Return the encoder token since we won't enqueue. - s.encoderReady[encIdx] <- struct{}{} - return err - } - - // Enqueue the batch with the encoder's ready signal. - // No copy needed — the ioLoop signals encoderReady after - // sendMessage, at which point the buffer is safe to reuse. - batch := qwpAsyncBatch{ - data: encoded, - readySignal: s.encoderReady[encIdx], - } - select { - case s.asyncState.sendCh <- batch: - case <-ctx.Done(): - s.encoderReady[encIdx] <- struct{}{} - s.asyncState.releaseSlot() - return ctx.Err() +// Resolution (mirrors Java QwpWebSocketSender.applyServerBatchSizeLimit): +// - s.autoFlushBytes <= 0: the user disabled byte-size auto-flush; +// keep it disabled even when the server advertises a cap. +// - transport == nil OR cap <= 0: the server did not advertise a +// cap (older build or async-pending initial connect); the +// configured autoFlushBytes is kept verbatim. 
+// - otherwise: store min(autoFlushBytes, cap*9/10). The 10% +// headroom covers schema + dict-delta encoding overhead the +// soft trigger does not see — without it, an at-the-limit +// auto-flush could still emit a frame the server closes with +// ws-close[1009]. +// +// Also mirrors the raw cap onto s.serverMaxBatchSize so the per-row +// hard guard in atWithTimestamp and the flush-time defensive cap +// check in enqueueCursor can sample it cheaply without dereferencing +// the loop's transport pointer. Always-update (independent of the +// opt-out branch) so the guards fire against the freshly-advertised +// value even when the user opted out of the soft trigger. +// +// Safe to call from any goroutine: atomic stores on both fields. +// Cheap; no allocations. +func (s *qwpLineSender) applyServerBatchSizeLimit(t *qwpTransport) { + var cap int32 + if t != nil { + cap = t.serverMaxBatchSize + } + s.serverMaxBatchSize.Store(cap) + if s.autoFlushBytes <= 0 { + s.effectiveAutoFlushBytes.Store(0) + return } - - // Swap to the other encoder for the next flush. - s.currentEncoderIdx = 1 - s.currentEncoderIdx - - // Advance highest-sent schema and symbol IDs immediately after - // enqueue — same semantics as Java's flushPendingRows. If a - // subsequent ACK fails, asyncState.ioErr poisons the sender. - if s.batchMaxSchemaId > s.maxSentSchemaId { - s.maxSentSchemaId = s.batchMaxSchemaId + effective := int64(s.autoFlushBytes) + // Clamp to 90% of the server-advertised cap. The 10% headroom + // covers schema + dict-delta encoding overhead the soft trigger + // does not see. cap <= 0 means the server advertised none (older + // build or async-pending initial connect): keep the configured + // value for this term. + if cap > 0 { + if safe := int64(cap) * 9 / 10; safe < effective { + effective = safe + } } - if s.batchMaxSymbolId > s.maxSentSymbolId { - s.maxSentSymbolId = s.batchMaxSymbolId + // Clamp to 90% of the per-segment frame cap as well. 
A frame larger + // than a single segment can hold can never be appended to the cursor + // engine, so the soft trigger must fire before a batch crosses it — + // independent of the server cap. This is what keeps the shipped + // defaults (8 MiB trigger over a 4 MiB segment) from self-wedging. + // Fixed for the sender's lifetime, so re-applying it on every + // transport swap is a no-op past the first. maxFrameBytes is 0 in + // the hand-built test senders that exercise the server-cap table in + // isolation, so this term short-circuits there. + if s.maxFrameBytes > 0 { + if safe := s.maxFrameBytes * 9 / 10; safe < effective { + effective = safe + } } - - // Drain all in-flight batches before returning (Flush semantics). - return s.asyncState.waitEmpty(ctx) + s.effectiveAutoFlushBytes.Store(effective) } -// enqueueFlush encodes all pending table buffers and enqueues them -// for the I/O goroutine without waiting for ACKs. This is the -// auto-flush path for async mode — At() returns promptly instead of -// blocking on a full round-trip. Schema and symbol caches are -// updated optimistically; if the I/O goroutine later fails, ioErr -// is set and all subsequent operations return that error (the -// sender is terminal, so stale cache entries can never reach the -// wire). Mirrors the Java client's flushPendingRows(). -func (s *qwpLineSender) enqueueFlush(ctx context.Context) error { - if s.pendingRowCount == 0 { - return nil - } - - // Check for I/O errors before encoding. - if err := s.asyncState.checkError(); err != nil { - return err - } - - tables, err := s.buildTableEncodeInfo() - if err != nil { - return err - } - if len(tables) == 0 { - s.resetAfterFlush() - return nil - } - - // Wait for the current encoder to be available (double-buffered). - // Ctx-aware for the same reason as flushAsync: a stuck I/O goroutine - // must not extend the caller's deadline. 
- encIdx := s.currentEncoderIdx - select { - case <-s.encoderReady[encIdx]: - case <-ctx.Done(): - return ctx.Err() - } - - // Encode all tables into a single multi-table message. - encoded := s.encoders[encIdx].encodeMultiTableWithDeltaDict( - tables, - s.globalSymbolList, - s.maxSentSymbolId, - s.batchMaxSymbolId, - ) - - if err := s.asyncState.acquireSlot(ctx); err != nil { - s.encoderReady[encIdx] <- struct{}{} - return err - } +func (s *qwpLineSender) AtNow(ctx context.Context) error { + return s.At(ctx, time.Time{}) +} - // No copy needed — the ioLoop signals encoderReady after - // sendMessage, at which point the buffer is safe to reuse. - batch := qwpAsyncBatch{ - data: encoded, - readySignal: s.encoderReady[encIdx], - } - select { - case s.asyncState.sendCh <- batch: - case <-ctx.Done(): - s.encoderReady[encIdx] <- struct{}{} - s.asyncState.releaseSlot() - return ctx.Err() - } +// --- LineSender interface: Flush --- - // Swap to the other encoder for the next flush. - s.currentEncoderIdx = 1 - s.currentEncoderIdx +func (s *qwpLineSender) Flush(ctx context.Context) error { + _, err := s.FlushAndGetSequence(ctx) + return err +} - // Optimistic cache: if the batch fails, ioErr prevents further - // operations so stale cache entries are harmless. - if s.batchMaxSchemaId > s.maxSentSchemaId { - s.maxSentSchemaId = s.batchMaxSchemaId +// FlushAndGetSequence implements QwpSender.FlushAndGetSequence. +// Flushes any pending rows and returns the published FSN — the +// upper bound on any SenderError.ToFsn that could surface for this +// batch. +// +// It does NOT wait for the server ACK (Java decision #1 in +// design/qwp-cursor-durability.md — "flush() never waits for ACK; +// ACKs are async"): it returns once the batch is published into the +// cursor engine (in-RAM for memory mode, on-disk for SF) and the +// send loop delivers + replays it in the background. Callers +// wanting server-ACK confirmation pair the returned FSN with +// AwaitAckedFsn. 
+func (s *qwpLineSender) FlushAndGetSequence(ctx context.Context) (int64, error) { + if s.closed.Load() { + return -1, errClosedSenderFlush + } + if s.calledFromErrorHandler() { + // Flush() invoked from inside a SenderErrorHandler runs on the + // dispatcher goroutine. The handler's documented use of Flush() + // is to surface the latched terminal error promptly (it is + // latched before the handler runs). We must not read or flush + // producer-owned state (hasTable / pendingRowCount / tableBuffers + // / the encoder) from this goroutine — that races the producer, + // the C3 producer-state hazard. Surface any latched error and + // return the published FSN without touching producer state. + if err := s.cursorSendLoop.sendLoopCheckError(); err != nil { + return -1, err + } + return s.cursorEngine.enginePublishedFsn(), nil } - if s.batchMaxSymbolId > s.maxSentSymbolId { - s.maxSentSymbolId = s.batchMaxSymbolId + if s.hasTable { + return -1, errFlushWithPendingMessage } - - s.resetAfterFlush() - return nil + if s.pendingRowCount == 0 { + // Nothing to encode, so skip straight to flushCursor's tail: + // surface any terminal I/O error the loop has recorded (so + // producers don't keep silently buffering into a dead engine), + // then return the current published FSN. Same no-ACK-wait + // contract as the pending-rows path below — this branch only + // elides the empty encode/append. 
+ if err := s.cursorSendLoop.sendLoopCheckError(); err != nil { + return -1, err + } + return s.cursorEngine.enginePublishedFsn(), nil + } + if err := s.flushCursor(ctx); err != nil { + // flushCursor resets the table buffers as soon as the enqueue + // succeeds (the rows are sealed in a segment), so an error here + // is one of two already-handled cases: + // - enqueueCursor failed before sealing — ring full + wire + // stalled past the append deadline, or ctx cancelled: the + // rows were never persisted and are RETAINED for the next + // flush attempt (or, in SF mode, recoverable by reopening + // the same sf_dir). Mirrors Java's flushPendingRows() + // reset-after-seal contract. + // - enqueueCursor sealed the rows but the eager error check + // then surfaced a HALT latched by a previous batch: the + // buffers were already reset inside flushCursor, so the + // published rows are not double-written when the user + // re-sends after the documented close+rebuild recovery. + return -1, err + } + return s.cursorEngine.enginePublishedFsn(), nil } -// resetAfterFlush clears all table buffers and resets counters. +// resetAfterFlush clears the table buffers touched this cycle and +// resets counters. Only dirtyTables can hold rows, so resetting the +// rest of the tableBuffers map would be wasted work; the dirty flag is +// cleared here (the one place that empties the list) so the next +// Table() re-lists the buffer. func (s *qwpLineSender) resetAfterFlush() { - for _, tb := range s.tableBuffers { + for _, tb := range s.dirtyTables { tb.reset() + tb.dirty = false } + s.dirtyTables = s.dirtyTables[:0] s.pendingRowCount = 0 s.pendingBytes = 0 s.batchMaxSymbolId = s.maxSentSymbolId + // Defense in depth: tb.reset() keeps the column structure but + // sets committedColumnCount=0, so a post-flush cancelRow would + // wipe the designated-TS column out of tb.columns. Drop the + // cache here so the first row after a flush always re-runs + // getOrCreateDesignatedTimestamp. 
+ s.cachedDesignatedTs = nil // Refresh flush deadline. if s.autoFlushInterval > 0 { @@ -1170,62 +1275,14 @@ func (s *qwpLineSender) resetAfterFlush() { // --- LineSender interface: Close --- func (s *qwpLineSender) Close(ctx context.Context) error { - if s.closed { + if !s.closed.CompareAndSwap(false, true) { return errDoubleSenderClose } - - s.closed = true - - var flushErr error - if s.asyncState != nil { - // Async close: enqueue pending rows non-blocking, then - // stop the I/O goroutine (cancel context + close channel - // + wait). For a guaranteed graceful flush, call Flush() - // before Close(). - if s.hasTable { - if s.currentTable != nil { - s.currentTable.cancelRow() - } - s.hasTable = false - s.currentTable = nil - } - if s.pendingRowCount > 0 { - flushErr = s.enqueueFlush(ctx) - } - s.asyncState.stop(s.closeTimeout) - if flushErr == nil { - flushErr = s.asyncState.checkError() - } - } else { - flushErr = s.flush0(ctx) - } - - closeErr := s.transport.close(ctx) - - if flushErr != nil { - return flushErr - } - return closeErr -} - -// flush0 is the internal flush used by Close in sync mode. The async -// Close path uses enqueueFlush + stop() directly, so this function -// is only called when asyncState == nil. -func (s *qwpLineSender) flush0(ctx context.Context) error { - if s.hasTable { - // Drop the pending row silently on close. - if s.currentTable != nil { - s.currentTable.cancelRow() - } - s.hasTable = false - s.currentTable = nil - } - if s.pendingRowCount == 0 { - return nil - } - - defer s.resetAfterFlush() - return s.flushSync(ctx) + // All wire I/O goes through the cursor engine + send loop, + // regardless of whether sf_dir was set. closeCursor drains + // (up to closeTimeout), stops the loop, closes the engine, + // and tears down the orphan-drainer pool if one was started. 
+ return s.closeCursor(ctx) } // --- QwpSender interface: extended column types --- @@ -1479,15 +1536,15 @@ func (s *qwpLineSender) Int64Array2DColumn(name string, values [][]int64) QwpSen s.lastErr = err return s } - flat := make([]int64, 0, dim0*dim1) + // Validate row regularity before reserving so the streamed write + // fills exactly the reserved payload — no intermediate flat copy. for _, row := range values { if len(row) != dim1 { s.lastErr = fmt.Errorf("qwp: irregular 2D array: row lengths differ") return s } - flat = append(flat, row...) } - col.addLongArray(2, []int32{int32(dim0), int32(dim1)}, flat) + col.addLongArray2D(dim0, dim1, values) return s } @@ -1523,7 +1580,8 @@ func (s *qwpLineSender) Int64Array3DColumn(name string, values [][][]int64) QwpS s.lastErr = err return s } - flat := make([]int64, 0, dim0*dim1*dim2) + // Validate shape regularity before reserving so the streamed write + // fills exactly the reserved payload — no intermediate flat copy. for _, plane := range values { if len(plane) != dim1 { s.lastErr = fmt.Errorf("qwp: irregular 3D array") @@ -1534,9 +1592,8 @@ func (s *qwpLineSender) Int64Array3DColumn(name string, values [][][]int64) QwpS s.lastErr = fmt.Errorf("qwp: irregular 3D array") return s } - flat = append(flat, row...) 
} } - col.addLongArray(3, []int32{int32(dim0), int32(dim1), int32(dim2)}, flat) + col.addLongArray3D(dim0, dim1, dim2, values) return s } diff --git a/qwp_sender_async.go b/qwp_sender_async.go deleted file mode 100644 index 8d90ad84..00000000 --- a/qwp_sender_async.go +++ /dev/null @@ -1,459 +0,0 @@ -/*+***************************************************************************** - * ___ _ ____ ____ - * / _ \ _ _ ___ ___| |_| _ \| __ ) - * | | | | | | |/ _ \/ __| __| | | | _ \ - * | |_| | |_| | __/\__ \ |_| |_| | |_) | - * \__\_\\__,_|\___||___/\__|____/|____/ - * - * Copyright (c) 2014-2019 Appsicle - * Copyright (c) 2019-2026 QuestDB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - ******************************************************************************/ - -package questdb - -import ( - "context" - "fmt" - "sync" - "time" -) - -// qwpAsyncState manages the in-flight window and I/O goroutines for -// async QWP mode (in-flight window > 1). It coordinates between the -// user goroutine (which encodes and enqueues batches) and two I/O -// goroutines: senderLoop transmits batches, receiverLoop processes -// ACKs in parallel so multiple batches can be in flight on the wire -// at once (matches the Java client's sliding-window design). -// -// qwpAsyncBatch carries an encoded batch payload and a signal channel -// to mark the encoder's buffer as reusable after the data is written -// to the socket. 
-type qwpAsyncBatch struct { - data []byte - readySignal chan<- struct{} // signaled after sendMessage completes -} - -type qwpAsyncState struct { - // sendCh carries encoded batch payloads from the user goroutine - // to senderLoop. Buffered to decouple encoding from sending. - sendCh chan qwpAsyncBatch - - // mu protects inFlightCount, nextSequence, ackedSequence, - // senderDone, lastSentSequence, ioErr, and stopped. - mu sync.Mutex - cond *sync.Cond - - // inFlightCount is the number of batches enqueued on sendCh or - // sent but not yet ACKed. Incremented in acquireSlot; decremented - // by releaseSlot (enqueue-cancelled or send-failed batches) and - // by releaseSlotsUpTo (ACK-based cumulative release). - inFlightCount int - inFlightMax int - - // nextSequence is the sequence number that will be assigned to - // the next attempted send. First batch is 0. Incremented by - // senderLoop before sendMessage so a concurrent ACK for an - // in-flight batch never satisfies seq >= nextSequence. - nextSequence int64 - // ackedSequence is the highest cumulative sequence acknowledged - // by the server, or -1 if none. Updated only by receiverLoop. - // The -1 sentinel matches Java's InFlightWindow.highestAcked and - // disambiguates "no ACK yet" from "sequence 0 ACKed" — without it, - // a server that starts its sequence counter at 0 would look like - // a stale ACK and never release the first slot. - ackedSequence int64 - - // lastSentSequence is the sequence of the last batch actually - // transmitted, or -1 if none. Set by senderLoop as it exits - // (sendCh closed and drained). Once senderDone is true, - // receiverLoop exits when ackedSequence >= lastSentSequence. - lastSentSequence int64 - senderDone bool - - // ioErr is the first error from either I/O goroutine. Once set, - // all blocking operations return this error. - ioErr error - - // stopped is set to true after both I/O goroutines have exited. 
- stopped bool - - // doneSender is closed when senderLoop exits; doneReceiver is - // closed when receiverLoop exits. - doneSender chan struct{} - doneReceiver chan struct{} - - // wg tracks both I/O goroutines for clean shutdown. - wg sync.WaitGroup - - // ctx is a cancellable context used by both goroutines for all - // WebSocket operations. Cancelled by stop() or senderLoop (on - // clean drain) to unblock sendMessage/readAck if the server - // becomes unresponsive. - ctx context.Context - cancel context.CancelFunc - - // transport is the WebSocket connection shared by both goroutines. - // senderLoop and receiverLoop are single-writer / single-reader - // on the connection respectively. - transport *qwpTransport -} - -// newQwpAsyncState creates async state with the given in-flight window -// size. The send channel is buffered to the window size so the user -// goroutine can enqueue without blocking until the window is full. -func newQwpAsyncState(maxWindow int, transport *qwpTransport) *qwpAsyncState { - ctx, cancel := context.WithCancel(context.Background()) - a := &qwpAsyncState{ - sendCh: make(chan qwpAsyncBatch, maxWindow), - inFlightMax: maxWindow, - ackedSequence: -1, - lastSentSequence: -1, - doneSender: make(chan struct{}), - doneReceiver: make(chan struct{}), - ctx: ctx, - cancel: cancel, - transport: transport, - } - a.cond = sync.NewCond(&a.mu) - return a -} - -// acquireSlot blocks until there is space in the in-flight window. -// Returns ctx.Err() if ctx is cancelled during the wait, or the I/O -// goroutine's error if it has failed. Mirrors the Java client's -// InFlightWindow.addInFlight, which checks Thread.interrupt() during -// its park-spin. -func (a *qwpAsyncState) acquireSlot(ctx context.Context) error { - a.mu.Lock() - defer a.mu.Unlock() - - // Watcher goroutine is spawned lazily on the first cond.Wait so - // the fast path (slot immediately available) pays no overhead. 
- var watchCancel chan struct{} - defer func() { - if watchCancel != nil { - close(watchCancel) - } - }() - - for a.inFlightCount >= a.inFlightMax { - if a.ioErr != nil { - return a.ioErr - } - if a.stopped { - return fmt.Errorf("qwp: async I/O goroutine stopped") - } - if err := ctx.Err(); err != nil { - return err - } - if watchCancel == nil { - watchCancel = a.startCtxWatcher(ctx) - } - a.cond.Wait() - } - - if a.ioErr != nil { - return a.ioErr - } - if a.stopped { - return fmt.Errorf("qwp: async I/O goroutine stopped") - } - if err := ctx.Err(); err != nil { - return err - } - - a.inFlightCount++ - return nil -} - -// startCtxWatcher launches a goroutine that Broadcasts on the cond -// when ctx is cancelled, so a caller in cond.Wait() wakes up and -// can return ctx.Err(). The returned channel stops the watcher — -// close it after exiting the wait loop. -func (a *qwpAsyncState) startCtxWatcher(ctx context.Context) chan struct{} { - cancelWatch := make(chan struct{}) - go func() { - select { - case <-ctx.Done(): - a.mu.Lock() - a.cond.Broadcast() - a.mu.Unlock() - case <-cancelWatch: - } - }() - return cancelWatch -} - -// releaseSlot decrements inFlightCount by one and wakes a waiter. -// Used when a batch never reaches the wire: either the user goroutine -// cancelled its enqueue after acquireSlot, or senderLoop drained a -// batch without sending it (send failed or shutting down). -func (a *qwpAsyncState) releaseSlot() { - a.mu.Lock() - defer a.mu.Unlock() - - if a.inFlightCount > 0 { - a.inFlightCount-- - } - a.cond.Signal() -} - -// releaseSlotsUpTo processes a cumulative ACK: advances ackedSequence -// to the given sequence and releases (delta) slots, where delta counts -// the batches newly acknowledged. Returns a protocol error if the -// server acknowledged more batches than were sent. Called only by -// receiverLoop. 
-func (a *qwpAsyncState) releaseSlotsUpTo(seq int64) error { - a.mu.Lock() - defer a.mu.Unlock() - - if seq <= a.ackedSequence { - // Stale or duplicate ACK — Java absorbs and keeps reading. - return nil - } - if seq >= a.nextSequence { - return fmt.Errorf( - "qwp: server acknowledged sequence %d but only %d batches sent", - seq, a.nextSequence, - ) - } - delta := int(seq - a.ackedSequence) - a.ackedSequence = seq - a.inFlightCount -= delta - a.cond.Broadcast() - return nil -} - -// setError records the first I/O error and wakes all waiters. -// Subsequent calls are no-ops (first error wins). -func (a *qwpAsyncState) setError(err error) { - a.mu.Lock() - defer a.mu.Unlock() - - if a.ioErr == nil { - a.ioErr = err - } - a.cond.Broadcast() -} - -// checkError returns the I/O error if one has been set. -func (a *qwpAsyncState) checkError() error { - a.mu.Lock() - defer a.mu.Unlock() - return a.ioErr -} - -// waitEmpty blocks until all in-flight batches have been ACKed. -// Returns ctx.Err() if ctx is cancelled during the wait, or the I/O -// goroutine's error if it fails before draining. -func (a *qwpAsyncState) waitEmpty(ctx context.Context) error { - a.mu.Lock() - defer a.mu.Unlock() - - var watchCancel chan struct{} - defer func() { - if watchCancel != nil { - close(watchCancel) - } - }() - - for a.inFlightCount > 0 { - if a.ioErr != nil { - return a.ioErr - } - if a.stopped { - return fmt.Errorf("qwp: async I/O goroutine stopped with %d batches in flight", a.inFlightCount) - } - if err := ctx.Err(); err != nil { - return err - } - if watchCancel == nil { - watchCancel = a.startCtxWatcher(ctx) - } - a.cond.Wait() - } - - return a.ioErr -} - -// markStopped signals that both I/O goroutines have exited. -func (a *qwpAsyncState) markStopped() { - a.mu.Lock() - defer a.mu.Unlock() - - a.stopped = true - a.cond.Broadcast() -} - -// senderLoop consumes batches from sendCh, transmits them over the -// WebSocket, and assigns sequence numbers. 
It never blocks on ACKs; -// that is receiverLoop's job. Exits when sendCh is closed. -func (a *qwpAsyncState) senderLoop() { - defer a.wg.Done() - defer close(a.doneSender) - - for batch := range a.sendCh { - a.mu.Lock() - drop := a.ioErr != nil || a.ctx.Err() != nil - if !drop { - // Reserve the sequence number before the message hits the - // wire so receiverLoop can never observe an ACK whose - // sequence is >= nextSequence for an in-flight batch. - a.nextSequence++ - } - a.mu.Unlock() - - if drop { - // Already failing or shutting down — drain without sending. - if batch.readySignal != nil { - batch.readySignal <- struct{}{} - } - a.releaseSlot() - continue - } - - if err := a.transport.sendMessage(a.ctx, batch.data); err != nil { - // Signal encoder buffer as reusable so the user goroutine - // does not deadlock on encoder handoff. - if batch.readySignal != nil { - batch.readySignal <- struct{}{} - } - a.releaseSlot() - a.setError(fmt.Errorf("qwp: async send failed: %w", err)) - continue - } - - // Send succeeded — the encoder buffer is safe to reuse. - if batch.readySignal != nil { - batch.readySignal <- struct{}{} - } - } - - // sendCh has been closed and drained. Record the highest - // sequence actually sent so receiverLoop can decide when to - // exit, and wake it if it is currently blocked in readAck but - // has no more ACKs to process. If nothing was sent at all, - // lastSentSequence stays -1 and the receiver exits immediately. - a.mu.Lock() - a.lastSentSequence = a.nextSequence - 1 - a.senderDone = true - caughtUp := a.ackedSequence >= a.lastSentSequence - a.cond.Broadcast() - a.mu.Unlock() - - if caughtUp { - a.cancel() - } -} - -// receiverLoop reads ACKs from the WebSocket and releases in-flight -// slots. Matches Java's cumulative-ACK semantics: a single ACK with -// sequence N releases (N - ackedSequence) slots. 
-// -// Exits when (a) senderLoop has finished AND ackedSequence has caught -// up to lastSentSequence, (b) ioErr has been set by either loop, or -// (c) readAck returns an error because ctx was cancelled. -func (a *qwpAsyncState) receiverLoop() { - defer a.wg.Done() - defer close(a.doneReceiver) - - for { - a.mu.Lock() - if a.ioErr != nil { - a.mu.Unlock() - return - } - if a.senderDone && a.ackedSequence >= a.lastSentSequence { - a.mu.Unlock() - return - } - a.mu.Unlock() - - status, data, err := a.transport.readAck(a.ctx) - if err != nil { - // Distinguish a clean shutdown (ctx cancelled once the - // sender has drained and the receiver has nothing more - // to wait for) from a real I/O failure. - a.mu.Lock() - draining := a.senderDone && a.ackedSequence >= a.lastSentSequence - a.mu.Unlock() - if !draining { - a.setError(fmt.Errorf("qwp: async ack read failed: %w", err)) - } - return - } - - seq := parseAckSequence(data) - - if status != qwpStatusOK { - qErr := newQwpErrorFromAck(data) - if qErr == nil { - qErr = &QwpError{Status: status, Sequence: seq, Message: "unknown error"} - } - a.setError(qErr) - return - } - - if err := a.releaseSlotsUpTo(seq); err != nil { - a.setError(err) - return - } - } -} - -// start launches the sender and receiver goroutines. -func (a *qwpAsyncState) start() { - a.wg.Add(2) - go a.senderLoop() - go a.receiverLoop() -} - -// stop closes the send channel and waits for both I/O goroutines to -// exit. If they do not finish within the grace period (e.g., stuck -// on an unresponsive server), the I/O context is cancelled to force -// them out. Must be called exactly once. -func (a *qwpAsyncState) stop(gracePeriod time.Duration) { - close(a.sendCh) - - // Wait for senderLoop to drain and exit, then for receiverLoop - // to catch up and exit. senderLoop self-cancels the I/O context - // if it observes the receiver already caught up, so in the - // normal case we do not have to force anything. 
- timer := time.NewTimer(gracePeriod) - defer timer.Stop() - - select { - case <-a.doneSender: - case <-timer.C: - a.cancel() - <-a.doneSender - <-a.doneReceiver - a.wg.Wait() - a.markStopped() - return - } - - select { - case <-a.doneReceiver: - case <-timer.C: - a.cancel() - <-a.doneReceiver - } - - a.wg.Wait() - a.cancel() // idempotent; ensures context is always cleaned up - a.markStopped() -} diff --git a/qwp_sender_async_test.go b/qwp_sender_async_test.go deleted file mode 100644 index d089785d..00000000 --- a/qwp_sender_async_test.go +++ /dev/null @@ -1,828 +0,0 @@ -/*+***************************************************************************** - * ___ _ ____ ____ - * / _ \ _ _ ___ ___| |_| _ \| __ ) - * | | | | | | |/ _ \/ __| __| | | | _ \ - * | |_| | |_| | __/\__ \ |_| |_| | |_) | - * \__\_\\__,_|\___||___/\__|____/|____/ - * - * Copyright (c) 2014-2019 Appsicle - * Copyright (c) 2019-2026 QuestDB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - ******************************************************************************/ - -package questdb - -import ( - "context" - "fmt" - "net/http" - "net/http/httptest" - "strings" - "sync" - "testing" - "time" - - "github.com/coder/websocket" -) - -func TestQwpAsyncAcquireAndRelease(t *testing.T) { - a := newQwpAsyncState(2, nil) - - // Should acquire 2 slots without blocking. 
- if err := a.acquireSlot(context.Background()); err != nil { - t.Fatalf("acquire 1: %v", err) - } - if err := a.acquireSlot(context.Background()); err != nil { - t.Fatalf("acquire 2: %v", err) - } - - a.mu.Lock() - if a.inFlightCount != 2 { - t.Fatalf("inFlightCount = %d, want 2", a.inFlightCount) - } - a.mu.Unlock() - - // Release one slot. - a.releaseSlot() - - a.mu.Lock() - if a.inFlightCount != 1 { - t.Fatalf("inFlightCount after release = %d, want 1", a.inFlightCount) - } - a.mu.Unlock() - - // Should be able to acquire one more. - if err := a.acquireSlot(context.Background()); err != nil { - t.Fatalf("acquire 3: %v", err) - } -} - -func TestQwpAsyncAcquireBlocksAtMax(t *testing.T) { - a := newQwpAsyncState(1, nil) - - // Fill the window. - if err := a.acquireSlot(context.Background()); err != nil { - t.Fatalf("acquire: %v", err) - } - - // Second acquire should block. Use a goroutine to test. - acquired := make(chan struct{}) - go func() { - a.acquireSlot(context.Background()) - close(acquired) - }() - - // Wait a bit — should NOT have acquired. - select { - case <-acquired: - t.Fatal("acquire should have blocked but didn't") - case <-time.After(50 * time.Millisecond): - // Good, it's blocked. - } - - // Release the slot — should unblock. - a.releaseSlot() - - select { - case <-acquired: - // Good, unblocked. - case <-time.After(time.Second): - t.Fatal("acquire did not unblock after release") - } -} - -func TestQwpAsyncSetErrorUnblocksAcquire(t *testing.T) { - a := newQwpAsyncState(1, nil) - - // Fill the window. - a.acquireSlot(context.Background()) - - errCh := make(chan error, 1) - go func() { - errCh <- a.acquireSlot(context.Background()) - }() - - // Wait for the goroutine to be blocked. - time.Sleep(20 * time.Millisecond) - - // Set an error — should unblock with error. 
- testErr := fmt.Errorf("test I/O failure") - a.setError(testErr) - - select { - case err := <-errCh: - if err != testErr { - t.Fatalf("acquire returned wrong error: %v", err) - } - case <-time.After(time.Second): - t.Fatal("acquire did not unblock after setError") - } -} - -func TestQwpAsyncAcquireUnblocksOnCtxCancel(t *testing.T) { - a := newQwpAsyncState(1, nil) - - // Fill the window. - a.acquireSlot(context.Background()) - - ctx, cancel := context.WithCancel(context.Background()) - errCh := make(chan error, 1) - go func() { - errCh <- a.acquireSlot(ctx) - }() - - // Wait for the goroutine to be parked on the cond. - time.Sleep(20 * time.Millisecond) - - // Cancel the context — should unblock with ctx.Err(). - cancel() - - select { - case err := <-errCh: - if err != context.Canceled { - t.Fatalf("acquire err = %v, want context.Canceled", err) - } - case <-time.After(time.Second): - t.Fatal("acquire did not unblock after ctx cancel") - } - - // Slot count must still reflect the one acquired slot — the - // cancelled caller must not claim a slot. 
- a.mu.Lock() - if a.inFlightCount != 1 { - t.Fatalf("inFlightCount = %d, want 1", a.inFlightCount) - } - a.mu.Unlock() -} - -func TestQwpAsyncAcquireAlreadyCancelled(t *testing.T) { - a := newQwpAsyncState(1, nil) - a.acquireSlot(context.Background()) - - ctx, cancel := context.WithCancel(context.Background()) - cancel() - - if err := a.acquireSlot(ctx); err != context.Canceled { - t.Fatalf("acquireSlot with cancelled ctx = %v, want context.Canceled", err) - } -} - -func TestQwpAsyncWaitEmptyUnblocksOnCtxCancel(t *testing.T) { - a := newQwpAsyncState(2, nil) - a.acquireSlot(context.Background()) - - ctx, cancel := context.WithCancel(context.Background()) - doneCh := make(chan error, 1) - go func() { - doneCh <- a.waitEmpty(ctx) - }() - - time.Sleep(20 * time.Millisecond) - cancel() - - select { - case err := <-doneCh: - if err != context.Canceled { - t.Fatalf("waitEmpty err = %v, want context.Canceled", err) - } - case <-time.After(time.Second): - t.Fatal("waitEmpty did not unblock after ctx cancel") - } -} - -func TestQwpAsyncWaitEmpty(t *testing.T) { - a := newQwpAsyncState(3, nil) - - // Acquire 3 slots. - a.acquireSlot(context.Background()) - a.acquireSlot(context.Background()) - a.acquireSlot(context.Background()) - - doneCh := make(chan error, 1) - go func() { - doneCh <- a.waitEmpty(context.Background()) - }() - - // Should still be waiting. - select { - case <-doneCh: - t.Fatal("waitEmpty should be blocking") - case <-time.After(50 * time.Millisecond): - } - - // Release 2 — still 1 in flight. - a.releaseSlot() - a.releaseSlot() - - select { - case <-doneCh: - t.Fatal("waitEmpty should still be blocking with 1 in flight") - case <-time.After(50 * time.Millisecond): - } - - // Release last. 
- a.releaseSlot() - - select { - case err := <-doneCh: - if err != nil { - t.Fatalf("waitEmpty: %v", err) - } - case <-time.After(time.Second): - t.Fatal("waitEmpty did not return after all released") - } -} - -func TestQwpAsyncWaitEmptyWithError(t *testing.T) { - a := newQwpAsyncState(2, nil) - - a.acquireSlot(context.Background()) - - doneCh := make(chan error, 1) - go func() { - doneCh <- a.waitEmpty(context.Background()) - }() - - time.Sleep(20 * time.Millisecond) - - testErr := fmt.Errorf("transport error") - a.setError(testErr) - - select { - case err := <-doneCh: - if err != testErr { - t.Fatalf("waitEmpty returned wrong error: %v", err) - } - case <-time.After(time.Second): - t.Fatal("waitEmpty did not return after setError") - } -} - -func TestQwpAsyncCheckError(t *testing.T) { - a := newQwpAsyncState(2, nil) - - if err := a.checkError(); err != nil { - t.Fatalf("checkError on fresh state: %v", err) - } - - testErr := fmt.Errorf("some error") - a.setError(testErr) - - if err := a.checkError(); err != testErr { - t.Fatalf("checkError = %v, want %v", err, testErr) - } - - // Second setError should not overwrite. - a.setError(fmt.Errorf("second error")) - if err := a.checkError(); err != testErr { - t.Fatalf("checkError after second setError = %v, want %v", err, testErr) - } -} - -func TestQwpAsyncMarkStopped(t *testing.T) { - a := newQwpAsyncState(1, nil) - - // Fill window. - a.acquireSlot(context.Background()) - - errCh := make(chan error, 1) - go func() { - errCh <- a.acquireSlot(context.Background()) - }() - - time.Sleep(20 * time.Millisecond) - a.markStopped() - - select { - case err := <-errCh: - if err == nil { - t.Fatal("expected error after markStopped") - } - case <-time.After(time.Second): - t.Fatal("acquire did not unblock after markStopped") - } -} - -func TestQwpAsyncIoLoopSendAndAck(t *testing.T) { - // Mock WebSocket server that ACKs each message with an - // incrementing cumulative sequence (0-indexed, matches Java). 
- var received [][]byte - var mu sync.Mutex - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set(qwpHeaderVersion, "1") - conn, err := websocket.Accept(w, r, nil) - if err != nil { - return - } - defer conn.CloseNow() - var seq int64 - for { - _, data, err := conn.Read(context.Background()) - if err != nil { - return - } - mu.Lock() - received = append(received, data) - mu.Unlock() - conn.Write(context.Background(), websocket.MessageBinary, buildAckOK(seq)) - seq++ - } - })) - defer srv.Close() - - // Create transport and connect. - var transport qwpTransport - wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") - if err := transport.connect(context.Background(), wsURL, qwpTransportOpts{}); err != nil { - t.Fatal(err) - } - defer transport.close(context.Background()) - - // Create async state with window=2. - a := newQwpAsyncState(2, &transport) - a.start() - - // Send 3 batches through the I/O loop. - for i := 0; i < 3; i++ { - if err := a.acquireSlot(context.Background()); err != nil { - t.Fatalf("acquireSlot %d: %v", i, err) - } - a.sendCh <- qwpAsyncBatch{data: []byte{byte(i + 1), byte(i + 2)}} - } - - // Wait for all in-flight to be ACKed. - if err := a.waitEmpty(context.Background()); err != nil { - t.Fatalf("waitEmpty: %v", err) - } - - // Stop the I/O goroutine. - a.stop(5 * time.Second) - - // Verify all 3 batches were received. - mu.Lock() - if len(received) != 3 { - t.Fatalf("received %d batches, want 3", len(received)) - } - mu.Unlock() - - // Verify no error. - if err := a.checkError(); err != nil { - t.Fatalf("unexpected error: %v", err) - } -} - -func TestQwpAsyncIoLoopServerError(t *testing.T) { - // Mock server that returns an error ACK on the second message. 
- srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set(qwpHeaderVersion, "1") - conn, err := websocket.Accept(w, r, nil) - if err != nil { - return - } - defer conn.CloseNow() - var seq int64 - for { - _, _, err := conn.Read(context.Background()) - if err != nil { - return - } - if seq == 1 { - conn.Write(context.Background(), websocket.MessageBinary, - buildAckError(qwpStatusWriteError, seq, "bad batch")) - } else { - conn.Write(context.Background(), websocket.MessageBinary, buildAckOK(seq)) - } - seq++ - } - })) - defer srv.Close() - - var transport qwpTransport - wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") - if err := transport.connect(context.Background(), wsURL, qwpTransportOpts{}); err != nil { - t.Fatal(err) - } - defer transport.close(context.Background()) - - a := newQwpAsyncState(2, &transport) - a.start() - - // Send first batch (will succeed). - a.acquireSlot(context.Background()) - a.sendCh <- qwpAsyncBatch{data: []byte{0x01}} - - // Give the I/O loop time to process. - time.Sleep(20 * time.Millisecond) - - // Send second batch (will fail). - a.acquireSlot(context.Background()) - a.sendCh <- qwpAsyncBatch{data: []byte{0x02}} - - // Wait for error to propagate. 
- a.stop(5 * time.Second) - - err := a.checkError() - if err == nil { - t.Fatal("expected error from server") - } - qErr, ok := err.(*QwpError) - if !ok { - t.Fatalf("expected *QwpError, got %T: %v", err, err) - } - if qErr.Status != qwpStatusWriteError { - t.Fatalf("status = %d, want %d", qErr.Status, qwpStatusWriteError) - } -} - -func TestQwpAsyncConcurrentAcquireRelease(t *testing.T) { - a := newQwpAsyncState(4, nil) - - var wg sync.WaitGroup - const goroutines = 8 - const iterations = 100 - - wg.Add(goroutines) - for g := 0; g < goroutines; g++ { - go func() { - defer wg.Done() - for i := 0; i < iterations; i++ { - if err := a.acquireSlot(context.Background()); err != nil { - return - } - a.releaseSlot() - } - }() - } - - wg.Wait() - - a.mu.Lock() - if a.inFlightCount != 0 { - t.Fatalf("inFlightCount = %d, want 0", a.inFlightCount) - } - a.mu.Unlock() -} - -func TestQwpAsyncGoroutineLeakOnClose(t *testing.T) { - // Verify the I/O goroutine exits cleanly after stop(). - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set(qwpHeaderVersion, "1") - conn, err := websocket.Accept(w, r, nil) - if err != nil { - return - } - defer conn.CloseNow() - var seq int64 - for { - _, _, err := conn.Read(context.Background()) - if err != nil { - return - } - conn.Write(context.Background(), websocket.MessageBinary, buildAckOK(seq)) - seq++ - } - })) - defer srv.Close() - - var transport qwpTransport - wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") - if err := transport.connect(context.Background(), wsURL, qwpTransportOpts{}); err != nil { - t.Fatal(err) - } - - a := newQwpAsyncState(2, &transport) - a.start() - - // Send a batch and wait for ACK. - a.acquireSlot(context.Background()) - a.sendCh <- qwpAsyncBatch{data: []byte{0x01}} - if err := a.waitEmpty(context.Background()); err != nil { - t.Fatalf("waitEmpty: %v", err) - } - - // Stop should close the channel and wait for both goroutines. 
- a.stop(5 * time.Second) - - // Verify both done channels are closed (goroutines exited). - for name, ch := range map[string]chan struct{}{ - "sender": a.doneSender, - "receiver": a.doneReceiver, - } { - select { - case <-ch: - // Good. - default: - t.Fatalf("%s done channel not closed after stop()", name) - } - } - - // Verify stopped flag is set. - a.mu.Lock() - if !a.stopped { - t.Fatal("stopped flag not set after stop()") - } - a.mu.Unlock() - - transport.close(context.Background()) -} - -func TestQwpAsyncCloseAfterError(t *testing.T) { - // Verify Close works correctly after an I/O error in async mode. - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set(qwpHeaderVersion, "1") - conn, err := websocket.Accept(w, r, nil) - if err != nil { - return - } - // Close immediately to cause an error on the next send. - conn.Close(websocket.StatusGoingAway, "bye") - })) - defer srv.Close() - - wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") - s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{}, 0, 0, 0, nil, 2) - if err != nil { - t.Fatal(err) - } - - // Add a row. - s.Table("t").Int64Column("x", 1).AtNow(context.Background()) - - // Flush will fail (server closed connection). - err = s.Flush(context.Background()) - // Error is expected since the server closed the connection. - t.Logf("Flush error (expected): %v", err) - - // Close should not panic or hang. - closeErr := s.Close(context.Background()) - t.Logf("Close error: %v", closeErr) - - // Double close should return the standard error. - err = s.Close(context.Background()) - if err != errDoubleSenderClose { - t.Fatalf("double close: got %v, want errDoubleSenderClose", err) - } -} - -func TestQwpAsyncCloseUnresponsiveServer(t *testing.T) { - // Verify that Close() completes within a reasonable timeout even - // when the server accepts the WebSocket connection and reads - // messages but never sends ACKs. 
Without a cancellable context in - // the I/O goroutine, sendMessage or readAck would block forever - // and Close() would hang. - - // blockForever keeps the server handler alive but never sends ACKs. - blockForever := make(chan struct{}) - defer close(blockForever) - - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set(qwpHeaderVersion, "1") - conn, err := websocket.Accept(w, r, nil) - if err != nil { - return - } - defer conn.CloseNow() - - // Read messages but never ACK — simulate an unresponsive server. - for { - _, _, err := conn.Read(context.Background()) - if err != nil { - return - } - // Block instead of sending an ACK. - <-blockForever - } - })) - defer srv.Close() - - wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") - s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{}, 0, 0, 0, nil, 2) - if err != nil { - t.Fatal(err) - } - // Use a short close timeout for this test so it doesn't take 5s. - s.closeTimeout = 500 * time.Millisecond - - // Insert a row and start async flush (enqueue to I/O goroutine). - s.Table("t").Int64Column("x", 1).AtNow(context.Background()) - // Manually enqueue so we have an in-flight batch. - s.enqueueFlush(context.Background()) - - // Close must complete within 3 seconds. Without context - // cancellation, the I/O goroutine would block forever on - // readAck(context.Background()). - done := make(chan error, 1) - go func() { - done <- s.Close(context.Background()) - }() - - select { - case err := <-done: - // Close completed — it should return an error (cancelled context). - t.Logf("Close returned: %v", err) - case <-time.After(3 * time.Second): - t.Fatal("Close() did not complete within 3 seconds — I/O goroutine is stuck") - } -} - -// TestQwpAsyncCumulativeAck exercises the Java-aligned cumulative-ACK -// behaviour: the server receives several batches and coalesces them -// into a single ACK whose sequence covers all of them. 
The client -// must release multiple in-flight slots from that one ACK instead of -// wedging waiting for a 1:1 correspondence. -func TestQwpAsyncCumulativeAck(t *testing.T) { - const batches = 3 - - // Server delays ACKing until all batches have been read, then - // emits one cumulative ACK (sequence = last batch index). - read := make(chan struct{}, batches) - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set(qwpHeaderVersion, "1") - conn, err := websocket.Accept(w, r, nil) - if err != nil { - return - } - defer conn.CloseNow() - - for i := 0; i < batches; i++ { - if _, _, err := conn.Read(context.Background()); err != nil { - return - } - read <- struct{}{} - } - // One ACK covers batches 0..batches-1. - conn.Write(context.Background(), websocket.MessageBinary, buildAckOK(int64(batches-1))) - // Drain any further reads until the client closes, so the - // handler stays alive for the receiver's post-ACK checks. - for { - if _, _, err := conn.Read(context.Background()); err != nil { - return - } - } - })) - defer srv.Close() - - var transport qwpTransport - wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") - if err := transport.connect(context.Background(), wsURL, qwpTransportOpts{}); err != nil { - t.Fatal(err) - } - defer transport.close(context.Background()) - - a := newQwpAsyncState(batches, &transport) - a.start() - - for i := 0; i < batches; i++ { - if err := a.acquireSlot(context.Background()); err != nil { - t.Fatalf("acquireSlot %d: %v", i, err) - } - a.sendCh <- qwpAsyncBatch{data: []byte{byte(i)}} - } - - // Wait for the server to confirm all batches landed before the - // cumulative ACK goes out. 
- for i := 0; i < batches; i++ { - select { - case <-read: - case <-time.After(2 * time.Second): - t.Fatalf("server never received batch %d", i) - } - } - - if err := a.waitEmpty(context.Background()); err != nil { - t.Fatalf("waitEmpty: %v", err) - } - - a.mu.Lock() - if a.ackedSequence != int64(batches-1) { - t.Fatalf("ackedSequence = %d, want %d", a.ackedSequence, batches-1) - } - if a.inFlightCount != 0 { - t.Fatalf("inFlightCount = %d, want 0", a.inFlightCount) - } - a.mu.Unlock() - - a.stop(2 * time.Second) -} - -// TestQwpAsyncServerOverAcksIsProtocolError verifies the client rejects -// an ACK whose cumulative sequence exceeds the number of batches sent. -func TestQwpAsyncServerOverAcksIsProtocolError(t *testing.T) { - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set(qwpHeaderVersion, "1") - conn, err := websocket.Accept(w, r, nil) - if err != nil { - return - } - defer conn.CloseNow() - if _, _, err := conn.Read(context.Background()); err != nil { - return - } - // Client has sent exactly 1 batch (sequence 0); ACK seq=5. - conn.Write(context.Background(), websocket.MessageBinary, buildAckOK(5)) - })) - defer srv.Close() - - var transport qwpTransport - wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") - if err := transport.connect(context.Background(), wsURL, qwpTransportOpts{}); err != nil { - t.Fatal(err) - } - defer transport.close(context.Background()) - - a := newQwpAsyncState(2, &transport) - a.start() - - a.acquireSlot(context.Background()) - a.sendCh <- qwpAsyncBatch{data: []byte{0x00}} - - // The receiver must surface an ioErr and waitEmpty should return it. 
- err := a.waitEmpty(context.Background()) - if err == nil { - t.Fatal("expected protocol error, got nil") - } - if !strings.Contains(err.Error(), "server acknowledged sequence 5") { - t.Fatalf("error should mention the over-ACK, got: %v", err) - } - - a.stop(2 * time.Second) -} - -// TestQwpAsyncErrorAckCarriesSequence checks that an error ACK's -// sequence field reaches the caller through QwpError, so callers can -// identify which batch failed (matches Java's "Server error for batch N"). -func TestQwpAsyncErrorAckCarriesSequence(t *testing.T) { - const failingSeq = 2 - - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set(qwpHeaderVersion, "1") - conn, err := websocket.Accept(w, r, nil) - if err != nil { - return - } - defer conn.CloseNow() - var seq int64 - for { - _, _, err := conn.Read(context.Background()) - if err != nil { - return - } - if seq == failingSeq { - conn.Write(context.Background(), websocket.MessageBinary, - buildAckError(qwpStatusWriteError, seq, "bad batch")) - return - } - conn.Write(context.Background(), websocket.MessageBinary, buildAckOK(seq)) - seq++ - } - })) - defer srv.Close() - - var transport qwpTransport - wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") - if err := transport.connect(context.Background(), wsURL, qwpTransportOpts{}); err != nil { - t.Fatal(err) - } - defer transport.close(context.Background()) - - a := newQwpAsyncState(4, &transport) - a.start() - - for i := 0; i <= failingSeq; i++ { - if err := a.acquireSlot(context.Background()); err != nil { - // Once the error is set later batches may not get slots. - break - } - a.sendCh <- qwpAsyncBatch{data: []byte{byte(i)}} - } - - // Give the receiver a moment to process the error ACK, then stop. 
- a.stop(2 * time.Second) - - err := a.checkError() - if err == nil { - t.Fatal("expected error from server") - } - qErr, ok := err.(*QwpError) - if !ok { - t.Fatalf("expected *QwpError, got %T: %v", err, err) - } - if qErr.Sequence != failingSeq { - t.Fatalf("sequence = %d, want %d", qErr.Sequence, failingSeq) - } - if qErr.Status != qwpStatusWriteError { - t.Fatalf("status = %d, want %d", qErr.Status, qwpStatusWriteError) - } -} diff --git a/qwp_sender_cursor.go b/qwp_sender_cursor.go new file mode 100644 index 00000000..825d7699 --- /dev/null +++ b/qwp_sender_cursor.go @@ -0,0 +1,1014 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "context" + "errors" + "fmt" + "io" + "path/filepath" + "time" +) + +// qwpSfDefaultSenderId is used when sf_dir is set but sender_id is +// not. Single-sender deployments get zero-config; multi-sender +// users must override per spec. +const qwpSfDefaultSenderId = "default" + +// qwpSfDefaultMaxBytes is the default per-segment cap. Mirrors +// Java's 4 MiB. 
+const qwpSfDefaultMaxBytes int64 = 4 * 1024 * 1024 + +// qwpSfDefaultMaxTotalBytes is the default total cap when sf_dir +// is set. Mirrors Java's 10 GiB SF default. +const qwpSfDefaultMaxTotalBytes int64 = 10 * 1024 * 1024 * 1024 + +// qwpSfDefaultMemoryMaxTotalBytes is the default total cap when +// sf_dir is empty (memory mode cursor). Mirrors Java's 128 MiB +// memory-mode default. +const qwpSfDefaultMemoryMaxTotalBytes int64 = 128 * 1024 * 1024 + +// qwpSfDefaultCloseFlushTimeout mirrors Java's 5-second default. +const qwpSfDefaultCloseFlushTimeout = 5 * time.Second + +// qwpCursorMode reports whether the sender is wired to the cursor +// engine + send loop, i.e. whether cursorEngine is non-nil. A sender +// built without a cursor engine returns false. +func (s *qwpLineSender) qwpCursorMode() bool { + return s.cursorEngine != nil +} + +// newQwpCursorLineSender constructs a sender that publishes its +// flushed batches into the supplied cursor engine. The send loop +// (started by the caller) is responsible for transmitting frames and +// processing ACKs; the sender itself never opens a WebSocket +// connection. Used by both the SF (`sf_dir=...`) and +// memory-mode cursor paths. +// +// The caller retains ownership of the engine and send loop until +// Close, at which point the sender takes responsibility for +// draining + closing them in order. Reusing an engine across +// senders is not supported. +// +// closeFlushTimeout bounds Close's wait for the engine's ackedFsn +// to catch up to publishedFsn. 0 or negative means "fast close" +// (skip the drain — pending data lives on disk and will be replayed +// on the next sender start in SF mode, or is lost in memory mode).
+func newQwpCursorLineSender( + autoFlushRows int, + autoFlushInterval time.Duration, + autoFlushBytes int, + maxBufSize int, + cursorEngine *qwpSfCursorEngine, + cursorSendLoop *qwpSfSendLoop, + closeFlushTimeout time.Duration, +) (*qwpLineSender, error) { + if cursorEngine == nil || cursorSendLoop == nil { + return nil, errors.New("qwp/cursor: engine and send loop must be non-nil") + } + s := &qwpLineSender{ + tableBuffers: make(map[string]*qwpTableBuffer), + globalSymbols: make(map[string]int32), + maxSentSymbolId: -1, + batchMaxSymbolId: -1, + autoFlushRows: autoFlushRows, + autoFlushInterval: autoFlushInterval, + autoFlushBytes: autoFlushBytes, + maxBufSize: maxBufSize, + inFlightWindow: 1, + closeTimeout: closeFlushTimeout, + cursorEngine: cursorEngine, + cursorSendLoop: cursorSendLoop, + } + // Record the per-segment frame cap (constant for the sender's + // lifetime) so the byte-trigger clamp and the flush-time drop guard + // bound batches to what a single segment can actually hold. + s.maxFrameBytes = cursorEngine.engineMaxFrameBytes() + // Seed effectiveAutoFlushBytes via the same clamp the transport-swap + // callback applies, with no transport yet (server cap unknown). With + // no server cap this yields min(autoFlushBytes, maxFrameBytes*9/10), + // already segment-safe before the first connect. Covers the test + // paths that build a sender directly without wiring the callback, and + // the window in the conf-driven paths between construction and the + // callback install; those then refine the server-cap term via + // applyServerBatchSizeLimit using the connected transport's cap. + s.applyServerBatchSizeLimit(nil) + // Single encoder slot is enough — the cursor engine takes a copy + // of the bytes via tryAppend, so the encoder buffer can be reused + // immediately. No double-buffering needed here. 
+ s.encoder.wb.preallocate(qwpDefaultMicrobatchBufSize) + return s, nil +} + +// newQwpCursorLineSenderFromConf wires a cursor-mode sender from the +// parsed config. Handles BOTH memory mode (sf_dir empty → RAM-backed +// cursor engine) and store-and-forward (sf_dir set → mmapped on-disk +// segments). Resolves the mode-specific defaults, builds the cursor +// engine + send loop with the shared multi-host failover plumbing +// (host tracker, endpoint factory, initial-connect mode, reconnect +// budgets), runs the initial connect (optionally with +// retry-on-failure), and returns a sender ready for the user. +// +// Owns the cursor engine and the send loop; both are torn down on +// sender.Close. +func newQwpCursorLineSenderFromConf(ctx context.Context, conf *lineSenderConfig, opts qwpTransportOpts) (LineSender, error) { + // Resolve defaults. memMode (no sf_dir) selects a RAM-backed cursor + // engine (empty slot path) and the smaller memory-mode total-bytes + // ceiling; everything else — including the multi-host failover + // plumbing below — is shared with store-and-forward. + memMode := conf.sfDir == "" + senderId := conf.senderId + if senderId == "" { + senderId = qwpSfDefaultSenderId + } + sfMaxBytes := conf.sfMaxBytes + if sfMaxBytes <= 0 { + sfMaxBytes = qwpSfDefaultMaxBytes + } + sfMaxTotalBytes := conf.sfMaxTotalBytes + if sfMaxTotalBytes <= 0 { + sfMaxTotalBytes = qwpSfDefaultMaxTotalBytes + if memMode { + sfMaxTotalBytes = qwpSfDefaultMemoryMaxTotalBytes + } + } + if sfMaxTotalBytes < sfMaxBytes { + // Caught earlier in sanitizeQwpConf, but defend in depth + // since defaults could in principle skew this. 
+ return nil, fmt.Errorf("sf_max_total_bytes (%d) must be >= sf_max_bytes (%d)", + sfMaxTotalBytes, sfMaxBytes) + } + appendDeadline := time.Duration(conf.sfAppendDeadlineMillis) * time.Millisecond + if appendDeadline <= 0 { + appendDeadline = qwpSfEngineDefaultAppendDeadline + } + reconnectMaxDuration := time.Duration(conf.reconnectMaxDurationMillis) * time.Millisecond + if reconnectMaxDuration <= 0 { + reconnectMaxDuration = qwpSfDefaultReconnectMaxDuration + } + reconnectInitialBackoff := time.Duration(conf.reconnectInitialBackoffMillis) * time.Millisecond + if reconnectInitialBackoff <= 0 { + reconnectInitialBackoff = qwpSfDefaultReconnectInitialBackoff + } + reconnectMaxBackoff := time.Duration(conf.reconnectMaxBackoffMillis) * time.Millisecond + if reconnectMaxBackoff <= 0 { + reconnectMaxBackoff = qwpSfDefaultReconnectMaxBackoff + } + closeFlushTimeout := qwpSfDefaultCloseFlushTimeout + if conf.closeFlushTimeoutSet { + // User explicitly set the value. <= 0 means "fast close". + closeFlushTimeout = time.Duration(conf.closeFlushTimeoutMillis) * time.Millisecond + } + + // Slot path = {sf_dir}/{sender_id}/. Empty in memory mode → the + // cursor engine allocates RAM-backed segments instead of opening + // mmapped files under the slot directory. + slotPath := "" + if !memMode { + slotPath = filepath.Join(conf.sfDir, senderId) + } + + // Build the cursor engine first — it owns the slot lock and on-disk + // recovery. + engine, err := qwpSfNewCursorEngine(slotPath, sfMaxBytes, sfMaxTotalBytes, appendDeadline) + if err != nil { + return nil, err + } + + // Failover plumbing (failover.md §2 / §13.6). The tracker is + // shared across every caller drawing from this addr= list: the + // foreground I/O loop, the initial-connect-sync path, and each + // orphan drainer spawned below.
Per-caller `previousIdx` slots + // (§2.3) live on the qwpSfSendLoop instances, not on the tracker + // — mid-stream demotes stay scoped to their loop while PickNext + // classifications inform every caller on the next walk. + scheme := "ws" + if conf.tlsMode != tlsDisabled { + scheme = "wss" + } + // The ingress endpoint never sends SERVER_INFO and the client never + // expects one (per the wire spec, ingress is role- and zone-blind); + // role/zone-aware endpoint selection is egress-only. Pass "" for + // clientZone and qwpTargetAny for the role filter so every reachable + // host binds regardless of the configured zone=/target=. Both hints + // are accepted at config time but inert on ingest; the server's + // 421 + X-QuestDB-Role upgrade reject keeps writes off replicas + // (see qwp_sf_round_walk.go). + tracker := newQwpHostTracker(len(conf.endpoints), "", qwpTargetAny) + factory := qwpSfBuildEndpointFactory(conf.endpoints, scheme, opts, conf.dumpWriter) + + // Initial connect — three modes: + // - InitialConnectOff: one single-round walk through every + // configured endpoint, terminal if all + // fail (no inter-round retry). + // - InitialConnectSync: retry-with-backoff on the calling goroutine. + // - InitialConnectAsync: skip the dial here; the I/O goroutine + // dials in-band on its first iteration. + // The producer experiences backpressure + // (engineAppendBlocking spins) until the + // wire comes up. + var ( + transport *qwpTransport + initialBoundIdx = -1 + ) + switch conf.initialConnectMode { + case InitialConnectSync: + transport, initialBoundIdx, err = qwpSfConnectWithRetry(ctx, factory, tracker, + reconnectMaxDuration, reconnectInitialBackoff, reconnectMaxBackoff) + case InitialConnectAsync: + transport = nil + default: // InitialConnectOff + // Single-round walk through every configured endpoint — no + // inter-host backoff, no retry across rounds. 
Mirrors Java's + // QwpWebSocketSender.buildAndConnect (failover.md §1.2 / + // §4.2): multi-host config gets a full sweep on initial + // connect, but only one sweep. Use initial_connect_retry for + // retry-with-backoff across multiple sweeps. + walkStart := time.Now() + rr := qwpSfRunSingleRound(ctx, nil, qwpSfRoundWalkParams{ + Factory: factory, + Tracker: tracker, + Endpoints: conf.endpoints, + }, -1) + switch { + case rr.Transport != nil: + transport = rr.Transport + initialBoundIdx = rr.Idx + case rr.Terminal != nil: + err = fmt.Errorf("qwp/sf: WebSocket upgrade failed (won't retry): %w", rr.Terminal) + case rr.Cancelled != nil: + err = rr.Cancelled + default: + // Round exhausted: every endpoint dialed without binding. + err = fmt.Errorf("qwp/sf: initial connect failed; %w", + buildExhaustedError(tracker, conf.endpoints, + time.Since(walkStart), rr.Attempts, rr.LastError)) + } + } + if err != nil { + _ = engine.engineClose() + return nil, err + } + + loop := qwpSfNewSendLoop(engine, transport, factory, + qwpSfDefaultParkInterval, + reconnectMaxDuration, reconnectInitialBackoff, reconnectMaxBackoff) + loop.sendLoopSetHostTracker(tracker, initialBoundIdx) + engine.engineSetReconnectStatusGetter(loop.sendLoopReconnectStatus) + // Wire the user-configured server-error API knobs (Phase 5) + // before sendLoopStart so they're visible from the receiver + // goroutine the moment it starts. 
+ resolver := &qwpSfPolicyResolver{ + resolver: conf.errorPolicyResolver, + perCat: conf.errorPolicyPerCat, + global: conf.errorPolicyGlobal, + } + loop.sendLoopSetPolicyResolver(resolver) + loop.sendLoopSetErrorHandler(conf.errorHandler, conf.errorInboxCapacity) + + s, err := newQwpCursorLineSender( + conf.autoFlushRows, + conf.autoFlushInterval, + conf.autoFlushBytes, + conf.maxBufSize, + engine, loop, + closeFlushTimeout, + ) + if err != nil { + _ = loop.sendLoopClose() + _ = engine.engineClose() + return nil, err + } + s.fileNameLimit = conf.fileNameLimit + s.encoder.gorillaDisabled = conf.gorillaDisabled + // Pre-size the encoder buffer for the microbatch role: the cursor + // engine copies each frame on append so one encoder slot suffices, + // but a large auto_flush_bytes warrants a bigger initial buffer to + // avoid repeated grows on the hot path. The qwpDefaultMicrobatchBufSize + // (1 MB) floor was already applied in newQwpCursorLineSender. + if conf.autoFlushBytes*2 > qwpDefaultMicrobatchBufSize { + s.encoder.wb.preallocate(conf.autoFlushBytes * 2) + } + // Seed the byte-trigger clamp from the initial transport (the + // sync-connect branches above populated loop.transport; the + // async branch leaves it nil and the first reconnect callback + // will refresh) and install the swap callback so every + // subsequent connect re-applies the clamp. Both happen before + // sendLoopStart so the I/O goroutine sees the installed + // callback on the very first swap. + loop.sendLoopSetOnTransportSwap(s.applyServerBatchSizeLimit) + s.applyServerBatchSizeLimit(loop.transport.Load()) + loop.sendLoopStart() + + // Orphan adoption (drain_orphans=on). At foreground startup, + // scan {sf_dir}/* for sibling slots that hold unacked data and + // spawn a drainer per orphan, capped at max_background_drainers + // concurrent goroutines. Failures drop a .failed sentinel into + // the slot so future foreground starts skip it. + // + // `s` already owns engine + loop at this point.
Any failure in + // the orphan-setup block must close `s` (which closes both), + // otherwise we leak the connected sender plus its I/O goroutine, + // transport, and segment manager. defer+success flag covers + // panics; explicit error returns cover any future error path + // added below. + if conf.drainOrphans { + setupOK := false + defer func() { + if !setupOK { + _ = s.closeCursor(ctx) + } + }() + maxDrainers := conf.maxBackgroundDrainers + if maxDrainers <= 0 { + maxDrainers = 4 // matches Java default + } + ownSlot := filepath.Base(slotPath) + orphans := qwpSfScanOrphans(conf.sfDir, ownSlot) + if len(orphans) > 0 { + pool := qwpSfNewDrainerPool(maxDrainers) + for _, orphan := range orphans { + drainer := qwpSfNewOrphanDrainer( + orphan, + sfMaxBytes, sfMaxTotalBytes, + factory, + tracker, + reconnectMaxDuration, reconnectInitialBackoff, reconnectMaxBackoff, + ) + _ = pool.drainerPoolSubmit(ctx, drainer) + } + s.drainerPool = pool + } + setupOK = true + } + + return s, nil +} + +// qwpSfBuildReconnectFactory returns a factory that dials the given +// address with the given options on each call. Used by drainers and +// legacy single-host paths; the idx parameter is accepted for +// signature symmetry with qwpSfBuildEndpointFactory and ignored. +func qwpSfBuildReconnectFactory(address string, opts qwpTransportOpts, dumpWriter io.Writer) qwpSfReconnectFactory { + return func(ctx context.Context, _ int) (*qwpTransport, error) { + var t qwpTransport + t.dumpWriter = dumpWriter + if err := t.connect(ctx, address, opts); err != nil { + return nil, err + } + return &t, nil + } +} + +// qwpSfBuildEndpointFactory returns a factory that dials the +// endpoint at the supplied idx. Used by the foreground SF loop's +// round-walk, where PickNext selects the host. Out-of-range idx +// returns an explicit error so a tracker bug surfaces loudly rather +// than dialing a random peer. 
+func qwpSfBuildEndpointFactory(endpoints []qwpEndpoint, scheme string, opts qwpTransportOpts, dumpWriter io.Writer) qwpSfReconnectFactory { + return func(ctx context.Context, idx int) (*qwpTransport, error) { + if idx < 0 || idx >= len(endpoints) { + return nil, fmt.Errorf("qwp/sf: endpoint index %d out of range [0, %d)", + idx, len(endpoints)) + } + var t qwpTransport + t.dumpWriter = dumpWriter + wsURL := scheme + "://" + endpoints[idx].String() + if err := t.connect(ctx, wsURL, opts); err != nil { + return nil, err + } + return &t, nil + } +} + +// flushCursor is the explicit-Flush() wire path. It shares +// encoding and the (non-blocking, no-ACK-wait) engine append with +// auto-flush via enqueueCursor, then eagerly surfaces any wire +// failure observed during the append window so a terminal error +// reaches the producer immediately instead of on its next call. +// Mirrors Java: flushAndGetSequence() = flushPendingRows() + +// checkError() (design/qwp-cursor-durability.md decision #1 — +// "flush() never waits for ACK; ACKs are async"). Callers wanting +// server-ACK confirmation pair FlushAndGetSequence with +// AwaitAckedFsn. +// +// A successful enqueue resets the table buffers BEFORE the eager +// error check. Once enqueueCursor returns nil the rows are sealed in +// a segment (an FSN is assigned, the frame is queued for replay), so +// they are no longer pending. The eager check can still return an +// error — a HALT latched by a PREVIOUS batch in the window between +// enqueueCursor's own pre-append check and this one — and that error +// is for an already-published batch. Resetting first keeps those rows +// from being retained: re-sending them after the documented +// close+rebuild recovery would double-write the batch once the SF +// slot replays the sealed frame. Mirrors Java flushPendingRows +// resetting before checkError() throws. 
+func (s *qwpLineSender) flushCursor(ctx context.Context) error { + if err := s.enqueueCursor(ctx); err != nil { + return err + } + s.resetAfterFlush() + return s.cursorSendLoop.sendLoopCheckError() +} + +// enqueueCursor encodes the pending rows as a self-sufficient QWP +// frame and appends it to the cursor engine. It does NOT wait for +// the server ACK (Java decision #1 in +// design/qwp-cursor-durability.md: "flush() never waits for ACK; +// ACKs are async") — the frame is durable once appended (in-RAM +// for memory mode, on-disk for SF) and the send loop drains + +// replays it in the background. Shared by the auto-flush trigger +// and by flushCursor (explicit Flush()), so the user goroutine is +// never blocked on a server round-trip. +// +// Self-sufficient = full schema definitions for every table + full +// symbol-dict delta from id 0 (Java decision #14). The frame must +// replay correctly against any fresh server connection (post- +// reconnect, post-restart, drainer adopting an orphan slot) — refs +// to schema/symbol IDs the new server has never seen would be +// unrecoverable. +// +// Schema-side: every table block carries its full inline column +// definitions. There is no producer-side schema registry to advance. +// +// Symbol-side: the dict uses a delta encoding (varint-prefixed +// length, then names). We always pass `-1` as the encoder's maxSentId +// so the delta starts at id 0 (self-sufficient frame), and +// batchMaxSymbolId — passed as batchMaxId — bounds how much of +// globalSymbolList goes out (ids 0..batchMaxSymbolId). maxSentSymbolId +// carries the high-water mark across flushes so resetAfterFlush can +// rewind batchMaxSymbolId to it. Both fields do real work here. 
+func (s *qwpLineSender) enqueueCursor(ctx context.Context) error { + if err := s.cursorSendLoop.sendLoopCheckError(); err != nil { + return err + } + tables, err := s.buildTableEncodeInfo() + if err != nil { + // The only error here is "too many tables in one batch": the + // wire encodes the table count as a uint16, so this fails + // identically on every retry. Like an irreducible over-cap table + // in the per-table split below, retaining the rows would re-fail + // forever and wedge the sender; drop them with a typed error + // naming the count so the sender stays usable. + droppedRows := s.pendingRowCount + s.resetAfterFlush() + return fmt.Errorf("%w [droppedRows=%d]", err, droppedRows) + } + if len(tables) == 0 { + return nil + } + encoded := s.encoder.encodeMultiTableWithDeltaDict( + tables, + s.globalSymbolList, + -1, // self-sufficient: full dict from id 0 + s.batchMaxSymbolId, + ) + // Flush-time cap check. The per-row guard in atWithTimestamp bounds + // individual rows, but the schema and dict-delta bytes the encoder + // adds at message-build time can push a batch of legitimately-sized + // rows past a wire cap — the server-advertised batch cap + // (serverMaxBatchSize) or the per-segment frame cap (maxFrameBytes, + // the largest payload one cursor segment holds). A combined frame + // over either cap cannot go out as-is: the server answers + // ws-close[1009 Message Too Big] and the engine can never append a + // frame larger than one segment. + // + // Such a frame is not doomed when it overruns only because it + // aggregates many tables: enqueueCursorSplit re-encodes each table as + // its own self-sufficient frame and appends every table that fits on + // its own, dropping only a table that is individually over-cap. + // Mirrors Java QwpWebSocketSender.flushPendingRows -> + // flushPendingRowsSplit. 
+ if kind, _ := s.frameCapExceeded(len(encoded)); kind != qwpFrameCapNone { + return s.enqueueCursorSplit(ctx, tables) + } + if _, err := s.cursorEngine.engineAppendBlocking(ctx, encoded); err != nil { + return err + } + if s.batchMaxSymbolId > s.maxSentSymbolId { + s.maxSentSymbolId = s.batchMaxSymbolId + } + return nil +} + +// qwpFrameCapKind identifies which wire cap an encoded frame overruns. +// Both caps treat a non-positive limit as "no limit". +type qwpFrameCapKind int + +const ( + qwpFrameCapNone qwpFrameCapKind = iota // fits every active cap + qwpFrameCapServer // over the server-advertised batch cap + qwpFrameCapSegment // over the per-segment frame cap +) + +// frameCapExceeded reports which wire cap, if any, an encoded frame of +// frameLen bytes overruns. The server-advertised batch cap +// (serverMaxBatchSize) is checked before the per-segment frame cap +// (maxFrameBytes) so its diagnostics win when both bind; an appendable +// frame must satisfy both. Returns (qwpFrameCapNone, 0) when the frame +// fits. No allocation — safe on the flush hot path. +func (s *qwpLineSender) frameCapExceeded(frameLen int) (qwpFrameCapKind, int64) { + if cap := int64(s.serverMaxBatchSize.Load()); cap > 0 && int64(frameLen) > cap { + return qwpFrameCapServer, cap + } + if s.maxFrameBytes > 0 && int64(frameLen) > s.maxFrameBytes { + return qwpFrameCapSegment, s.maxFrameBytes + } + return qwpFrameCapNone, 0 +} + +// enqueueCursorSplit is enqueueCursor's over-cap fallback: it re-encodes +// each pending table as its own self-sufficient single-table frame and +// appends every table whose frame fits a wire cap. A combined frame that +// overruns a cap only because it aggregates many tables flushes in full +// this way — one frame per table. 
Only a table whose own frame is still +// over-cap is irreducible: re-encoding it on the next flush would fail +// identically forever and wedge the sender, so its rows are dropped and +// named in a typed error while every other table goes out. Mirrors Java +// QwpWebSocketSender.flushPendingRowsSplit. +// +// Each single-table frame carries the full symbol dict from id 0 and the +// full inline schema, exactly like the combined frame, so it replays +// against a fresh server connection on its own. +// +// The retain-on-error contract holds per table: a table is reset only +// once its frame is in a segment, so a transient engineAppendBlocking +// failure (ring full + wire stalled, or ctx cancelled) retains the table +// that failed and every table after it for the next flush, without +// re-sending the tables already appended. +func (s *qwpLineSender) enqueueCursorSplit(ctx context.Context, tables []*qwpTableBuffer) error { + var ( + appended int + droppedRows int + oversize []string + worstKind qwpFrameCapKind + worstCap int64 + worstSize int + txErr error + ) + for _, tb := range tables { + if tb.rowCount == 0 { + continue + } + frame := s.encoder.encodeTableWithDeltaDict( + tb, s.globalSymbolList, -1, s.batchMaxSymbolId) + if kind, capVal := s.frameCapExceeded(len(frame)); kind != qwpFrameCapNone { + // Irreducible: a single table over the cap can never be sent. + // Drop it; the other tables are unaffected. + oversize = append(oversize, tb.tableName) + droppedRows += tb.rowCount + if len(frame) > worstSize { + worstKind, worstCap, worstSize = kind, capVal, len(frame) + } + tb.reset() + continue + } + if _, err := s.cursorEngine.engineAppendBlocking(ctx, frame); err != nil { + // Transient: retain this table and the unprocessed tail for + // the next flush. Tables already appended stay reset so they + // are not re-sent. 
+ txErr = err + break + } + appended++ + tb.reset() + } + + if appended > 0 && s.batchMaxSymbolId > s.maxSentSymbolId { + s.maxSentSymbolId = s.batchMaxSymbolId + } + + if txErr != nil { + // Retain-on-error: the failed and not-yet-reached tables still + // hold their rows. Bring the aggregate counters back in line with + // the surviving buffers; the caller must not reset on error. + s.recomputePendingFromBuffers() + return txErr + } + + // Every table was appended or dropped, so all buffers are empty. + // Resetting here (rather than leaving it to the caller, as the + // single-frame success path does) keeps the producer counters + // consistent with the emptied buffers even when an irreducible-table + // error is returned or a caller skips its own post-flush reset. + s.resetAfterFlush() + if len(oversize) > 0 { + return s.oversizeTableError(worstKind, worstCap, worstSize, oversize, droppedRows) + } + return nil +} + +// recomputePendingFromBuffers rebuilds the aggregate pending-row and +// pending-byte counters from the table buffers, the source of truth. +// Used after a partial flush — a per-table split that stopped on a +// transient append failure — where some buffers were reset and others +// still hold rows. The designated-timestamp cache is dropped so the next +// row re-resolves it against whichever table buffer survives. +func (s *qwpLineSender) recomputePendingFromBuffers() { + rows, bytes := 0, 0 + // dirtyTables is the source of truth for what can hold rows: a + // split flush resets some entries (rowCount 0, contributing + // nothing) and retains the rest; both stay listed. + for _, tb := range s.dirtyTables { + rows += tb.rowCount + bytes += tb.approxDataSize() + } + s.pendingRowCount = rows + s.pendingBytes = bytes + s.cachedDesignatedTs = nil +} + +// oversizeTableError builds the typed error returned when the per-table +// split dropped one or more individually-over-cap tables. 
It names the +// binding cap of the largest dropped frame, lists the dropped tables, +// and reports the total dropped row count. +func (s *qwpLineSender) oversizeTableError(kind qwpFrameCapKind, capVal int64, msgSize int, tables []string, droppedRows int) error { + switch kind { + case qwpFrameCapServer: + return fmt.Errorf( + "qwp: batch too large for server batch cap, even split per table [oversizeTables=%v, messageSize=%d, serverMaxBatchSize=%d, droppedRows=%d]", + tables, msgSize, capVal, droppedRows) + default: // qwpFrameCapSegment + return fmt.Errorf( + "qwp: batch too large to fit one cursor segment, even split per table [oversizeTables=%v, messageSize=%d, maxFrameBytes=%d, droppedRows=%d]; send fewer rows per flush (or raise sf_max_bytes)", + tables, msgSize, capVal, droppedRows) + } +} + +// buildTableEncodeInfo collects non-empty tables for encoding. +// Every table block carries its full inline column definitions. There +// is no schema-change detection and no per-connection schema registry +// on the client side — matching the c-questdb-client live path. +// Mirrors the Java client's "self-sufficient frames" contract (Java +// spec #14): every replayed frame must stand alone against a fresh +// server connection, so the cursor wire path always carries the +// schema in full. +func (s *qwpLineSender) buildTableEncodeInfo() ([]*qwpTableBuffer, error) { + s.encodeInfoBuf = s.encodeInfoBuf[:0] + // Only dirtyTables can hold rows. The rowCount==0 skip still + // matters: a buffer can be dirty-but-empty (a cancelled row, or a + // per-table reset mid-split that left it listed). 
+ for _, tb := range s.dirtyTables { + if tb.rowCount == 0 { + continue + } + if len(s.encodeInfoBuf) == qwpMaxTablesPerBatch { + return nil, fmt.Errorf( + "qwp: too many tables in one batch: exceeded %d", + qwpMaxTablesPerBatch, + ) + } + s.encodeInfoBuf = append(s.encodeInfoBuf, tb) + } + return s.encodeInfoBuf, nil +} + +// calledFromErrorHandler reports whether the current goroutine is the +// error dispatcher's loop goroutine — i.e. we are running inside a +// user SenderErrorHandler invocation. The handler is documented as +// allowed to call Close() / Flush(); when it does, those calls run off +// the producer goroutine. The producer owns lastErr / hasTable / +// currentTable / pendingRowCount / the tableBuffers map / the +// dirtyTables list / the encoder with no happens-before against this +// goroutine, so the Close()/Flush() paths must NOT touch that state — +// doing so races a producer mid-At(): buildTableEncodeInfo ranges +// dirtyTables while Table() appends to it, and Table() writes the +// tableBuffers map, either of which corrupts state (a racing slice +// range/append, or Go's fatal "concurrent map iteration and map write"). +// +// Cheap on the common path: loopGoid is 0 whenever the dispatcher +// goroutine is not running (no server error has ever been delivered), +// so the runtime.Stack cost of qwpGoid() is only paid once an error has +// actually spun the dispatcher up. The g != 0 guard keeps a goid parse +// failure from matching the loopGoid==0 "not running" sentinel. +func (s *qwpLineSender) calledFromErrorHandler() bool { + if s.cursorSendLoop == nil { + return false + } + d := s.cursorSendLoop.sendLoopDispatcher() + if d == nil { + return false + } + lg := d.loopGoid.Load() + if lg == 0 { + return false + } + g := qwpGoid() + return g != 0 && g == lg +} + +// closeCursor drains the cursor engine and closes the send loop. +// Returns the first non-nil error from drain / loop shutdown / +// engine close. 
// Always best-effort: every subsystem is asked to
// close even if an earlier step errored.
//
// Drain semantics:
//   - closeFlushTimeout > 0: block up to that long for ackedFsn ≥
//     publishedFsn. On timeout, returns a drain-timeout error so
//     the caller cannot silently lose data — shutdown still
//     completes. SF-mode users can recover the unacked tail by
//     reopening on the same sf_dir; memory-mode users have no
//     recovery path and must treat the timeout as fatal.
//   - closeFlushTimeout <= 0: skip the drain entirely (fast close).
//
// The timeout consulted here is the s.closeTimeout field. ctx is passed
// to the pending-row enqueue and to the drain wait; the send-loop,
// engine and drainer-pool shutdown calls below do not take it.
//
// Error precedence: a latched fluent-API error wins, then the enqueue
// error, the drain error, the send-loop close error, and finally the
// engine close error. Later errors are discarded once firstErr is set.
func (s *qwpLineSender) closeCursor(ctx context.Context) error {
	// A Close() invoked from inside a SenderErrorHandler runs on the
	// dispatcher goroutine, not the producer goroutine. Flushing pending
	// rows or even reading lastErr / hasTable / pendingRowCount here
	// would race a producer still mid-Table()/At() (the C3
	// producer-state race). Skip every producer-state access in that
	// case and run only the goroutine-safe teardown below (drain wait,
	// send-loop close, engine close, drainer pool). The producer
	// surfaces the latched terminal error and then the closed-sender
	// error on its next call; its un-flushed in-progress rows were never
	// handed off and remain its own to retry (SF mode replays whatever
	// was already persisted on the next open).
	var firstErr error
	if !s.calledFromErrorHandler() {
		// Surface any latched fluent-API error (e.g. validation failure
		// on Symbol/*Column/Table) so Close() doesn't silently swallow
		// it — mirrors the HTTP sender's flush0, which drains
		// buf.LastErr() on the close path. Captured first so any
		// subsequent enqueue / drain / shutdown error doesn't override
		// it: the latched fault is the original user-facing cause and
		// downstream failures usually follow from it.
		firstErr = s.lastErr
		s.lastErr = nil
		// Encode any pending rows from the open API call into the engine
		// first. Drop the pending in-progress row (no At/AtNow yet) the
		// same way Close does in memory mode.
		if s.hasTable {
			// currentTable can be nil while hasTable is set, so the
			// cancel is guarded; the flags are cleared either way.
			if s.currentTable != nil {
				s.currentTable.cancelRow()
			}
			s.hasTable = false
			s.currentTable = nil
		}
		if s.pendingRowCount > 0 {
			// Enqueue the pending rows but do NOT block on ACK here —
			// flushCursor's ACK wait is unbounded by ctx alone, and
			// would deadlock against a silent server. waitCursorDrain
			// below is the single bounded ACK wait, governed by
			// closeFlushTimeout. Mirrors Java's flushPendingRows() +
			// drainOnClose() split.
			if err := s.enqueueCursor(ctx); err != nil {
				if firstErr == nil {
					firstErr = err
				}
			} else {
				// Retain-on-error: only reset the table buffers once the
				// rows are in a segment. A failed enqueue (ring full +
				// wire stalled, or ctx cancelled) never persisted them —
				// resetting here would silently destroy data. SF-mode
				// users recover the tail by reopening on the same sf_dir;
				// memory-mode users at least see firstErr. Mirrors the
				// autoFlush path and Java's flushPendingRows() contract.
				s.resetAfterFlush()
			}
		}
	}
	// Wait for drain. Skipped entirely when the timeout is zero or
	// negative (fast close).
	if s.closeTimeout > 0 {
		if err := s.waitCursorDrain(ctx); err != nil && firstErr == nil {
			firstErr = err
		}
	}
	// Stop the send loop (closes its current transport).
	if err := s.cursorSendLoop.sendLoopClose(); err != nil && firstErr == nil {
		firstErr = err
	}
	// Close the engine (closes ring, manager if owned, and slot lock).
	if err := s.cursorEngine.engineClose(); err != nil && firstErr == nil {
		firstErr = err
	}
	// Stop the drainer pool last — drainers may still be using the
	// reconnect factory (which captures the foreground's address +
	// auth) and we want their wire shutdowns to overlap with the
	// engine teardown rather than serialize after it.
	if s.drainerPool != nil {
		s.drainerPool.drainerPoolClose()
	}
	return firstErr
}

// waitCursorDrain blocks until ackedFsn ≥ publishedFsn, the
// send-loop reports a terminal error, or the user's ctx /
// closeFlushTimeout expires. On timeout, returns a drain-timeout
// error carrying publishedFsn, ackedFsn, and the count of unacked
// batches — closeCursor captures it as firstErr but still proceeds
// with shutdown so the I/O thread, transport, and segment manager
// always tear down cleanly. Mirrors Java QwpWebSocketSender's
// drainOnClose contract: silently swallowing the timeout would
// hide data loss from users who only call Close() and never call
// Flush() afterwards.
//
// Returns nil once the engine is fully acknowledged, the send loop's
// error if it reports one, ctx.Err() if ctx fires first, and the
// drain-timeout error otherwise.
func (s *qwpLineSender) waitCursorDrain(ctx context.Context) error {
	// Both the wall-clock deadline and the timer are derived from
	// s.closeTimeout. The deadline is re-checked at the top of every
	// iteration, so a wake-up caused by the ticker still ends the wait
	// once the timeout has passed.
	deadline := time.Now().Add(s.closeTimeout)
	timer := time.NewTimer(s.closeTimeout)
	defer timer.Stop()
	// Progress is polled: the FSN comparison and the send-loop error are
	// re-sampled every pollInterval.
	const pollInterval = 5 * time.Millisecond
	tick := time.NewTicker(pollInterval)
	defer tick.Stop()
	for {
		// Drained: everything published has been acknowledged.
		if s.cursorEngine.engineAckedFsn() >= s.cursorEngine.enginePublishedFsn() {
			return nil
		}
		// An error reported by the send loop ends the wait immediately.
		if err := s.cursorSendLoop.sendLoopCheckError(); err != nil {
			return err
		}
		if !time.Now().Before(deadline) {
			return s.drainTimeoutError()
		}
		select {
		case <-tick.C:
			// Poll again.
		case <-timer.C:
			return s.drainTimeoutError()
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}

// drainTimeoutError builds the close-drain timeout error. Snapshot
// publishedFsn first so the (target - acked) count cannot go
// negative under a concurrent ACK that lands between the two reads.
+func (s *qwpLineSender) drainTimeoutError() error { + target := s.cursorEngine.enginePublishedFsn() + acked := s.cursorEngine.engineAckedFsn() + return fmt.Errorf( + "qwp/cursor: close drain timed out after %s [publishedFsn=%d, ackedFsn=%d] - server did not acknowledge %d pending batches; data may be lost (use a larger close_flush_timeout or smaller batches)", + s.closeTimeout, target, acked, target-acked, + ) +} + +// AckedFsn implements QwpSender.AckedFsn. +func (s *qwpLineSender) AckedFsn() int64 { + return s.cursorEngine.engineAckedFsn() +} + +// AwaitAckedFsn implements QwpSender.AwaitAckedFsn. This is the +// server-ACK confirmation primitive: Flush never blocks on ACKs +// (Java decision #1), so callers wanting delivery confirmation pair +// FlushAndGetSequence's returned FSN with this. It blocks until an ACK +// advances ackedFsn to target — woken directly by the send loop's +// ack-notify channel rather than a poll, so confirmation latency +// tracks the ACK itself — or until the send loop dies, the sender is +// closed, or ctx fires. Send-loop terminal errors surface +// synchronously so the caller can distinguish "still in flight" from +// "permanently failed". +func (s *qwpLineSender) AwaitAckedFsn(ctx context.Context, target int64) error { + if s.closed.Load() { + return errClosedSenderFlush + } + for { + // Subscribe before sampling ackedFsn: acknowledge stores the new + // FSN before it closes this channel, so an ACK that lands between + // the sample below and the blocking select still wakes us. + ackCh := s.cursorEngine.engineAckNotify() + if s.cursorEngine.engineAckedFsn() >= target { + return nil + } + if err := s.cursorSendLoop.sendLoopCheckError(); err != nil { + return err + } + if s.closed.Load() { + // Concurrent Close() stopped the send loop, so ackedFsn is + // frozen and will never advance. Re-check once in case the + // ACK landed between the read above and this load; otherwise + // fail fast rather than wait until ctx fires. 
+ if s.cursorEngine.engineAckedFsn() >= target { + return nil + } + return errClosedSenderFlush + } + select { + case <-ackCh: + // ackedFsn advanced — loop and re-test target. + case <-s.cursorSendLoop.sendLoopDone(): + // The send loop exited: a HALT latched a terminal error or + // Close() tore it down, so ackedFsn is now frozen. A final + // ACK may have landed in the same instant, so re-test target + // before reporting the terminal error or closed state. + if s.cursorEngine.engineAckedFsn() >= target { + return nil + } + if err := s.cursorSendLoop.sendLoopCheckError(); err != nil { + return err + } + return errClosedSenderFlush + case <-ctx.Done(): + if s.cursorEngine.engineAckedFsn() >= target { + return nil + } + return ctx.Err() + } + } +} + +// LastTerminalError implements QwpSender.LastTerminalError. +func (s *qwpLineSender) LastTerminalError() *SenderError { + if s.cursorSendLoop == nil { + return nil + } + return s.cursorSendLoop.sendLoopLastTerminalServerError() +} + +// TotalServerErrors implements QwpSender.TotalServerErrors. +func (s *qwpLineSender) TotalServerErrors() int64 { + if s.cursorSendLoop == nil { + return 0 + } + return s.cursorSendLoop.sendLoopTotalServerErrors() +} + +// DroppedErrorNotifications implements QwpSender.DroppedErrorNotifications. +func (s *qwpLineSender) DroppedErrorNotifications() int64 { + if s.cursorSendLoop == nil { + return 0 + } + return s.cursorSendLoop.sendLoopDispatcher().droppedNotifications() +} + +// TotalErrorNotificationsDelivered implements +// QwpSender.TotalErrorNotificationsDelivered. +func (s *qwpLineSender) TotalErrorNotificationsDelivered() int64 { + if s.cursorSendLoop == nil { + return 0 + } + return s.cursorSendLoop.sendLoopDispatcher().totalDelivered() +} + +// TotalReconnectAttempts implements QwpSender.TotalReconnectAttempts. 
+func (s *qwpLineSender) TotalReconnectAttempts() int64 { + if s.cursorSendLoop == nil { + return 0 + } + return s.cursorSendLoop.sendLoopTotalReconnectAttempts() +} + +// TotalReconnectsSucceeded implements QwpSender.TotalReconnectsSucceeded. +func (s *qwpLineSender) TotalReconnectsSucceeded() int64 { + if s.cursorSendLoop == nil { + return 0 + } + return s.cursorSendLoop.sendLoopTotalReconnects() +} + +// TotalFramesReplayed implements QwpSender.TotalFramesReplayed. +func (s *qwpLineSender) TotalFramesReplayed() int64 { + if s.cursorSendLoop == nil { + return 0 + } + return s.cursorSendLoop.sendLoopTotalFramesReplayed() +} + +// TotalBackpressureStalls implements QwpSender.TotalBackpressureStalls. +func (s *qwpLineSender) TotalBackpressureStalls() int64 { + if s.cursorEngine == nil { + return 0 + } + return s.cursorEngine.engineTotalBackpressureStalls() +} + +// BackgroundDrainers implements QwpSender.BackgroundDrainers. +func (s *qwpLineSender) BackgroundDrainers() []QwpBackgroundDrainer { + if s.drainerPool == nil { + return nil + } + active := s.drainerPool.drainerPoolSnapshot() + if len(active) == 0 { + return nil + } + out := make([]QwpBackgroundDrainer, len(active)) + for i, d := range active { + out[i] = QwpBackgroundDrainer{ + Dir: d.drainerSlotPath(), + FramesPending: d.drainerTargetFsn(), + FramesAcked: d.drainerAckedFsn(), + LastError: d.drainerLastError(), + Failed: d.drainerOutcome() == qwpSfDrainOutcomeFailed, + } + } + return out +} diff --git a/qwp_sender_cursor_test.go b/qwp_sender_cursor_test.go new file mode 100644 index 00000000..2075e7b9 --- /dev/null +++ b/qwp_sender_cursor_test.go @@ -0,0 +1,642 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + 
* Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "context" + "errors" + "runtime" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// newCursorSenderForTest builds a memory-mode cursor sender pointed +// at the given fake server. Returns the sender plus the engine + loop +// (so tests can inspect them) plus a cleanup that closes the sender. +func newCursorSenderForTest(t *testing.T, srv *qwpSfTestServer, autoFlushRows int) (*qwpLineSender, *qwpSfCursorEngine, *qwpSfSendLoop, func()) { + t.Helper() + engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + transport, err := qwpSfDialFor(srv)(context.Background(), 0) + require.NoError(t, err) + loop := qwpSfNewSendLoop(engine, transport, qwpSfDialFor(srv), + 100*time.Microsecond, 5*time.Second, 10*time.Millisecond, 100*time.Millisecond) + loop.sendLoopStart() + // 5s closeFlushTimeout matches the Java default; long enough + // that drain-waits in tests don't flake under heavy parallel + // test load. 
+ s, err := newQwpCursorLineSender(autoFlushRows, 0, 0, 0, engine, loop, 5*time.Second) + require.NoError(t, err) + cleanup := func() { + _ = s.Close(context.Background()) + } + return s, engine, loop, cleanup +} + +func TestQwpCursorSenderHappyPath(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer srv.Close() + + s, engine, loop, cleanup := newCursorSenderForTest(t, srv, 0) + defer cleanup() + + require.True(t, s.qwpCursorMode()) + + for i := 0; i < 5; i++ { + err := s.Table("t").Int64Column("v", int64(i)).AtNow(context.Background()) + require.NoError(t, err, "row %d", i) + } + require.Equal(t, 5, s.pendingRowCount) + require.NoError(t, s.Flush(context.Background())) + // After Flush, pending rows are drained into the engine. + assert.Equal(t, 0, s.pendingRowCount) + // Wait for ackedFsn to catch up — Flush in cursor mode does NOT + // wait for ACKs, so we wait here explicitly. + require.Eventually(t, func() bool { + return engine.engineAckedFsn() >= engine.enginePublishedFsn() + }, 2*time.Second, 1*time.Millisecond) + // Five frames should have been sent. + assert.Equal(t, int64(1), loop.sendLoopTotalFramesSent(), + "expected 1 multi-row frame, got %d", loop.sendLoopTotalFramesSent()) + assert.Equal(t, int64(1), srv.totalFramesReceived.Load()) +} + +func TestQwpCursorSenderFlushNoRowsIsCheap(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer srv.Close() + + s, _, _, cleanup := newCursorSenderForTest(t, srv, 0) + defer cleanup() + + // Flush with no pending rows is a no-op. Crucially, it does NOT + // block waiting for in-flight ACKs (Java spec: cursor flush + // never waits for ACK). Should return immediately. 
+ start := time.Now() + require.NoError(t, s.Flush(context.Background())) + elapsed := time.Since(start) + assert.Less(t, elapsed, 50*time.Millisecond, + "Flush(no rows) should return immediately, took %s", elapsed) +} + +// TestQwpCursorSenderFlushWithPendingRowsDoesNotWaitForAck pins the +// headline cursor-mode contract change: Flush / FlushAndGetSequence +// publish the pending batch and return WITHOUT blocking on the server +// ACK (design/qwp-cursor-durability.md decision #1: "flush() never waits +// for ACK; ACKs are async"). TestQwpCursorSenderFlushNoRowsIsCheap covers +// the zero-pending fast path; this exercises the pending-rows branch — +// the one that actually encodes and enqueues a frame — against a server +// that accepts frames but never ACKs. The proof has two halves: the call +// returns promptly, and ackedFsn is still behind publishedFsn when it +// does (so it cannot have waited for the withheld ACK). +func TestQwpCursorSenderFlushWithPendingRowsDoesNotWaitForAck(t *testing.T) { + srv := newSilentAckServer(t) + defer srv.Close() + + engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + transport, err := qwpSfDialFor(srv)(context.Background(), 0) + require.NoError(t, err) + loop := qwpSfNewSendLoop(engine, transport, qwpSfDialFor(srv), + 100*time.Microsecond, 5*time.Second, 10*time.Millisecond, 100*time.Millisecond) + loop.sendLoopStart() + // autoFlushRows=0 → rows accumulate until the explicit Flush. + // closeTimeout=100ms keeps the deferred Close fast: the server never + // ACKs, so a long drain-wait would only stall teardown. 
+ s, err := newQwpCursorLineSender(0, 0, 0, 0, engine, loop, 100*time.Millisecond) + require.NoError(t, err) + defer func() { _ = s.Close(context.Background()) }() + + const rows = 5 + for i := 0; i < rows; i++ { + require.NoError(t, s.Table("t").Int64Column("v", int64(i)).AtNow(context.Background())) + } + require.Equal(t, rows, s.pendingRowCount) + // Precondition: the server has withheld every ACK, so nothing is + // acked yet — the gap Flush must not block on. + require.Equal(t, int64(-1), engine.engineAckedFsn()) + + start := time.Now() + fsn, err := s.FlushAndGetSequence(context.Background()) + elapsed := time.Since(start) + require.NoError(t, err) + + // Returned promptly: it published into the engine and returned rather + // than blocking on an ACK that never comes. 50ms ceiling mirrors the + // no-rows sibling above. + assert.Less(t, elapsed, 50*time.Millisecond, + "Flush(pending rows) must not wait for ACK, took %s", elapsed) + // The batch WAS published (a single multi-row frame → FSN 0) and the + // pending buffer drained. + assert.Equal(t, int64(0), fsn, "single batch publishes FSN 0") + assert.Equal(t, fsn, engine.enginePublishedFsn()) + assert.Equal(t, 0, s.pendingRowCount) + // The crux: the server ACK was NOT awaited. With silentAcks no ACK + // will ever arrive, so ackedFsn stays behind publishedFsn — this is a + // stable post-condition, not a race window. + assert.Equal(t, int64(-1), engine.engineAckedFsn(), + "Flush must return before the (withheld) server ACK advances ackedFsn") +} + +func TestQwpCursorSenderAutoFlushOnRowCount(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer srv.Close() + + s, engine, loop, cleanup := newCursorSenderForTest(t, srv, 3) + defer cleanup() + + // 7 rows → autoFlushRows=3 should flush twice (after rows 3 and + // 6); 7th row stays pending. 
+ for i := 0; i < 7; i++ { + err := s.Table("t").Int64Column("v", int64(i)).AtNow(context.Background()) + require.NoError(t, err, "row %d", i) + } + assert.Equal(t, 1, s.pendingRowCount) + require.NoError(t, s.Flush(context.Background())) + + // Wait for drain. + require.Eventually(t, func() bool { + return engine.engineAckedFsn() >= engine.enginePublishedFsn() + }, 2*time.Second, 1*time.Millisecond) + // Three batches: row 3, row 6, and the explicit Flush. + assert.Equal(t, int64(3), loop.sendLoopTotalFramesSent()) +} + +func TestQwpCursorSenderCloseDrainsEngine(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer srv.Close() + + engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + transport, err := qwpSfDialFor(srv)(context.Background(), 0) + require.NoError(t, err) + loop := qwpSfNewSendLoop(engine, transport, qwpSfDialFor(srv), + 100*time.Microsecond, 5*time.Second, 10*time.Millisecond, 100*time.Millisecond) + loop.sendLoopStart() + s, err := newQwpCursorLineSender(0, 0, 0, 0, engine, loop, 5*time.Second) + require.NoError(t, err) + + for i := 0; i < 4; i++ { + require.NoError(t, s.Table("t").Int64Column("v", int64(i)).AtNow(context.Background())) + } + // Don't call Flush — Close should encode pending rows and drain. + require.NoError(t, s.Close(context.Background())) + // After close, the engine must be fully drained. + assert.Equal(t, engine.enginePublishedFsn(), engine.engineAckedFsn()) + assert.GreaterOrEqual(t, srv.totalFramesReceived.Load(), int64(1)) +} + +func TestQwpCursorSenderCloseDrainTimeoutReturnsError(t *testing.T) { + // Server accepts frames but never ACKs. Close's drain wait must + // time out within closeFlushTimeout AND return a non-nil error + // that names publishedFsn / ackedFsn — silently swallowing it + // would hide data loss from users who never call Flush. 
+ srv := newSilentAckServer(t) + defer srv.Close() + + engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + transport, err := qwpSfDialFor(srv)(context.Background(), 0) + require.NoError(t, err) + loop := qwpSfNewSendLoop(engine, transport, qwpSfDialFor(srv), + 100*time.Microsecond, 5*time.Second, 10*time.Millisecond, 100*time.Millisecond) + loop.sendLoopStart() + s, err := newQwpCursorLineSender(0, 0, 0, 0, engine, loop, 100*time.Millisecond) + require.NoError(t, err) + + require.NoError(t, s.Table("t").Int64Column("v", 1).AtNow(context.Background())) + start := time.Now() + closeErr := s.Close(context.Background()) + elapsed := time.Since(start) + assert.Less(t, elapsed, 5*time.Second, "Close should not block on un-ACK'd data forever") + require.Error(t, closeErr, "Close must surface the drain timeout, not swallow it") + assert.Contains(t, closeErr.Error(), "drain timed out") + assert.Contains(t, closeErr.Error(), "publishedFsn") + assert.Contains(t, closeErr.Error(), "ackedFsn") +} + +func TestQwpCursorSenderFlushAfterTerminalError(t *testing.T) { + // ParseError defaults to Halt; SchemaMismatch is now Drop and + // would not produce a terminal error. + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{rejectStatus: QwpStatusParseError}) + defer srv.Close() + + s, _, loop, cleanup := newCursorSenderForTest(t, srv, 0) + defer cleanup() + + require.NoError(t, s.Table("t").Int64Column("v", 1).AtNow(context.Background())) + // First Flush enqueues; the loop hits the rejection and goes + // terminal. Subsequent Flush calls must surface the error. + _ = s.Flush(context.Background()) + + require.Eventually(t, func() bool { + return loop.sendLoopCheckError() != nil + }, 2*time.Second, 1*time.Millisecond) + + // Empty Flush after the loop is dead surfaces the terminal error. 
+ err := s.Flush(context.Background()) + require.Error(t, err) +} + +// TestQwpCursorSenderTableEntrySurfacesTerminalError verifies that +// once the I/O loop has latched a terminal error, the next Table() +// call latches it into s.lastErr so the user observes it at the +// following At/AtNow instead of having to call Flush first. This +// matches the spec contract that the producer's next API call sees +// the latched HALT (sf-client.md §14.5). +func TestQwpCursorSenderTableEntrySurfacesTerminalError(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{rejectStatus: QwpStatusParseError}) + defer srv.Close() + + s, _, loop, cleanup := newCursorSenderForTest(t, srv, 0) + defer cleanup() + + // Push one row and Flush so the loop hits the HALT and latches. + require.NoError(t, s.Table("t").Int64Column("v", 1).AtNow(context.Background())) + _ = s.Flush(context.Background()) + require.Eventually(t, func() bool { + return loop.sendLoopCheckError() != nil + }, 2*time.Second, 1*time.Millisecond) + + // New row: Table() must observe the latched terminal error and + // arrange for it to surface at AtNow, without the user having + // to Flush first. + err := s.Table("t").Int64Column("v", 2).AtNow(context.Background()) + require.Error(t, err, "AtNow must surface the latched terminal error from Table()") +} + +// TestQwpCursorFlushResetsAfterEnqueueDespiteEagerError reproduces M7. +// FlushAndGetSequence first publishes the pending rows into the cursor +// engine (durable — an FSN is assigned and the frame is queued for +// replay) and only then eagerly samples the send loop's latched error. +// When a HALT latched by a PREVIOUS batch lands in the window between +// the publish and that eager check, the call returns (-1, err) even +// though these rows are already sealed in a segment. 
If the table +// buffers are not reset on that path, a user following the documented +// close+rebuild recovery re-sends the "failed" batch and double-writes +// it once the SF slot replays the sealed frame. The reset must happen +// as soon as the enqueue succeeds, before the eager error check. +// +// The race is made deterministic by forcing the publish to park: the +// engine ring is filled to its total-bytes cap so the batch's append +// blocks on backpressure. Reaching the park proves the in-enqueue error +// check (which runs before the append) already passed. The test then +// latches the terminal error and frees a segment, so the parked append +// completes — sealing the batch — and the eager check that follows +// surfaces the latched error. +func TestQwpCursorFlushResetsAfterEnqueueDespiteEagerError(t *testing.T) { + const segSize int64 = 4096 + // Cap at two segments: the ring fills after two segment-sized + // frames, so the third append (the batch under test) parks until a + // sealed segment is acked and trimmed. + engine, err := qwpSfNewCursorEngine("", segSize, 2*segSize, 10*time.Second) + require.NoError(t, err) + + // A send loop we never start: nothing mutates lastError except the + // explicit recordFatal below, so the latch timing is entirely under + // the test's control. A nil transport needs a non-nil (unused) + // reconnect factory to satisfy the constructor. + unusedFactory := func(context.Context, int) (*qwpTransport, error) { + return nil, errors.New("reconnect factory must not be called") + } + loop := qwpSfNewSendLoop(engine, nil, unusedFactory, + time.Millisecond, 5*time.Second, 10*time.Millisecond, 100*time.Millisecond) + + // closeFlushTimeout=0 → fast close (skip drain) so cleanup never + // blocks on the un-acked tail this test deliberately leaves behind. 
+ s, err := newQwpCursorLineSender(0, 0, 0, 0, engine, loop, 0) + require.NoError(t, err) + defer func() { _ = s.Close(context.Background()) }() + + // Fill the ring to its cap with two segment-sized frames. The first + // fills the active segment exactly; the second rotates into the + // spare and fills it, sealing the first. With both segments full and + // the cap reached, the manager won't provision a third — the next + // append has nowhere to go and must park. + junk := make([]byte, engine.engineMaxFrameBytes()) // one full segment's payload + fsn0, err := engine.engineAppendBlocking(context.Background(), junk) + require.NoError(t, err) + require.Equal(t, int64(0), fsn0) + fsn1, err := engine.engineAppendBlocking(context.Background(), junk) + require.NoError(t, err) + require.Equal(t, int64(1), fsn1) + + // One row for the batch under test. + require.NoError(t, s.Table("t").Int64Column("v", 1).AtNow(context.Background())) + require.Equal(t, 1, s.pendingRowCount) + + errHalt := errors.New("simulated HALT from a previous batch") + + baselineStalls := engine.engineTotalBackpressureStalls() + type flushResult struct { + fsn int64 + err error + } + resCh := make(chan flushResult, 1) + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + go func() { + fsn, err := s.FlushAndGetSequence(ctx) + resCh <- flushResult{fsn, err} + }() + + // Wait until the batch's append has parked on backpressure. The park + // only happens after the in-enqueue error check has passed and the + // frame has been encoded, so latching now lands the HALT in exactly + // the post-publish window M7 describes. + require.Eventually(t, func() bool { + return engine.engineTotalBackpressureStalls() > baselineStalls + }, 5*time.Second, 100*time.Microsecond, + "batch append never parked — ring was not full") + + // Latch the terminal error, then free a segment so the parked append + // completes. 
The append seals the batch (FSN assigned, durable); the + // eager check that follows surfaces errHalt. + loop.recordFatal(errHalt) + engine.engineAcknowledge(fsn0) // trims the sealed first segment + + var res flushResult + select { + case res = <-resCh: + case <-time.After(15 * time.Second): + t.Fatal("FlushAndGetSequence never returned") + } + + // The call reports failure (the eager error surfaced)... + require.ErrorIs(t, res.err, errHalt) + assert.Equal(t, int64(-1), res.fsn) + // ...but the rows WERE durably published (an FSN was assigned). + require.Equal(t, int64(2), engine.enginePublishedFsn(), + "batch must have been published before the eager error check fired") + + // The fix: a successful enqueue resets the buffers before the eager + // error check, so the published rows are not also retained. Retaining + // them would double-write the batch when the user re-sends after the + // documented close+rebuild recovery and the SF slot replays FSN 2. + assert.Equal(t, 0, s.pendingRowCount, + "buffers must be reset after a durable enqueue even when Flush returns the latched error") + if tb := s.tableBuffers["t"]; tb != nil { + assert.Equal(t, 0, tb.rowCount, "table buffer must be reset after a durable enqueue") + } +} + +// newSilentAckServer creates a fake QWP server that accepts the +// upgrade and reads frames forever, but never sends any ACK. Used +// by close-drain-timeout and AwaitAckedFsn tests where we need an +// ACK gap to materialize. +func newSilentAckServer(t *testing.T) *qwpSfTestServer { + t.Helper() + return newQwpSfTestServer(t, qwpSfTestServerOpts{silentAcks: true}) +} + +func TestQwpCursorSenderAckedFsnTracksEngine(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer srv.Close() + + s, engine, _, cleanup := newCursorSenderForTest(t, srv, 0) + defer cleanup() + + // Before any publish, both producer-visible accessor and engine + // agree at -1. 
+ assert.Equal(t, int64(-1), s.AckedFsn()) + + for i := 0; i < 3; i++ { + require.NoError(t, s.Table("t").Int64Column("v", int64(i)).AtNow(context.Background())) + } + require.NoError(t, s.Flush(context.Background())) + + require.Eventually(t, func() bool { + return s.AckedFsn() == engine.enginePublishedFsn() + }, 2*time.Second, 1*time.Millisecond) + assert.GreaterOrEqual(t, s.AckedFsn(), int64(0)) +} + +func TestQwpCursorSenderAwaitAckedFsnHappyPath(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer srv.Close() + + // autoFlushRows=2 → enqueue happens without blocking on ACK, + // so AwaitAckedFsn does meaningful waiting work. + s, engine, _, cleanup := newCursorSenderForTest(t, srv, 2) + defer cleanup() + + for i := 0; i < 4; i++ { + require.NoError(t, s.Table("t").Int64Column("v", int64(i)).AtNow(context.Background())) + } + target := engine.enginePublishedFsn() + require.GreaterOrEqual(t, target, int64(0), "auto-flush should have published at least one frame") + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + require.NoError(t, s.AwaitAckedFsn(ctx, target)) + assert.GreaterOrEqual(t, s.AckedFsn(), target) +} + +func TestQwpCursorSenderAwaitAckedFsnTimeout(t *testing.T) { + srv := newSilentAckServer(t) + defer srv.Close() + + engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + transport, err := qwpSfDialFor(srv)(context.Background(), 0) + require.NoError(t, err) + loop := qwpSfNewSendLoop(engine, transport, qwpSfDialFor(srv), + 100*time.Microsecond, 5*time.Second, 10*time.Millisecond, 100*time.Millisecond) + loop.sendLoopStart() + // autoFlushRows=1 enqueues the row into the engine on AtNow, + // without blocking on ACK — exactly the auto-flush path users + // pair with AwaitAckedFsn. closeTimeout=100ms keeps the deferred + // Close fast (the server never ACKs). 
+ s, err := newQwpCursorLineSender(1, 0, 0, 0, engine, loop, 100*time.Millisecond) + require.NoError(t, err) + defer func() { _ = s.Close(context.Background()) }() + + require.NoError(t, s.Table("t").Int64Column("v", 1).AtNow(context.Background())) + require.Eventually(t, func() bool { + return engine.enginePublishedFsn() >= 0 + }, time.Second, time.Millisecond, "auto-flush should have published the frame") + target := engine.enginePublishedFsn() + + ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) + defer cancel() + start := time.Now() + err = s.AwaitAckedFsn(ctx, target) + elapsed := time.Since(start) + require.ErrorIs(t, err, context.DeadlineExceeded, "no ACK was ever sent — must time out") + assert.GreaterOrEqual(t, elapsed, 50*time.Millisecond) + assert.Less(t, elapsed, time.Second) +} + +// TestQwpCursorSenderAwaitAckedFsnConcurrentClose verifies that a +// concurrent Close() unblocks an in-flight AwaitAckedFsn instead of +// letting it spin until the caller's ctx fires. The send loop halts +// on close and ackedFsn freezes below target, so the poll loop must +// observe s.closed and fail fast with errClosedSenderFlush. +func TestQwpCursorSenderAwaitAckedFsnConcurrentClose(t *testing.T) { + srv := newSilentAckServer(t) + defer srv.Close() + + engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + transport, err := qwpSfDialFor(srv)(context.Background(), 0) + require.NoError(t, err) + loop := qwpSfNewSendLoop(engine, transport, qwpSfDialFor(srv), + 100*time.Microsecond, 5*time.Second, 10*time.Millisecond, 100*time.Millisecond) + loop.sendLoopStart() + // closeTimeout=0 skips the drain entirely so Close races straight + // into sendLoopClose — the most aggressive shape of the race. 
+ s, err := newQwpCursorLineSender(1, 0, 0, 0, engine, loop, 0) + require.NoError(t, err) + + require.NoError(t, s.Table("t").Int64Column("v", 1).AtNow(context.Background())) + require.Eventually(t, func() bool { + return engine.enginePublishedFsn() >= 0 + }, time.Second, time.Millisecond, "auto-flush should have published the frame") + target := engine.enginePublishedFsn() + + // Long ctx so a hang would manifest as a 5s test stall rather + // than masquerading as a DeadlineExceeded. + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + awaitErr := make(chan error, 1) + go func() { + awaitErr <- s.AwaitAckedFsn(ctx, target) + }() + + // Give AwaitAckedFsn a moment to enter its poll loop, then close. + time.Sleep(20 * time.Millisecond) + require.NoError(t, s.Close(context.Background())) + + select { + case err := <-awaitErr: + require.ErrorIs(t, err, errClosedSenderFlush, + "AwaitAckedFsn must surface errClosedSenderFlush when Close races in mid-poll") + case <-time.After(500 * time.Millisecond): + t.Fatal("AwaitAckedFsn did not return after Close — close-observation in the poll loop is missing") + } +} + +func TestQwpSenderAwaitAckedFsnAlreadyAcked(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer srv.Close() + + s, engine, _, cleanup := newCursorSenderForTest(t, srv, 0) + defer cleanup() + + require.NoError(t, s.Table("t").Int64Column("v", 1).AtNow(context.Background())) + require.NoError(t, s.Flush(context.Background())) + + // Flush publishes the batch but does not wait for the ACK; the + // in-process test server ACKs almost immediately, so by the time + // AwaitAckedFsn runs the engine's acked FSN has reached the + // published target and it short-circuits without consuming the + // deadline. 
+ target := engine.enginePublishedFsn() + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + start := time.Now() + require.NoError(t, s.AwaitAckedFsn(ctx, target)) + assert.Less(t, time.Since(start), 50*time.Millisecond, + "AwaitAckedFsn must short-circuit when target is already met") + + // A negative target is trivially reached, even with an + // already-cancelled context (the pre-loop check returns first). + cancelled, cancelFn := context.WithCancel(context.Background()) + cancelFn() + require.NoError(t, s.AwaitAckedFsn(cancelled, -1)) +} + +// stableGoroutineCount returns runtime.NumGoroutine() once it has +// settled: it GCs and samples until two successive reads agree (or a +// bounded number of attempts elapse), so a transient teardown +// goroutine doesn't poison the sample. +func stableGoroutineCount() int { + prev := -1 + for i := 0; i < 50; i++ { + runtime.GC() + time.Sleep(10 * time.Millisecond) + n := runtime.NumGoroutine() + if n == prev { + return n + } + prev = n + } + return prev +} + +// TestQwpCursorNoGoroutineLeakOnClose re-creates the goroutine-leak +// coverage that the removed TestQwpAsyncGoroutineLeakOnClose provided +// for the old async state. The cursor model spawns *more* goroutines +// than the async one did — per sender: run(), plus a senderLoop and a +// receiverLoop per connection — all of which Close()/sendLoopClose() +// must join. A leak of even one of them per sender would be invisible +// to every other cursor test (they each build exactly one sender), +// so this drives many open/send/flush/close cycles and asserts the +// goroutine count does not grow with the cycle count. 
+func TestQwpCursorNoGoroutineLeakOnClose(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer srv.Close() + + runCycle := func() { + s, engine, _, cleanup := newCursorSenderForTest(t, srv, 0) + require.NoError(t, s.Table("t").Int64Column("v", 1).AtNow(context.Background())) + require.NoError(t, s.Flush(context.Background())) + require.Eventually(t, func() bool { + return engine.engineAckedFsn() >= engine.enginePublishedFsn() + }, 2*time.Second, 1*time.Millisecond, "frame never ACKed") + cleanup() // Close(): joins run() + sender/receiver goroutines. + } + + // Warm-up cycle so the httptest accept machinery and any + // once-initialized globals are already counted in the baseline. + runCycle() + base := stableGoroutineCount() + + const cycles = 25 + for i := 0; i < cycles; i++ { + runCycle() + } + + // Teardown is partly asynchronous (server-side WS conn goroutines + // unwind once the client drops the transport), so give it time to + // settle. A per-cycle leak across run()/senderLoop/receiverLoop + // would add ~3×25 goroutines — far past the constant slack — so + // this stays sensitive without flaking on transient runtime/server + // goroutines. 
+ const slack = 8 + var got int + require.Eventuallyf(t, func() bool { + got = stableGoroutineCount() + return got <= base+slack + }, 10*time.Second, 100*time.Millisecond, + "goroutine count did not return to baseline after %d cursor "+ + "open/send/flush/close cycles", cycles) + assert.LessOrEqualf(t, got, base+slack, + "goroutine count grew from %d to %d across %d cycles — Close "+ + "is leaking cursor send-loop goroutines", base, got, cycles) +} diff --git a/qwp_sender_error_api_test.go b/qwp_sender_error_api_test.go new file mode 100644 index 00000000..63cb6e22 --- /dev/null +++ b/qwp_sender_error_api_test.go @@ -0,0 +1,242 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ******************************************************************************/ + +package questdb + +import ( + "context" + "errors" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestQwpSenderLastTerminalErrorAndCounters drives a HALT-policy +// rejection and asserts: +// - LastTerminalError returns the typed payload +// - TotalServerErrors is 1 +// - errors.As on Flush unwraps the same SenderError +// - FlushAndGetSequence returns the expected published FSN before +// the rejection +func TestQwpSenderLastTerminalErrorAndCounters(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{rejectStatus: QwpStatusParseError}) + defer srv.Close() + + s, _, loop, cleanup := newCursorSenderForTest(t, srv, 0) + defer cleanup() + + // First Flush enqueues a row; the receiver classifies the rejection. + require.NoError(t, s.Table("t").Int64Column("v", 1).AtNow(context.Background())) + _, _ = s.FlushAndGetSequence(context.Background()) + + require.Eventually(t, func() bool { + return loop.sendLoopCheckError() != nil + }, 2*time.Second, 1*time.Millisecond) + + se := s.LastTerminalError() + require.NotNil(t, se, "LastTerminalError should be non-nil after halt") + assert.Equal(t, CategoryParseError, se.Category) + assert.Equal(t, PolicyHalt, se.AppliedPolicy) + assert.Equal(t, int(QwpStatusParseError), se.ServerStatusByte) + assert.GreaterOrEqual(t, s.TotalServerErrors(), int64(1)) + + // The next producer call (AtNow, after Table() polls the terminal + // latch) returns the typed *SenderError unwrappable via errors.As. 
+ err := s.Table("t").Int64Column("v", 2).AtNow(context.Background()) + require.Error(t, err) + var unwrapped *SenderError + require.True(t, errors.As(err, &unwrapped), + "expected *SenderError, got %T: %v", err, err) + assert.Equal(t, CategoryParseError, unwrapped.Category) + assert.Contains(t, unwrapped.ServerMessage, "rejected") +} + +// TestQwpSenderFlushAndGetSequenceHappyPath asserts the returned FSN +// monotonically increases across successful flushes. +func TestQwpSenderFlushAndGetSequenceHappyPath(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer srv.Close() + + s, _, _, cleanup := newCursorSenderForTest(t, srv, 0) + defer cleanup() + + require.NoError(t, s.Table("t").Int64Column("v", 1).AtNow(context.Background())) + fsn1, err := s.FlushAndGetSequence(context.Background()) + require.NoError(t, err) + assert.GreaterOrEqual(t, fsn1, int64(0)) + + require.NoError(t, s.Table("t").Int64Column("v", 2).AtNow(context.Background())) + fsn2, err := s.FlushAndGetSequence(context.Background()) + require.NoError(t, err) + assert.Greater(t, fsn2, fsn1, "FSN should advance across flushes") + + // Empty FlushAndGetSequence returns the current published FSN + // without error. + fsn3, err := s.FlushAndGetSequence(context.Background()) + require.NoError(t, err) + assert.Equal(t, fsn2, fsn3, "empty FlushAndGetSequence should not advance FSN") +} + +// TestQwpSenderHandlerInvokedOnDrop wires a custom error handler via +// the loop setter, drives a Drop-policy rejection, and asserts the +// handler observes the SenderError before LastTerminalError stays nil +// (Drop does not latch a terminal error). 
+func TestQwpSenderHandlerInvokedOnDrop(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{ + rejectStatus: QwpStatusSchemaMismatch, // default Drop + rejectFirstNFrames: 1, + }) + defer srv.Close() + + s, _, loop, cleanup := newCursorSenderForTest(t, srv, 0) + defer cleanup() + + // We need a handler to capture deliveries; default loud handler + // just logs. Inject via loop setter (legitimate during test + // since the sender is built but not yet receiving frames). + gotCh := make(chan *SenderError, 4) + loop.sendLoopSetErrorHandler(func(e *SenderError) { + select { + case gotCh <- e: + default: + } + }, 16) + + // Need a fresh batch to actually send. + require.NoError(t, s.Table("t").Int64Column("v", 1).AtNow(context.Background())) + require.NoError(t, s.Flush(context.Background())) + require.NoError(t, s.Table("t").Int64Column("v", 2).AtNow(context.Background())) + require.NoError(t, s.Flush(context.Background())) + + select { + case se := <-gotCh: + assert.Equal(t, CategorySchemaMismatch, se.Category) + assert.Equal(t, PolicyDropAndContinue, se.AppliedPolicy) + case <-time.After(3 * time.Second): + t.Fatal("handler not invoked within deadline") + } + // Drop does NOT latch terminal. + assert.Nil(t, s.LastTerminalError()) + // totalServerErrors saw the Drop. + assert.GreaterOrEqual(t, s.TotalServerErrors(), int64(1)) + assert.GreaterOrEqual(t, s.TotalErrorNotificationsDelivered(), int64(1)) +} + +// TestQwpSenderInboxOverflowBumpsCounter asserts that flooding a slow +// handler bumps DroppedErrorNotifications without stalling the I/O +// path. 
+func TestQwpSenderInboxOverflowBumpsCounter(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{ + rejectStatus: QwpStatusSchemaMismatch, + }) + defer srv.Close() + + s, _, loop, cleanup := newCursorSenderForTest(t, srv, 0) + defer cleanup() + + release := make(chan struct{}) + loop.sendLoopSetErrorHandler(func(e *SenderError) { + <-release + }, qwpSfMinErrorInboxCapacity) + defer close(release) + + for i := 0; i < 200; i++ { + require.NoError(t, s.Table("t").Int64Column("v", int64(i)).AtNow(context.Background())) + require.NoError(t, s.Flush(context.Background())) + } + require.Eventually(t, func() bool { + return s.DroppedErrorNotifications() > 0 + }, 5*time.Second, 10*time.Millisecond, + "DroppedErrorNotifications never increased: dropped=%d delivered=%d", + s.DroppedErrorNotifications(), s.TotalErrorNotificationsDelivered()) +} + +// TestQwpSenderLastTerminalErrorMessageContainsServerMessage drives +// rejection with an explicit message and asserts the message survives +// to the SenderError's ServerMessage field. 
+func TestQwpSenderLastTerminalErrorMessageContainsServerMessage(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{rejectStatus: QwpStatusInternalError}) + defer srv.Close() + + s, _, loop, cleanup := newCursorSenderForTest(t, srv, 0) + defer cleanup() + + require.NoError(t, s.Table("t").Int64Column("v", 1).AtNow(context.Background())) + _ = s.Flush(context.Background()) + + require.Eventually(t, func() bool { + return loop.sendLoopCheckError() != nil + }, 2*time.Second, 1*time.Millisecond) + + se := s.LastTerminalError() + require.NotNil(t, se) + assert.True(t, strings.Contains(se.ServerMessage, "rejected"), + "expected 'rejected' in ServerMessage, got %q", se.ServerMessage) +} + +// TestDeprecatedQwpErrorBridge pins the v4.2.0 compatibility shim: the +// historical errors.As(err, &qwpErr) pattern must keep working against +// a *SenderError, with the documented field mapping, while the new +// errors.As(err, &se) path is left intact. +func TestDeprecatedQwpErrorBridge(t *testing.T) { + var err error = &SenderError{ + Category: CategorySchemaMismatch, + ServerStatusByte: int(QwpStatusSchemaMismatch), + ServerMessage: "column type mismatch", + MessageSequence: 42, + FromFsn: 10, + ToFsn: 12, + } + + // Adding (*SenderError).As must not shadow the direct unwrap. + var se *SenderError + require.True(t, errors.As(err, &se)) + assert.Equal(t, CategorySchemaMismatch, se.Category) + + // Historical pattern keeps compiling and is populated. + var qwpErr *QwpError + require.True(t, errors.As(err, &qwpErr)) + assert.Equal(t, QwpStatusSchemaMismatch, qwpErr.Status) + assert.Equal(t, int64(42), qwpErr.Sequence) + assert.Equal(t, "column type mismatch", qwpErr.Message) + assert.Equal(t, + "qwp: server error SCHEMA_MISMATCH (0x03): column type mismatch", + qwpErr.Error()) + + // Protocol violations carried no status byte in v4.2.0; the shim + // reports the zero (OK) byte rather than the -1 sentinel. 
+ err = &SenderError{ + Category: CategoryProtocolViolation, + ServerStatusByte: NoStatusByte, + MessageSequence: NoMessageSequence, + ServerMessage: "policy violation", + } + qwpErr = nil + require.True(t, errors.As(err, &qwpErr)) + assert.Equal(t, QwpStatusCode(0), qwpErr.Status) + assert.Equal(t, "policy violation", qwpErr.Message) +} diff --git a/qwp_sender_fuzz_test.go b/qwp_sender_fuzz_test.go new file mode 100644 index 00000000..f4d1d98a --- /dev/null +++ b/qwp_sender_fuzz_test.go @@ -0,0 +1,1601 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//go:build !windows + +package questdb + +// Go port of QuestDB's QwpSenderFuzzTest (e2e package), slice S1: the +// shared runner plus the simplest entry point (testLoad — default +// fuzz, symbols on, no reorder/skip/dup/new-col/non-ASCII/diff-case +// fuzz tweaks). The class has 27 @Test methods overall; the bulk of +// the work is the runner this file ships, and each remaining variant +// becomes a small entry-point that calls into it with different +// senderFuzzFuzz parameters. 
+// +// Faithful-port re-architecture (cf. the four ingress-oracle slices): +// +// - The Java oracle (TableData / LineData) compares cursor-printer +// text. The Go port stores typed values per cell and verifies +// via the QWP query client cell-by-cell (same approach as the +// ingress-oracle tests). The "what is the property under test" +// stays the same; the assertion mechanism is Go-idiomatic and +// avoids coupling to the server's CursorPrinter text format. +// - Server tables are NOT pre-created. The producers' first writes +// auto-create each table + its column set on the QuestDB side +// (the test's whole premise). dropAllTables before / after via +// t.Cleanup makes the test fixture-state-independent. +// - Shared atomic timestamp counter (Java AtomicLong) → +// sync/atomic.Int64 — guarantees globally-unique (table, ts) pairs +// across all producer goroutines so there are no ts ties. +// - Per-row "postfix" for STRING/SYMBOL values uses printable ASCII +// A–Z for S1. Java emits a random char from the full BMP; that +// fragility (unpaired surrogates etc.) is replaced with deterministic +// ASCII here. Non-ASCII postfixes are the explicit job of the +// senderFuzzFuzz.nonAsciiValueFactor variant (future slice S2). +// - Row counts are CI-bounded compared to Java; the property under +// test (multi-table multi-thread concurrent ingest, per-type +// round-trip across the wire, no row loss) is unchanged. +// - The Go QwpSender enforces "no same column twice in one row" +// at the client (Java sends both and lets the server apply LWW); +// senderFuzzAddColumnValue early-returns on a duplicate key. The +// duplicatesFactor mechanic therefore reduces to a no-op at the +// wire level in Go; we still record the first value in the +// oracle, so the read-back matches what actually landed. +// - Reproducible via QWP_FUZZ_SEED (shared newFuzzRand). 
+// +// S3 (this slice) adds the concurrent ALTER COLUMN TYPE thread that +// runs in parallel with the producers when fuzz.columnConvertProb > 0 +// (Java startAlterTableThread). The producers keep emitting the +// original wire type for each column; the WAL apply layer casts to +// the column's current storage type. The value bases were chosen so +// every conversion in the matrix below is lossless: +// +// - STRING ↔ SYMBOL ↔ VARCHAR — same string bytes, dictionary or +// length-prefix encoding only. +// - BYTE ↔ SHORT ↔ INT ↔ LONG — integer-family bases capped at the +// BYTE range (max 119 for "11"*10+9). +// - FLOAT ↔ DOUBLE — values are integer-valued floats (e.g. 70.0), +// exactly representable in both widths. +// - TIMESTAMP → LONG (one-way; raw microsecond int64). +// +// The assertion dispatches on the column's CURRENT wire type +// (b.ColumnType(ci)) rather than the oracle-stored type, so a column +// originally written as INT and altered to BYTE reads via Int8 and +// still matches the stored int64. +// +// Backlog (out of scope): +// - Server-buffer tuning tests (testLoadSmallBuffer, +// forceRecvFragmentationChunkSize) — server-side knob, not +// reachable from a network client without a per-test fixture +// boot. + +import ( + "context" + "fmt" + "math/big" + "math/rand" + "sort" + "strconv" + "strings" + "sync" + "sync/atomic" + "testing" + "time" +) + +// --- column / symbol catalog (mirrors Java QwpSenderFuzzTest fields) + +// senderFuzzColType identifies a column's logical type for the +// per-type wire emission. Symbols are emitted via Symbol() rather +// than a typed Column(), but share the same value-derivation path so +// they live in the same enum. 
+type senderFuzzColType int + +const ( + sftString senderFuzzColType = iota + sftDouble + sftByte + sftShort + sftInt + sftFloat + sftChar + sftUUID + sftLong256 + sftTsNano + sftSymbol +) + +// senderFuzzLegacyColumnCount: the first 6 entries in the column +// catalog are STRING/DOUBLE (the legacy ILP types the original Java +// test grew out of). The 8 typed columns that follow are always set +// on every row — skipColumns / addNewColumn restrict their eligible +// pool to legacy indexes so an unset typed cell never appears in +// the oracle (cf. file header note on type-default rendering). +const senderFuzzLegacyColumnCount = 6 + +// senderFuzzNewColumnRandomizeFactor is the postfix range for +// auto-injected "new column" names (e.g. "temperature0" vs +// "temperature1"). Mirrors Java NEW_COLUMN_RANDOMIZE_FACTOR. Unused +// in S1 (newColumnFactor=-1 = off); defined for future slices. +const senderFuzzNewColumnRandomizeFactor = 2 + +// senderFuzzColNameBases catalogs the case variants per column slot. +// Index 0 is the canonical (lowercase) form; indices 1+ are +// case-vary variants for diffCasesInColNames fuzz. QuestDB treats +// column names case-insensitively, so the oracle keys by +// strings.ToLower(name). 
+var senderFuzzColNameBases = [][]string{ + {"terület", "TERÜLet", "tERülET", "TERÜLET"}, + {"temperature", "TEMPERATURE", "Temperature", "TempeRaTuRe"}, + {"humidity", "HUMIdity", "HumiditY", "HUmiDIty", "HUMIDITY", "Humidity"}, + {"hőmérséklet", "HŐMÉRSÉKLET", "HŐmérséKLEt", "hőMÉRséKlET"}, + {"notes", "NOTES", "NotEs", "noTeS"}, + {"ветер", "Ветер", "ВЕТЕР", "вЕТЕр", "ВетЕР"}, + {"pressure_b", "PRESSURE_B", "Pressure_B"}, + {"pressure_s", "PRESSURE_S", "Pressure_S"}, + {"pressure_i", "PRESSURE_I", "Pressure_I"}, + {"pressure_f", "PRESSURE_F", "Pressure_F"}, + {"flag_c", "FLAG_C", "Flag_C"}, + {"sensor_id_u", "SENSOR_ID_U", "Sensor_Id_U"}, + {"token_l256", "TOKEN_L256", "Token_L256"}, + {"event_at_ns", "EVENT_AT_NS", "Event_At_Ns"}, +} + +var senderFuzzColTypes = []senderFuzzColType{ + sftString, sftDouble, sftDouble, sftDouble, sftString, sftDouble, // legacy 6 + sftByte, sftShort, sftInt, sftFloat, sftChar, sftUUID, sftLong256, sftTsNano, +} + +// senderFuzzColValueBases drives per-row value derivation. The +// integer-family bases (BYTE/SHORT/INT/FLOAT, indices 6..9) are +// chosen so that base*10+digit always fits in the smallest target +// type (BYTE) — once the future ALTER COLUMN TYPE slice narrows a +// column across the integer family, every previously-written value +// still casts losslessly. +var senderFuzzColValueBases = []string{ + "europe", "8", "2", "1", "note", "6", + "5", "9", "11", "7", "M", "u", "l", "1700000000000000000", +} + +var senderFuzzSymbolNameBases = [][]string{ + {"location", "Location", "LOCATION", "loCATion", "LocATioN"}, + {"city", "ciTY", "CITY"}, +} + +var senderFuzzSymbolValueBases = []string{"us-midwest", "London"} + +// senderFuzzNonAsciiChars spans the BMP byte-length spectrum (2/3 +// byte UTF-8) so the wire path exercises multi-byte encoding without +// touching the surrogate pair edge cases — mirrors Java's +// nonAsciiChars (no astral plane chars; all single Go runes). 
+var senderFuzzNonAsciiChars = []rune{ + 'ó', 'í', 'Á', 'ч', 'Ъ', 'Ж', 'ю', 0x3000, 0x3080, 0x3a55, +} + +const senderFuzzBatchSize = 10 + +// senderFuzzTableNameRandomizeFactor controls the random table-name +// casing on each per-row pick (`WEATHERn` vs `weathern`). QuestDB +// resolves table names case-insensitively, so both forms target the +// same table. +const senderFuzzTableNameRandomizeFactor = 2 + +// senderFuzzMaxNumOfSkippedCols caps how many legacy STRING/DOUBLE +// columns the skipColumns fuzz may remove from one row (Java +// MAX_NUM_OF_SKIPPED_COLS). Typed columns are never eligible. +const senderFuzzMaxNumOfSkippedCols = 2 + +// senderFuzzSymbolsWithSpaceRandomizeFactor: when sendSymbolsWithSpace +// is on, ~50% of symbol value emissions get double-spaces injected at +// a random position. Mirrors Java SEND_SYMBOLS_WITH_SPACE_RANDOMIZE_FACTOR. +const senderFuzzSymbolsWithSpaceRandomizeFactor = 2 + +// --- per-row data + per-table oracle ------------------------------ + +// senderFuzzCell stores the typed value emitted by the sender for a +// single (row, column). On verification we read the typed value back +// through QwpColumnBatch and compare in the same type. +type senderFuzzCell struct { + typ senderFuzzColType + s string + f64 float64 + i64 int64 + ch rune + // uuid limbs + uhi, ulo uint64 + // long256 limbs (l256[0] = LSB) + l256 [4]int64 +} + +// senderFuzzRow groups one batch of cells per timestamp. Because the +// shared atomic timestamp counter guarantees globally-unique ts +// across producers, each row owns its ts unambiguously. 
+type senderFuzzRow struct { + ts int64 // microseconds + cells map[string]senderFuzzCell +} + +func newSenderFuzzRow(ts int64) *senderFuzzRow { + return &senderFuzzRow{ts: ts, cells: make(map[string]senderFuzzCell, 16)} +} + +// senderFuzzTable is the per-table oracle: rows appended in producer +// order under a lock (concurrent producers can hit the same table), +// then sorted by ts at assertion time to match `ORDER BY ts`. The +// colNames set is the union of every column ever written across the +// table's rows — the assertion uses it to verify that columns the +// schema has but a particular row didn't write are NULL on read-back +// (matches Java's TableData.generateRows NULL-fill behaviour). +type senderFuzzTable struct { + mu sync.Mutex + name string // canonical lowercase + rows []*senderFuzzRow + colNames map[string]struct{} +} + +func newSenderFuzzTable(name string) *senderFuzzTable { + return &senderFuzzTable{name: name, colNames: make(map[string]struct{}, 32)} +} + +func (t *senderFuzzTable) addRow(r *senderFuzzRow) { + t.mu.Lock() + t.rows = append(t.rows, r) + for k := range r.cells { + t.colNames[k] = struct{}{} + } + t.mu.Unlock() +} + +func (t *senderFuzzTable) size() int { + t.mu.Lock() + defer t.mu.Unlock() + return len(t.rows) +} + +// snapshotRowsSorted returns a ts-sorted copy of the table's rows. +func (t *senderFuzzTable) snapshotRowsSorted() []*senderFuzzRow { + t.mu.Lock() + out := make([]*senderFuzzRow, len(t.rows)) + copy(out, t.rows) + t.mu.Unlock() + sort.Slice(out, func(i, j int) bool { return out[i].ts < out[j].ts }) + return out +} + +// --- parameter structs -------------------------------------------- + +// senderFuzzLoad mirrors Java initLoadParameters. Each producer +// runs numIterations × numLines rows distributed across numTables +// tables, with an optional sleep between iterations. +// +// clientAutoFlushRows, when > 0, adds auto_flush_rows=N to the QWP +// connect string so the sender flushes every N rows. 
Used by tests +// whose fuzz config inflates per-batch frame size past the default +// server recv buffer (mirrors Java's clientAutoFlushRows). +type senderFuzzLoad struct { + numLines int + numIterations int + numThreads int + numTables int + waitMs int + clientAutoFlushRows int +} + +// senderFuzzFuzz mirrors Java initFuzzParameters. -1 means "off" +// for every factor; exerciseSymbols defaults to true (the testLoad +// path). +type senderFuzzFuzz struct { + duplicatesFactor int + columnReorderingFactor int + columnSkipFactor int + newColumnFactor int + nonAsciiValueFactor int + diffCasesInColNames bool + exerciseSymbols bool + sendSymbolsWithSpace bool + columnConvertProb float64 +} + +func defaultSenderFuzzFuzz() senderFuzzFuzz { + return senderFuzzFuzz{ + duplicatesFactor: -1, + columnReorderingFactor: -1, + columnSkipFactor: -1, + newColumnFactor: -1, + nonAsciiValueFactor: -1, + diffCasesInColNames: false, + exerciseSymbols: true, + sendSymbolsWithSpace: false, + columnConvertProb: 0, + } +} + +// --- generation helpers ------------------------------------------- + +// senderFuzzShouldFuzz: a fuzz factor of -1 (or 0) means "off"; any +// positive N fires the fuzz on ~1/N of calls. Mirrors Java +// shouldFuzz. +func senderFuzzShouldFuzz(rnd *rand.Rand, factor int) bool { + return factor > 0 && rnd.Intn(factor) == 0 +} + +// senderFuzzGenerateName picks one case variant for a column / +// symbol name. Used both for catalogued names and for the +// auto-injected new-column names; postfix is non-empty when called +// from the new-column path so the generated identifier doesn't +// collide with a catalogued one. 
+func senderFuzzGenerateName(bases []string, diffCases, randomize bool, rnd *rand.Rand) string { + caseIdx := 0 + if diffCases { + caseIdx = rnd.Intn(len(bases)) + } + postfix := "" + if randomize { + postfix = strconv.Itoa(rnd.Intn(senderFuzzNewColumnRandomizeFactor)) + } + return bases[caseIdx] + postfix +} + +func senderFuzzGenerateColumnName(idx int, randomize bool, fuzz senderFuzzFuzz, rnd *rand.Rand) string { + return senderFuzzGenerateName(senderFuzzColNameBases[idx], fuzz.diffCasesInColNames, randomize, rnd) +} + +func senderFuzzGenerateSymbolName(idx int, randomize bool, fuzz senderFuzzFuzz, rnd *rand.Rand) string { + return senderFuzzGenerateName(senderFuzzSymbolNameBases[idx], fuzz.diffCasesInColNames, randomize, rnd) +} + +// senderFuzzPickTableName randomly selects one of numTables, with a +// random uppercase/lowercase prefix on each call (QuestDB resolves +// table names case-insensitively). +func senderFuzzPickTableName(numTables int, rnd *rand.Rand) string { + prefix := "weather" + if rnd.Intn(senderFuzzTableNameRandomizeFactor) == 0 { + prefix = "WEATHER" + } + return prefix + strconv.Itoa(rnd.Intn(numTables)) +} + +// senderFuzzPostfixChar returns the single-character suffix appended +// to STRING/SYMBOL value bases. With nonAsciiValueFactor > 0, a +// matching ratio of calls returns a BMP non-ASCII char from the +// catalog (2/3-byte UTF-8) — exercises multi-byte encoding on the +// wire. Otherwise a printable-ASCII letter (Java emits a random +// BMP char; the surrogate edge cases that fragility implies aren't +// the property under test). +func senderFuzzPostfixChar(fuzz senderFuzzFuzz, rnd *rand.Rand) string { + if senderFuzzShouldFuzz(rnd, fuzz.nonAsciiValueFactor) { + return string(senderFuzzNonAsciiChars[rnd.Intn(len(senderFuzzNonAsciiChars))]) + } + return string(rune('A' + rnd.Intn(26))) +} + +// senderFuzzAddColumnValue emits one (typed) column over the QWP +// sender AND records the typed value in the oracle row. 
// Faithful to Java QwpSenderFuzzTest.addColumnValue with the
// CursorPrinter "yield text" removed (we compare typed cells, not
// rendered strings).
//
// typ selects both the sender call and the senderFuzzCell field that
// stores the value; valueBase is the catalogued base the random
// value is derived from; the oracle key is the lower-cased colName.
// A typ with no matching case emits nothing and records nothing.
func senderFuzzAddColumnValue(
	typ senderFuzzColType,
	valueBase string,
	colName string,
	qs QwpSender,
	row *senderFuzzRow,
	fuzz senderFuzzFuzz,
	rnd *rand.Rand,
) {
	key := strings.ToLower(colName)
	// Go-divergence vs Java: the Go QwpSender enforces "no same column
	// twice in one row" at the client side (Java sends both writes
	// and lets the server apply LWW). Skip the second emission and
	// keep the first value in the oracle to match the wire reality.
	// Affects: the duplicatesFactor mechanic becomes a client-side
	// no-op; addNewColumn / addNewSymbol attempts that collide on the
	// generated random postfix likewise skip. Documented in the file
	// header.
	if _, exists := row.cells[key]; exists {
		return
	}
	switch typ {
	case sftDouble:
		// Atoi error ignored: a non-numeric base yields 0.
		base, _ := strconv.Atoi(valueBase)
		v := float64(base*10 + rnd.Intn(9))
		qs.Float64Column(colName, v)
		row.cells[key] = senderFuzzCell{typ: typ, f64: v}
	case sftString:
		s := valueBase + senderFuzzPostfixChar(fuzz, rnd)
		qs.StringColumn(colName, s)
		row.cells[key] = senderFuzzCell{typ: typ, s: s}
	case sftSymbol:
		base := valueBase
		if fuzz.sendSymbolsWithSpace && rnd.Intn(senderFuzzSymbolsWithSpaceRandomizeFactor) == 0 && len(base) > 1 {
			// Inject double-space at a random interior position
			// (mirrors Java sendSymbolsWithSpace branch).
			// NOTE(review): the literal below reads as a single space
			// in this view while the comment says double-space, and
			// spaceIdx can be 0 (a leading space) — confirm against
			// the upstream file / Java reference.
			spaceIdx := rnd.Intn(len(base) - 1)
			base = base[:spaceIdx] + " " + base[spaceIdx:]
		}
		s := base + senderFuzzPostfixChar(fuzz, rnd)
		qs.Symbol(colName, s)
		row.cells[key] = senderFuzzCell{typ: typ, s: s}
	case sftByte:
		base, _ := strconv.Atoi(valueBase)
		v := int8(base*10 + rnd.Intn(9))
		qs.ByteColumn(colName, v)
		row.cells[key] = senderFuzzCell{typ: typ, i64: int64(v)}
	case sftShort:
		base, _ := strconv.Atoi(valueBase)
		v := int16(base*10 + rnd.Intn(9))
		qs.ShortColumn(colName, v)
		row.cells[key] = senderFuzzCell{typ: typ, i64: int64(v)}
	case sftInt:
		base, _ := strconv.Atoi(valueBase)
		v := int32(base*10 + rnd.Intn(9))
		qs.Int32Column(colName, v)
		row.cells[key] = senderFuzzCell{typ: typ, i64: int64(v)}
	case sftFloat:
		base, _ := strconv.Atoi(valueBase)
		v := float32(base*10 + rnd.Intn(9))
		qs.Float32Column(colName, v)
		row.cells[key] = senderFuzzCell{typ: typ, f64: float64(v)}
	case sftChar:
		// First byte of the base shifted by 0..9; indexing panics on an
		// empty valueBase.
		c := rune(valueBase[0]) + rune(rnd.Intn(10))
		qs.CharColumn(colName, c)
		row.cells[key] = senderFuzzCell{typ: typ, ch: c}
	case sftUUID:
		// Force the top 32 bits of each limb non-zero so neither half
		// renders as the LONG_NULL sentinel — the same guard Java
		// applies in addColumnValue.
		hi := uint64(rnd.Int31()+1)<<32 | uint64(rnd.Uint32())
		lo := uint64(rnd.Int31()+1)<<32 | uint64(rnd.Uint32())
		qs.UuidColumn(colName, hi, lo)
		row.cells[key] = senderFuzzCell{typ: typ, uhi: hi, ulo: lo}
	case sftLong256:
		// Java sends 4 limbs LSB-first via long256Column(name, l0..l3).
		// Go's Long256Column takes a big.Int composed MSB-first. We
		// store the limbs LSB-first in the cell (l256[0] = l0 = LSB)
		// so the readback Long256Word(ci, br, w) maps directly to
		// l256[w].
		l0 := (rnd.Int63() & 0x7FFFFFFFFFFFFFFF) | 1
		l1 := (rnd.Int63() & 0x7FFFFFFFFFFFFFFF) | 1
		l2 := (rnd.Int63() & 0x7FFFFFFFFFFFFFFF) | 1
		l3 := (rnd.Int63() & 0x7FFFFFFFFFFFFFFF) | 1
		// Start from the most significant limb and shift the rest in.
		v := new(big.Int).SetUint64(uint64(l3))
		for _, limb := range []int64{l2, l1, l0} {
			v.Lsh(v, 64)
			v.Or(v, new(big.Int).SetUint64(uint64(limb)))
		}
		qs.Long256Column(colName, v)
		row.cells[key] = senderFuzzCell{typ: typ, l256: [4]int64{l0, l1, l2, l3}}
	case sftTsNano:
		// Step in microseconds off the base so the low 3 nanos are
		// always zero — matches Java's nanos = base + rnd*1000.
		base, _ := strconv.ParseInt(valueBase, 10, 64)
		nanos := base + int64(rnd.Intn(1_000_000))*1_000
		qs.TimestampNanosColumn(colName, time.Unix(0, nanos).UTC())
		row.cells[key] = senderFuzzCell{typ: typ, i64: nanos}
	}
}

// senderFuzzGenerateOrdering returns either the identity ordering or
// a shuffled permutation of [0..n), depending on columnReorderingFactor.
// Mirrors Java generateOrdering.
func senderFuzzGenerateOrdering(n, factor int, rnd *rand.Rand) []int {
	out := make([]int, n)
	for i := 0; i < n; i++ {
		out[i] = i
	}
	if senderFuzzShouldFuzz(rnd, factor) {
		rnd.Shuffle(n, func(i, j int) { out[i], out[j] = out[j], out[i] })
	}
	return out
}

// senderFuzzSkipColumns optionally removes 1..senderFuzzMaxNumOfSkippedCols
// legacy STRING/DOUBLE indexes (those < senderFuzzLegacyColumnCount)
// from the ordering. Typed columns are never eligible: an unset
// typed cell renders differently from its type-default sentinel, so
// skipping one would clash with the oracle's "absent → NULL"
// assertion once the future ALTER slice converts types across the
// integer family. Mirrors Java skipColumns.
//
// When the factor does not fire, orig itself is returned; otherwise
// the removals happen on a copy, so orig is never modified.
func senderFuzzSkipColumns(orig []int, factor int, rnd *rand.Rand) []int {
	if !senderFuzzShouldFuzz(rnd, factor) {
		return orig
	}
	out := append([]int(nil), orig...)
	numToSkip := rnd.Intn(senderFuzzMaxNumOfSkippedCols) + 1
	for i := 0; i < numToSkip; i++ {
		// Count legacy-eligible entries still in the slice.
		eligible := 0
		for _, idx := range out {
			if idx < senderFuzzLegacyColumnCount {
				eligible++
			}
		}
		if eligible == 0 {
			break
		}
		// Remove the target-th eligible entry (counting from 0).
		target := rnd.Intn(eligible)
		for j := 0; j < len(out); j++ {
			if out[j] < senderFuzzLegacyColumnCount {
				if target == 0 {
					out = append(out[:j], out[j+1:]...)
					break
				}
				target--
			}
		}
	}
	return out
}

// senderFuzzAddDuplicateColumn attempts to re-emit the same column
// (same name) with a freshly random value when duplicatesFactor
// fires. Because senderFuzzAddColumnValue returns early when the
// lower-cased name is already present in row.cells, a name that was
// just emitted for this row produces no second write and leaves the
// oracle cell untouched: the first value stays authoritative (unlike
// Java, where the server resolves duplicates as last-write-wins).
func senderFuzzAddDuplicateColumn(colIdx int, colName string, qs QwpSender, row *senderFuzzRow, fuzz senderFuzzFuzz, rnd *rand.Rand) {
	if !senderFuzzShouldFuzz(rnd, fuzz.duplicatesFactor) {
		return
	}
	senderFuzzAddColumnValue(senderFuzzColTypes[colIdx], senderFuzzColValueBases[colIdx],
		colName, qs, row, fuzz, rnd)
}

// senderFuzzAddDuplicateSymbol is the symbol counterpart of
// senderFuzzAddDuplicateColumn and is subject to the same
// already-present early return in senderFuzzAddColumnValue.
func senderFuzzAddDuplicateSymbol(symIdx int, symName string, qs QwpSender, row *senderFuzzRow, fuzz senderFuzzFuzz, rnd *rand.Rand) {
	if !senderFuzzShouldFuzz(rnd, fuzz.duplicatesFactor) {
		return
	}
	senderFuzzAddColumnValue(sftSymbol, senderFuzzSymbolValueBases[symIdx],
		symName, qs, row, fuzz, rnd)
}

// senderFuzzAddNewColumn picks a random legacy column slot, generates
// a name with a numeric postfix (so it doesn't collide with the
// catalogued name), emits its value, and records it. The server
// auto-adds the column to the table on first write; rows that
// didn't emit it appear as NULL on read.
+func senderFuzzAddNewColumn(qs QwpSender, row *senderFuzzRow, fuzz senderFuzzFuzz, rnd *rand.Rand) { + if !senderFuzzShouldFuzz(rnd, fuzz.newColumnFactor) { + return + } + extraColIdx := rnd.Intn(senderFuzzLegacyColumnCount) + colName := senderFuzzGenerateColumnName(extraColIdx, true, fuzz, rnd) + senderFuzzAddColumnValue(senderFuzzColTypes[extraColIdx], senderFuzzColValueBases[extraColIdx], + colName, qs, row, fuzz, rnd) +} + +func senderFuzzAddNewSymbol(qs QwpSender, row *senderFuzzRow, fuzz senderFuzzFuzz, rnd *rand.Rand) { + if !senderFuzzShouldFuzz(rnd, fuzz.newColumnFactor) { + return + } + extraSymIdx := rnd.Intn(len(senderFuzzSymbolNameBases)) + symName := senderFuzzGenerateSymbolName(extraSymIdx, true, fuzz, rnd) + senderFuzzAddColumnValue(sftSymbol, senderFuzzSymbolValueBases[extraSymIdx], + symName, qs, row, fuzz, rnd) +} + +// senderFuzzEmitRow emits one row through the QWP sender + records +// it in the oracle. Symbols first (the QWP ordering invariant the +// ingress-oracle ports already document), then columns. Each symbol +// /column emission may be followed by a same-cell duplicate and a +// brand-new injected column, depending on the duplicatesFactor / +// newColumnFactor settings — faithful to Java generateLine. +func senderFuzzEmitRow( + tableName string, + qs QwpSender, + row *senderFuzzRow, + fuzz senderFuzzFuzz, + rnd *rand.Rand, +) { + qs.Table(tableName) + if fuzz.exerciseSymbols { + symIndexes := senderFuzzSkipColumns( + senderFuzzGenerateOrdering(len(senderFuzzSymbolNameBases), fuzz.columnReorderingFactor, rnd), + fuzz.columnSkipFactor, rnd) + // Note: skipColumns only removes *legacy* indexes; symbol + // indexes are 0/1 (always < legacy threshold) so they ARE + // eligible for skip in Java — preserve that here. 
+ for _, symIdx := range symIndexes { + symName := senderFuzzGenerateSymbolName(symIdx, false, fuzz, rnd) + senderFuzzAddColumnValue(sftSymbol, senderFuzzSymbolValueBases[symIdx], + symName, qs, row, fuzz, rnd) + senderFuzzAddDuplicateSymbol(symIdx, symName, qs, row, fuzz, rnd) + senderFuzzAddNewSymbol(qs, row, fuzz, rnd) + } + } + colIndexes := senderFuzzSkipColumns( + senderFuzzGenerateOrdering(len(senderFuzzColNameBases), fuzz.columnReorderingFactor, rnd), + fuzz.columnSkipFactor, rnd) + for _, colIdx := range colIndexes { + colName := senderFuzzGenerateColumnName(colIdx, false, fuzz, rnd) + senderFuzzAddColumnValue(senderFuzzColTypes[colIdx], senderFuzzColValueBases[colIdx], + colName, qs, row, fuzz, rnd) + senderFuzzAddDuplicateColumn(colIdx, colName, qs, row, fuzz, rnd) + senderFuzzAddNewColumn(qs, row, fuzz, rnd) + } +} + +// --- ALTER COLUMN TYPE driver (S3) -------------------------------- + +// senderFuzzColumnInfo captures the bits of `SHOW COLUMNS` the alter +// loop needs: the column's storage name, its current type as the +// QuestDB type-name string (e.g. "INT", "SYMBOL"), and whether it is +// the table's designated timestamp (which `ALTER COLUMN TYPE` cannot +// touch). +type senderFuzzColumnInfo struct { + name string + typ string + designated bool +} + +// senderFuzzListColumns runs `SHOW COLUMNS FROM '
'` and +// returns one entry per server-side column. Returns nil + nil error +// when the table doesn't exist yet (producers race the auto-create); +// the caller treats that as "skip this attempt and try later". +func senderFuzzListColumns(srv *qwpFuzzServer, table string) ([]senderFuzzColumnInfo, error) { + res, err := srv.execSQL("SHOW COLUMNS FROM '" + table + "'") + if err != nil { + // The server returns an error for unknown tables; the alter + // loop polls into existence as producers auto-create, so this + // is the expected "not yet" path. Suppress the error to keep + // log noise low and let the caller retry. + if strings.Contains(err.Error(), "table does not exist") || + strings.Contains(err.Error(), "does not exist") { + return nil, nil + } + return nil, err + } + nameCol, typeCol, desigCol := -1, -1, -1 + for i, c := range res.Columns { + switch strings.ToLower(c.Name) { + case "column": + nameCol = i + case "type": + typeCol = i + case "designated": + desigCol = i + } + } + if nameCol < 0 || typeCol < 0 { + return nil, fmt.Errorf("SHOW COLUMNS missing expected columns (got %+v)", res.Columns) + } + out := make([]senderFuzzColumnInfo, 0, len(res.Dataset)) + for _, row := range res.Dataset { + if nameCol >= len(row) || typeCol >= len(row) { + continue + } + name, _ := row[nameCol].(string) + if name == "" { + continue + } + typ, _ := row[typeCol].(string) + designated := false + if desigCol >= 0 && desigCol < len(row) { + if b, ok := row[desigCol].(bool); ok { + designated = b + } + } + out = append(out, senderFuzzColumnInfo{name: name, typ: typ, designated: designated}) + } + return out, nil +} + +// senderFuzzChangeColumnTypeTo mirrors Java +// QwpSenderFuzzTest.changeColumnTypeTo. Returns the QuestDB type-name +// string to slot into the ALTER statement, or "" if the column's +// current type is outside the convertible set (CHAR, UUID, LONG256, +// TIMESTAMP_NANO, GEOHASH, DECIMAL*, arrays, etc.). 
func senderFuzzChangeColumnTypeTo(rnd *rand.Rand, currentType string) string {
	// The type name is matched case-insensitively. Only the string
	// family and the integer family consume randomness.
	switch strings.ToUpper(currentType) {
	case "STRING":
		if rnd.Intn(2) == 0 {
			return "SYMBOL"
		}
		return "VARCHAR"
	case "SYMBOL":
		if rnd.Intn(2) == 0 {
			return "STRING"
		}
		return "VARCHAR"
	case "VARCHAR":
		if rnd.Intn(2) == 0 {
			return "STRING"
		}
		return "SYMBOL"
	case "BYTE", "SHORT", "INT", "LONG":
		family := []string{"BYTE", "SHORT", "INT", "LONG"}
		// Redraw until the pick differs from the current type.
		for {
			t := family[rnd.Intn(len(family))]
			if !strings.EqualFold(t, currentType) {
				return t
			}
		}
	case "FLOAT":
		return "DOUBLE"
	case "DOUBLE":
		return "FLOAT"
	case "TIMESTAMP":
		return "LONG"
	}
	return ""
}

// senderFuzzAlterTableLoop runs concurrent to the producers when
// fuzz.columnConvertProb > 0. Picks a random table, queries its
// schema, picks the first convertible non-designated column from a
// random start offset, issues `ALTER TABLE … ALTER COLUMN … TYPE …`,
// sleeps 10–110 ms, repeats — until the budget is exhausted, the
// producers signal done, or onFailure fires. Mirrors Java
// startAlterTableThread.
//
// Tolerant of "type is already" (Java tolerates the same; the racy
// schema read can pick a column that was just altered to that type).
// All other server-side errors fail the test via onFailure.
//
// Only a successful ALTER decrements the budget; a tolerated error
// or a table with no convertible column just sleeps and retries.
func senderFuzzAlterTableLoop(
	srv *qwpFuzzServer,
	numTables, numLines int,
	convertProb float64,
	rnd *rand.Rand,
	producersDone <-chan struct{},
	onFailure func(error),
) {
	budgetCap := int(float64(numLines*numTables) * convertProb)
	if budgetCap <= 0 {
		return
	}
	// +1 so we always attempt at least one ALTER — rnd.Intn is half-open
	// [0, budgetCap), and a zero budget would let the loop exit without
	// exercising the ALTER path at all (a regression there would otherwise
	// pass silently). Java has the same off-by-one; we deviate intentionally.
	budget := rnd.Intn(budgetCap) + 1
	for budget > 0 {
		// Non-blocking check: stop as soon as the producers are done.
		select {
		case <-producersDone:
			return
		default:
		}
		tableName := "weather" + strconv.Itoa(rnd.Intn(numTables))
		cols, err := senderFuzzListColumns(srv, tableName)
		if err != nil {
			onFailure(fmt.Errorf("list columns %q: %w", tableName, err))
			return
		}
		if len(cols) == 0 {
			// Table not created yet (or no columns reported): retry later.
			time.Sleep(time.Duration(10+rnd.Intn(100)) * time.Millisecond)
			continue
		}
		// Scan the columns circularly from a random offset; at most one
		// ALTER is attempted per pass.
		start := rnd.Intn(len(cols))
		for k := 0; k < len(cols); k++ {
			c := cols[(start+k)%len(cols)]
			if c.designated {
				continue
			}
			newType := senderFuzzChangeColumnTypeTo(rnd, c.typ)
			if newType == "" {
				continue
			}
			sql := "ALTER TABLE '" + tableName + "' ALTER COLUMN \"" + c.name + "\" TYPE " + newType
			_, err := srv.execSQL(sql)
			if err == nil {
				budget--
			} else if !strings.Contains(err.Error(), "type is already") {
				onFailure(fmt.Errorf("ALTER %s.%s -> %s: %w", tableName, c.name, newType, err))
				return
			}
			break
		}
		time.Sleep(time.Duration(10+rnd.Intn(100)) * time.Millisecond)
	}
}

// --- runner -------------------------------------------------------

// senderFuzzRunTest spawns load.numThreads producer goroutines, each
// running load.numIterations × load.numLines rows distributed across
// load.numTables tables. After every producer finishes, drains WAL
// for every table that received rows and asserts the table contents
// cell-by-cell against the oracle.
//
// The runner is the foundational piece every QwpSenderFuzzTest
// scenario consumes; each future entry point is just a small
// configuration of (senderFuzzLoad, senderFuzzFuzz) calling here.
//
// rnd is only used on the calling goroutine, to derive one seed for
// the alter loop and one per producer; each goroutine then builds
// its own rand.Rand from its seed.
func senderFuzzRunTest(t *testing.T, srv *qwpFuzzServer, load senderFuzzLoad, fuzz senderFuzzFuzz, rnd *rand.Rand) {
	t.Helper()

	// One oracle per logical table (canonical lowercase name).
	oracles := make(map[string]*senderFuzzTable, load.numTables)
	for i := 0; i < load.numTables; i++ {
		name := "weather" + strconv.Itoa(i)
		oracles[name] = newSenderFuzzTable(name)
	}

	// Shared atomic ts counter (Java AtomicLong timestampMicros) —
	// every row gets a globally-unique microsecond timestamp, so
	// no two rows ever collide on ts.
	var tsCounter atomic.Int64
	tsCounter.Store(1_465_839_830_102_300)

	// Wipe any leftover tables from a previous test run, and ensure
	// the same on exit. dropAllTables is the fixture's "clean slate"
	// primitive — this slice is its first consumer.
	srv.dropAllTables(t)
	t.Cleanup(func() { srv.dropAllTables(t) })

	// Concurrent ALTER COLUMN TYPE thread (Java startAlterTableThread).
	// Started before the producers so racy alters interleave with the
	// very first batches; producers signal completion via producersDone
	// and we join the goroutine BEFORE the assertion runs so the schema
	// is stable when the oracle reads it back.
	producersDone := make(chan struct{})
	var alterWG sync.WaitGroup
	var alterErr atomic.Value // holds error
	if fuzz.columnConvertProb > 0 {
		alterWG.Add(1)
		alterSeed := rnd.Int63()
		go func() {
			defer alterWG.Done()
			alterRnd := rand.New(rand.NewSource(alterSeed))
			senderFuzzAlterTableLoop(srv, load.numTables, load.numLines,
				fuzz.columnConvertProb, alterRnd, producersDone,
				func(e error) { alterErr.Store(e) })
		}()
	}

	var wg sync.WaitGroup
	// One slot per producer; each goroutine writes only its own index.
	errs := make([]error, load.numThreads)
	for tid := 0; tid < load.numThreads; tid++ {
		threadSeed := rnd.Int63()
		wg.Add(1)
		go func(tid int, seed int64) {
			defer wg.Done()
			// Turn a producer panic into a reported test failure.
			defer func() {
				if rec := recover(); rec != nil {
					errs[tid] = fmt.Errorf("thread %d panicked: %v", tid, rec)
				}
			}()
			tRnd := rand.New(rand.NewSource(seed))
			ctx := context.Background()
			conf := fmt.Sprintf("ws::addr=%s;", srv.wsAddr())
			if load.clientAutoFlushRows > 0 {
				conf += fmt.Sprintf("auto_flush_rows=%d;", load.clientAutoFlushRows)
			}
			// The 15 s timeout bounds the connect only.
			sctx, scancel := context.WithTimeout(context.Background(), 15*time.Second)
			ls, err := LineSenderFromConf(sctx, conf)
			scancel()
			if err != nil {
				errs[tid] = fmt.Errorf("thread %d open: %w", tid, err)
				return
			}
			qs, ok := ls.(QwpSender)
			if !ok {
				errs[tid] = fmt.Errorf("thread %d: not a QwpSender (%T)", tid, ls)
				_ = ls.Close(ctx)
				return
			}
			// Close error deliberately ignored (best-effort teardown).
			defer func() {
				cctx, ccancel := context.WithTimeout(context.Background(), 30*time.Second)
				defer ccancel()
				_ = qs.Close(cctx)
			}()
			published := 0
			for n := 0; n < load.numIterations; n++ {
				for j := 0; j < load.numLines; j++ {
					ts := tsCounter.Add(1)
					tableName := senderFuzzPickTableName(load.numTables, tRnd)
					row := newSenderFuzzRow(ts)
					senderFuzzEmitRow(tableName, qs, row, fuzz, tRnd)
					if err := qs.At(ctx, time.UnixMicro(ts).UTC()); err != nil {
						errs[tid] = fmt.Errorf("thread %d at@row %d: %w", tid, published, err)
						return
					}
					// The row enters the oracle only after At succeeds;
					// the oracle key is the lower-cased table name.
					base := strings.ToLower(tableName)
					if tbl, ok := oracles[base]; ok {
						tbl.addRow(row)
					}
					published++
					if published%senderFuzzBatchSize == 0 {
						if err := qs.Flush(ctx); err != nil {
							errs[tid] = fmt.Errorf("thread %d flush@%d: %w", tid, published, err)
							return
						}
					}
				}
				if err := qs.Flush(ctx); err != nil {
					errs[tid] = fmt.Errorf("thread %d end-of-iter flush: %w", tid, err)
					return
				}
				if load.waitMs > 0 {
					time.Sleep(time.Duration(load.waitMs) * time.Millisecond)
				}
			}
		}(tid, threadSeed)
	}
	// Join the producers, then stop and join the alter loop.
	wg.Wait()
	close(producersDone)
	alterWG.Wait()
	// Only the first non-nil producer error is reported.
	for tid, e := range errs {
		if e != nil {
			t.Fatalf("thread %d: %v", tid, e)
		}
	}
	if v := alterErr.Load(); v != nil {
		if e, ok := v.(error); ok && e != nil {
			t.Fatalf("alter table thread: %v", e)
		}
	}

	// Wait for WAL apply per table that has rows, then assert.
	for _, tbl := range oracles {
		if tbl.size() == 0 {
			continue
		}
		if !senderFuzzPollRows(t, srv, tbl.name, tbl.size(), 60*time.Second) {
			t.Logf("server log tail (8K):\n%s", srv.tailLog(8000))
			t.Fatalf("table %q did not reach %d rows", tbl.name, tbl.size())
		}
	}

	qc := newBindFuzzClient(t, srv)
	for _, tbl := range oracles {
		if tbl.size() > 0 {
			senderFuzzAssertTable(t, qc, tbl)
		}
	}
}

// senderFuzzAssertTable reads tbl via QWP `SELECT * ORDER BY ts` and
// matches each row's typed cells against the oracle. Columns the
// oracle never wrote (none in S1 — every row writes every column)
// are not checked; columns the oracle wrote MUST be present and
// equal in the schema.
//
// Result rows are paired with the oracle's sorted snapshot by
// position. The row counts must match exactly, and any mismatch
// aborts the test with t.Fatalf.
func senderFuzzAssertTable(t *testing.T, qc *QwpQueryClient, tbl *senderFuzzTable) {
	t.Helper()
	want := tbl.snapshotRowsSorted()

	// QuestDB auto-creates the designated timestamp column with the
	// default ILP/QWP name "timestamp" when the table is created via
	// the first sender.At(...) call (no pre-created DDL here). The
	// oracle uses microsecond ts; QWP exposes it as the int64 of that
	// column. Java reaches it via metadata.getTimestampIndex(); we
	// look it up by name.
	const tsColName = "timestamp"

	ctx, cancel := context.WithTimeout(context.Background(), 180*time.Second)
	defer cancel()
	q := qc.Query(ctx, "SELECT * FROM '"+tbl.name+"' ORDER BY "+tsColName)
	defer q.Close()

	rowIdx := 0 // position in want, carried across batches
	for batch, err := range q.Batches() {
		if err != nil {
			t.Fatalf("table %q query: %v", tbl.name, err)
		}
		// Lower-cased column name -> index, rebuilt per batch.
		colIdx := make(map[string]int, batch.ColumnCount())
		for i := 0; i < batch.ColumnCount(); i++ {
			colIdx[strings.ToLower(batch.ColumnName(i))] = i
		}
		for br := 0; br < batch.RowCount(); br++ {
			if rowIdx >= len(want) {
				t.Fatalf("table %q: more rows returned (%d+) than oracle holds (%d)",
					tbl.name, rowIdx+1, len(want))
			}
			row := want[rowIdx]
			rowIdx++
			tsCi, ok := colIdx[tsColName]
			if !ok {
				t.Fatalf("table %q: SELECT * missing mandatory %q column",
					tbl.name, tsColName)
			}
			if got := batch.Int64(tsCi, br); got != row.ts {
				t.Fatalf("table %q row %d ts: want %d got %d",
					tbl.name, rowIdx-1, row.ts, got)
			}
			for name, cell := range row.cells {
				ci, present := colIdx[name]
				if !present {
					t.Fatalf("table %q row ts=%d: column %q set in oracle but absent from schema",
						tbl.name, row.ts, name)
				}
				if batch.IsNull(ci, br) {
					t.Fatalf("table %q row ts=%d col %q: expected non-null", tbl.name, row.ts, name)
				}
				senderFuzzAssertCell(t, batch, ci, br, tbl.name, row.ts, name, cell)
			}
			// Columns the table schema has (because some OTHER row
			// wrote them) but THIS row didn't write must be NULL on
			// read-back — mirrors Java TableData.generateRows's
			// NULL-fill behaviour.
			// Collect the names under the lock, assert after releasing it.
			tbl.mu.Lock()
			absent := make([]string, 0, 4)
			for name := range tbl.colNames {
				if _, set := row.cells[name]; !set {
					absent = append(absent, name)
				}
			}
			tbl.mu.Unlock()
			for _, name := range absent {
				ci, present := colIdx[name]
				if !present {
					t.Fatalf("table %q row ts=%d: column %q in oracle union but absent from schema",
						tbl.name, row.ts, name)
				}
				if !batch.IsNull(ci, br) {
					t.Fatalf("table %q row ts=%d col %q: expected NULL (unset by this row), got non-null",
						tbl.name, row.ts, name)
				}
			}
		}
	}
	if rowIdx != len(want) {
		t.Fatalf("table %q: oracle holds %d rows, query returned %d",
			tbl.name, len(want), rowIdx)
	}
}

// senderFuzzPollRows is awaitRows with diagnostic-friendly return
// semantics (bool, doesn't t.Fatalf) so the caller can dump the
// server log on timeout. The last execSQL error (if any) is surfaced
// in the timeout log so "server unreachable the whole window" is
// distinguishable from "WAL never caught up" before reading the tail.
+func senderFuzzPollRows(t *testing.T, srv *qwpFuzzServer, table string, want int, timeout time.Duration) bool { + t.Helper() + deadline := time.Now().Add(timeout) + q := fmt.Sprintf("SELECT count() FROM '%s'", table) + var lastN int64 + var lastErr error + for { + res, err := srv.execSQL(q) + if err != nil { + lastErr = err + } else if len(res.Dataset) == 1 && len(res.Dataset[0]) == 1 { + if n, ok := toInt64(res.Dataset[0][0]); ok { + lastN = n + if n >= int64(want) { + return true + } + } + } + if time.Now().After(deadline) { + t.Logf("table %q: %d / %d rows after %s (last execSQL err: %v)", + table, lastN, want, timeout, lastErr) + return false + } + time.Sleep(100 * time.Millisecond) + } +} + +func senderFuzzAssertCell(t *testing.T, b *QwpColumnBatch, ci, br int, + tableName string, ts int64, colName string, c senderFuzzCell) { + t.Helper() + // The column's CURRENT wire type — may differ from c.typ when an + // ALTER COLUMN TYPE has narrowed/widened the column between write + // and assertion. For the convertible families (int/float/string), + // dispatch on the current type so the matching typed accessor + // fires; the oracle's stored value casts losslessly by construction + // (see file header). + wt := qwpTypeCode(b.ColumnType(ci)) + switch c.typ { + case sftString, sftSymbol: + // STRING ↔ SYMBOL ↔ VARCHAR — b.String works for all three. 
+ if got := b.String(ci, br); got != c.s { + t.Fatalf("table %q row ts=%d col %q (str, wt=0x%02x): want %q got %q", + tableName, ts, colName, byte(wt), c.s, got) + } + case sftDouble, sftFloat: + var got float64 + switch wt { + case qwpTypeFloat: + got = float64(b.Float32(ci, br)) + case qwpTypeDouble: + got = b.Float64(ci, br) + default: + t.Fatalf("table %q row ts=%d col %q (float family): unexpected wire type 0x%02x", + tableName, ts, colName, byte(wt)) + } + if got != c.f64 { + t.Fatalf("table %q row ts=%d col %q (float family, wt=0x%02x): want %v got %v", + tableName, ts, colName, byte(wt), c.f64, got) + } + case sftByte, sftShort, sftInt: + // Integer family — BYTE/SHORT/INT/LONG are interconvertible. + var got int64 + switch wt { + case qwpTypeByte: + got = int64(b.Int8(ci, br)) + case qwpTypeShort: + got = int64(b.Int16(ci, br)) + case qwpTypeInt: + got = int64(b.Int32(ci, br)) + case qwpTypeLong: + got = b.Int64(ci, br) + default: + t.Fatalf("table %q row ts=%d col %q (int family): unexpected wire type 0x%02x", + tableName, ts, colName, byte(wt)) + } + if got != c.i64 { + t.Fatalf("table %q row ts=%d col %q (int family, wt=0x%02x): want %d got %d", + tableName, ts, colName, byte(wt), c.i64, got) + } + case sftChar: + if got := b.Char(ci, br); got != c.ch { + t.Fatalf("table %q row ts=%d col %q (char): want %q got %q", + tableName, ts, colName, c.ch, got) + } + case sftUUID: + gh := uint64(b.UuidHi(ci, br)) + gl := uint64(b.UuidLo(ci, br)) + if gh != c.uhi || gl != c.ulo { + t.Fatalf("table %q row ts=%d col %q (uuid): want hi=%d lo=%d got hi=%d lo=%d", + tableName, ts, colName, c.uhi, c.ulo, gh, gl) + } + case sftLong256: + for w := 0; w < 4; w++ { + if got := b.Long256Word(ci, br, w); got != c.l256[w] { + t.Fatalf("table %q row ts=%d col %q (long256) w%d: want %d got %d", + tableName, ts, colName, w, c.l256[w], got) + } + } + case sftTsNano: + if got := b.Int64(ci, br); got != c.i64 { + t.Fatalf("table %q row ts=%d col %q (tsnano): want %d got %d", + 
tableName, ts, colName, c.i64, got) + } + } +} + +// --- entry points ------------------------------------------------- + +// TestQwpFuzzSenderLoad is the Go port of +// QwpSenderFuzzTest.testLoad (the simplest entry point — default +// fuzz, symbols on, no reorder/skip/dup/new-col/non-ASCII). Counts +// are CI-bounded compared to Java's (100, 5, 7, 12, 20). +func TestQwpFuzzSenderLoad(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + senderFuzzRunTest(t, srv, senderFuzzLoad{ + numLines: 50, numIterations: 2, numThreads: 3, numTables: 4, waitMs: 20, + }, defaultSenderFuzzFuzz(), r) +} + +// --- S2 fuzz variants --------------------------------------------- +// +// Each test calls senderFuzzRunTest with a different (load, fuzz) +// configuration; the runner itself is unchanged. Counts are +// CI-bounded vs Java. Java enables convertProb=0.05 (ALTER COLUMN +// TYPE) on most of these via the 7-arg initFuzzParameters overload; +// the Go ports set convertProb=0 — the ALTER concurrent thread +// lands as a dedicated S3 slice (see file header). The fuzz +// mechanics under test here (reorder / skip / dup / new-col / +// non-ASCII / diff-case / sendSymbolsWithSpace) are exercised in +// isolation, on a stable schema. 
+ +func TestQwpFuzzSenderLoadLargePayload(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + senderFuzzRunTest(t, srv, senderFuzzLoad{ + numLines: 200, numIterations: 2, numThreads: 3, numTables: 4, waitMs: 10, + }, defaultSenderFuzzFuzz(), r) +} + +func TestQwpFuzzSenderLoadNoSymbols(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + fuzz := defaultSenderFuzzFuzz() + fuzz.nonAsciiValueFactor = 5 + fuzz.exerciseSymbols = false + fuzz.columnConvertProb = 0.05 + senderFuzzRunTest(t, srv, senderFuzzLoad{ + numLines: 50, numIterations: 2, numThreads: 3, numTables: 4, waitMs: 20, + }, fuzz, r) +} + +func TestQwpFuzzSenderLoadSendSymbolsWithSpace(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + fuzz := defaultSenderFuzzFuzz() + fuzz.newColumnFactor = 2 + fuzz.sendSymbolsWithSpace = true + senderFuzzRunTest(t, srv, senderFuzzLoad{ + numLines: 50, numIterations: 2, numThreads: 3, numTables: 4, waitMs: 20, + clientAutoFlushRows: 5, + }, fuzz, r) +} + +func TestQwpFuzzSenderCaseVariationReorderingColumns(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + fuzz := defaultSenderFuzzFuzz() + fuzz.columnReorderingFactor = 4 + fuzz.newColumnFactor = 2 + fuzz.diffCasesInColNames = true + senderFuzzRunTest(t, srv, senderFuzzLoad{ + numLines: 50, numIterations: 2, numThreads: 3, numTables: 4, waitMs: 50, + }, fuzz, r) +} + +func TestQwpFuzzSenderCaseVariationReorderingColumnsNoSymbols(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + fuzz := defaultSenderFuzzFuzz() + fuzz.columnReorderingFactor = 4 + fuzz.diffCasesInColNames = true + fuzz.exerciseSymbols = false + senderFuzzRunTest(t, srv, senderFuzzLoad{ + numLines: 50, numIterations: 2, numThreads: 3, numTables: 4, waitMs: 50, + }, fuzz, r) +} + +func TestQwpFuzzSenderCaseVariationReorderingColumnsSendSymbolsWithSpace(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + fuzz := defaultSenderFuzzFuzz() + fuzz.columnReorderingFactor = 4 + fuzz.newColumnFactor = 
3 + fuzz.diffCasesInColNames = true + fuzz.sendSymbolsWithSpace = true + senderFuzzRunTest(t, srv, senderFuzzLoad{ + numLines: 50, numIterations: 2, numThreads: 3, numTables: 4, waitMs: 50, + clientAutoFlushRows: 5, + }, fuzz, r) +} + +func TestQwpFuzzSenderNonAsciiValues(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + fuzz := defaultSenderFuzzFuzz() + fuzz.newColumnFactor = 3 + fuzz.nonAsciiValueFactor = 4 + senderFuzzRunTest(t, srv, senderFuzzLoad{ + numLines: 50, numIterations: 2, numThreads: 3, numTables: 4, waitMs: 50, + }, fuzz, r) +} + +func TestQwpFuzzSenderNonAsciiValuesNoSymbols(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + fuzz := defaultSenderFuzzFuzz() + fuzz.nonAsciiValueFactor = 4 + fuzz.exerciseSymbols = false + senderFuzzRunTest(t, srv, senderFuzzLoad{ + numLines: 50, numIterations: 2, numThreads: 3, numTables: 4, waitMs: 50, + }, fuzz, r) +} + +func TestQwpFuzzSenderReorderingColumns(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + fuzz := defaultSenderFuzzFuzz() + fuzz.columnReorderingFactor = 4 + fuzz.nonAsciiValueFactor = 8 + fuzz.sendSymbolsWithSpace = true + fuzz.columnConvertProb = 0.05 + senderFuzzRunTest(t, srv, senderFuzzLoad{ + numLines: 50, numIterations: 2, numThreads: 3, numTables: 4, waitMs: 50, + }, fuzz, r) +} + +func TestQwpFuzzSenderReorderingColumnsNoSymbols(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + fuzz := defaultSenderFuzzFuzz() + fuzz.columnReorderingFactor = 4 + fuzz.diffCasesInColNames = true + fuzz.exerciseSymbols = false + fuzz.columnConvertProb = 0.05 + senderFuzzRunTest(t, srv, senderFuzzLoad{ + numLines: 50, numIterations: 2, numThreads: 3, numTables: 4, waitMs: 50, + }, fuzz, r) +} + +func TestQwpFuzzSenderReorderingManyThreads(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + fuzz := defaultSenderFuzzFuzz() + fuzz.columnReorderingFactor = 3 + fuzz.newColumnFactor = 2 + senderFuzzRunTest(t, srv, senderFuzzLoad{ + numLines: 40, 
numIterations: 2, numThreads: 5, numTables: 3, waitMs: 30, + }, fuzz, r) +} + +func TestQwpFuzzSenderReorderingNonAscii(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + fuzz := defaultSenderFuzzFuzz() + fuzz.columnReorderingFactor = 4 + fuzz.newColumnFactor = 2 + fuzz.nonAsciiValueFactor = 4 + senderFuzzRunTest(t, srv, senderFuzzLoad{ + numLines: 50, numIterations: 2, numThreads: 3, numTables: 4, waitMs: 50, + }, fuzz, r) +} + +func TestQwpFuzzSenderReorderingSkipColumnsWithNonAscii(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + fuzz := defaultSenderFuzzFuzz() + fuzz.columnReorderingFactor = 4 + fuzz.columnSkipFactor = 4 + fuzz.newColumnFactor = 2 + fuzz.nonAsciiValueFactor = 4 + fuzz.diffCasesInColNames = true + senderFuzzRunTest(t, srv, senderFuzzLoad{ + numLines: 50, numIterations: 2, numThreads: 3, numTables: 4, waitMs: 50, + }, fuzz, r) +} + +func TestQwpFuzzSenderReorderingSkipColumnsWithNonAsciiNoSymbols(t *testing.T) { + srv := fuzzServer(t) + r := newFuzzRand(t) + fuzz := defaultSenderFuzzFuzz() + fuzz.columnReorderingFactor = 4 + fuzz.columnSkipFactor = 4 + fuzz.nonAsciiValueFactor = 4 + fuzz.diffCasesInColNames = true + fuzz.exerciseSymbols = false + senderFuzzRunTest(t, srv, senderFuzzLoad{ + numLines: 50, numIterations: 2, numThreads: 3, numTables: 4, waitMs: 50, + }, fuzz, r) +} + +// --- S3 entry points: ALTER COLUMN TYPE in parallel with producers +// +// Each test sets a non-zero columnConvertProb which starts the alter +// loop alongside the producer goroutines. Counts are CI-bounded vs +// the Java reference; the convertProb values match Java exactly. + +// TestQwpFuzzSenderAllMixed is the smoke test for S3: every fuzz +// dial on at once plus convertProb=0.05 — duplicates, reordering, +// skip, new-col injection, non-ASCII postfixes, symbols, and the +// alter loop. If S3 mechanics are wrong, this is the first to fail. +// Port of Java testAllMixed. 
+func TestQwpFuzzSenderAllMixed(t *testing.T) {
+	// Smoke run: every fuzz dial is enabled at once, together with a
+	// non-zero columnConvertProb.
+	server := fuzzServer(t)
+	rng := newFuzzRand(t)
+
+	dials := defaultSenderFuzzFuzz()
+	dials.duplicatesFactor = 3
+	dials.columnReorderingFactor = 4
+	dials.columnSkipFactor = 5
+	dials.newColumnFactor = 10
+	dials.nonAsciiValueFactor = 5
+	dials.diffCasesInColNames = true
+	dials.exerciseSymbols = true
+	dials.sendSymbolsWithSpace = true
+	dials.columnConvertProb = 0.05
+
+	load := senderFuzzLoad{
+		numLines:      50,
+		numIterations: 2,
+		numThreads:    3,
+		numTables:     4,
+		waitMs:        50,
+	}
+	senderFuzzRunTest(t, server, load, dials, rng)
+}
+
+// TestQwpFuzzSenderAllMixedNoSymbols ports Java testAllMixedNoSymbols:
+// the same dial values and load as TestQwpFuzzSenderAllMixed, with
+// exerciseSymbols switched off.
+func TestQwpFuzzSenderAllMixedNoSymbols(t *testing.T) {
+	server := fuzzServer(t)
+	rng := newFuzzRand(t)
+
+	dials := defaultSenderFuzzFuzz()
+	dials.duplicatesFactor = 3
+	dials.columnReorderingFactor = 4
+	dials.columnSkipFactor = 5
+	dials.newColumnFactor = 10
+	dials.nonAsciiValueFactor = 5
+	dials.diffCasesInColNames = true
+	dials.exerciseSymbols = false
+	dials.sendSymbolsWithSpace = true
+	dials.columnConvertProb = 0.05
+
+	load := senderFuzzLoad{
+		numLines:      50,
+		numIterations: 2,
+		numThreads:    3,
+		numTables:     4,
+		waitMs:        50,
+	}
+	senderFuzzRunTest(t, server, load, dials, rng)
+}
+
+// TestQwpFuzzSenderAllMixedSingleTable ports Java
+// testAllMixedSingleTable: the AllMixed dial values, but the load is
+// confined to a single table (numTables=1).
+func TestQwpFuzzSenderAllMixedSingleTable(t *testing.T) {
+	server := fuzzServer(t)
+	rng := newFuzzRand(t)
+
+	dials := defaultSenderFuzzFuzz()
+	dials.duplicatesFactor = 3
+	dials.columnReorderingFactor = 4
+	dials.columnSkipFactor = 5
+	dials.newColumnFactor = 10
+	dials.nonAsciiValueFactor = 5
+	dials.diffCasesInColNames = true
+	dials.exerciseSymbols = true
+	dials.sendSymbolsWithSpace = true
+	dials.columnConvertProb = 0.05
+
+	load := senderFuzzLoad{
+		numLines:      50,
+		numIterations: 2,
+		numThreads:    3,
+		numTables:     1,
+		waitMs:        50,
+	}
+	senderFuzzRunTest(t, server, load, dials, rng)
+}
+
+// TestQwpFuzzSenderAllMixedSplitPart — Java testAllMixedSplitPart.
+// Only newColumnFactor and convertProb are on; everything else off,
+// symbols on (per Java's positional args).
+func TestQwpFuzzSenderAllMixedSplitPart(t *testing.T) {
+	server := fuzzServer(t)
+	rng := newFuzzRand(t)
+
+	dials := defaultSenderFuzzFuzz()
+	dials.newColumnFactor = 10
+	dials.exerciseSymbols = true
+	dials.columnConvertProb = 0.05
+
+	load := senderFuzzLoad{
+		numLines:      50,
+		numIterations: 2,
+		numThreads:    3,
+		numTables:     1,
+		waitMs:        50,
+	}
+	senderFuzzRunTest(t, server, load, dials, rng)
+}
+
+// TestQwpFuzzSenderAddColumns ports Java testAddColumns with
+// columnConvertProb=0.1. The skip factor and parts of the load shape
+// (lines, tables, wait) are drawn from the test's random source.
+func TestQwpFuzzSenderAddColumns(t *testing.T) {
+	server := fuzzServer(t)
+	rng := newFuzzRand(t)
+
+	dials := defaultSenderFuzzFuzz()
+	dials.columnReorderingFactor = 1
+	dials.columnSkipFactor = 1 + rng.Intn(3)
+	dials.newColumnFactor = 6
+	dials.exerciseSymbols = true
+	dials.columnConvertProb = 0.1
+
+	// The field order below fixes the order of the random draws:
+	// numLines first, then numTables, then waitMs.
+	load := senderFuzzLoad{
+		numLines:      15 + rng.Intn(50),
+		numIterations: 2,
+		numThreads:    3,
+		numTables:     1 + rng.Intn(4),
+		waitMs:        rng.Intn(75),
+	}
+	senderFuzzRunTest(t, server, load, dials, rng)
+}
+
+// TestQwpFuzzSenderAddColumnsNoSymbols ports Java
+// testAddColumnsNoSymbols (columnConvertProb=0.15, symbols off).
+func TestQwpFuzzSenderAddColumnsNoSymbols(t *testing.T) {
+	server := fuzzServer(t)
+	rng := newFuzzRand(t)
+
+	dials := defaultSenderFuzzFuzz()
+	dials.columnSkipFactor = 4
+	dials.newColumnFactor = 3
+	dials.diffCasesInColNames = true
+	dials.exerciseSymbols = false
+	dials.columnConvertProb = 0.15
+
+	load := senderFuzzLoad{
+		numLines:      15,
+		numIterations: 2,
+		numThreads:    2,
+		numTables:     5,
+		waitMs:        75,
+	}
+	senderFuzzRunTest(t, server, load, dials, rng)
+}
+
+// TestQwpFuzzSenderAddConvertColumns — Java testAddConvertColumns
+// (highest convertProb, 0.2; sendSymbolsWithSpace also on).
+func TestQwpFuzzSenderAddConvertColumns(t *testing.T) {
+	server := fuzzServer(t)
+	rng := newFuzzRand(t)
+
+	dials := defaultSenderFuzzFuzz()
+	dials.columnSkipFactor = 4
+	dials.exerciseSymbols = true
+	dials.sendSymbolsWithSpace = true
+	dials.columnConvertProb = 0.2
+
+	// Symbols containing spaces enlarge each batch on the wire (the
+	// same concern as the LoadSendSymbolsWithSpace test), so the
+	// client auto-flush row count is capped.
+	load := senderFuzzLoad{
+		numLines:            15,
+		numIterations:       2,
+		numThreads:          2,
+		numTables:           5,
+		waitMs:              75,
+		clientAutoFlushRows: 5,
+	}
+	senderFuzzRunTest(t, server, load, dials, rng)
+}
+
+// TestQwpFuzzSenderDuplicatesReorderingColumns ports Java
+// testDuplicatesReorderingColumns: duplicatesFactor=4,
+// columnReorderingFactor=4, columnConvertProb=0.05.
+func TestQwpFuzzSenderDuplicatesReorderingColumns(t *testing.T) {
+	server := fuzzServer(t)
+	rng := newFuzzRand(t)
+
+	dials := defaultSenderFuzzFuzz()
+	dials.duplicatesFactor = 4
+	dials.columnReorderingFactor = 4
+	dials.diffCasesInColNames = true
+	dials.exerciseSymbols = true
+	dials.columnConvertProb = 0.05
+
+	load := senderFuzzLoad{
+		numLines:      50,
+		numIterations: 2,
+		numThreads:    3,
+		numTables:     4,
+		waitMs:        50,
+	}
+	senderFuzzRunTest(t, server, load, dials, rng)
+}
+
+// TestQwpFuzzSenderDuplicatesReorderingColumnsNoSymbols ports Java
+// testDuplicatesReorderingColumnsNoSymbols: the same dials as the
+// symbol-enabled variant, with exerciseSymbols switched off.
+func TestQwpFuzzSenderDuplicatesReorderingColumnsNoSymbols(t *testing.T) {
+	server := fuzzServer(t)
+	rng := newFuzzRand(t)
+
+	dials := defaultSenderFuzzFuzz()
+	dials.duplicatesFactor = 4
+	dials.columnReorderingFactor = 4
+	dials.diffCasesInColNames = true
+	dials.exerciseSymbols = false
+	dials.columnConvertProb = 0.05
+
+	load := senderFuzzLoad{
+		numLines:      50,
+		numIterations: 2,
+		numThreads:    3,
+		numTables:     4,
+		waitMs:        50,
+	}
+	senderFuzzRunTest(t, server, load, dials, rng)
+}
+
+// TestQwpFuzzSenderDuplicatesReorderingColumnsSendSymbolsWithSpace —
+// Java testDuplicatesReorderingColumnsSendSymbolsWithSpace.
+func TestQwpFuzzSenderDuplicatesReorderingColumnsSendSymbolsWithSpace(t *testing.T) {
+	server := fuzzServer(t)
+	rng := newFuzzRand(t)
+
+	dials := defaultSenderFuzzFuzz()
+	dials.duplicatesFactor = 4
+	dials.columnReorderingFactor = 4
+	dials.diffCasesInColNames = true
+	dials.exerciseSymbols = true
+	dials.sendSymbolsWithSpace = true
+	dials.columnConvertProb = 0.05
+
+	// Runs with the client auto-flush row count set to 5.
+	load := senderFuzzLoad{
+		numLines:            50,
+		numIterations:       2,
+		numThreads:          3,
+		numTables:           4,
+		waitMs:              50,
+		clientAutoFlushRows: 5,
+	}
+	senderFuzzRunTest(t, server, load, dials, rng)
+}
+
+// TestQwpFuzzSenderReorderingSkipDuplicateColumnsWithNonAscii ports
+// Java testReorderingSkipDuplicateColumnsWithNonAscii: duplicate,
+// reorder, skip and non-ASCII dials all at 4, symbols on,
+// columnConvertProb=0.05.
+func TestQwpFuzzSenderReorderingSkipDuplicateColumnsWithNonAscii(t *testing.T) {
+	server := fuzzServer(t)
+	rng := newFuzzRand(t)
+
+	dials := defaultSenderFuzzFuzz()
+	dials.duplicatesFactor = 4
+	dials.columnReorderingFactor = 4
+	dials.columnSkipFactor = 4
+	dials.nonAsciiValueFactor = 4
+	dials.diffCasesInColNames = true
+	dials.exerciseSymbols = true
+	dials.columnConvertProb = 0.05
+
+	load := senderFuzzLoad{
+		numLines:      50,
+		numIterations: 2,
+		numThreads:    3,
+		numTables:     4,
+		waitMs:        50,
+	}
+	senderFuzzRunTest(t, server, load, dials, rng)
+}
+
+// TestQwpFuzzSenderLoadSmallBuffer — Java testLoadSmallBuffer
+// (the only sender-fuzz @Test method that requires a server-side
+// knob). The server is booted with http.recv.buffer.size=2048, so
+// the client must cap per-frame bytes well under that or the server
+// tears the WS connection down with MESSAGE_TOO_BIG. Java pairs
+// recvBufferSize=2048 with clientAutoFlushRows=3; we match exactly.
+//
+// Requires fixture-launched mode (sidecar JVM with env overrides);
+// skips in QDB_FUZZ_ADDR mode.
+func TestQwpFuzzSenderLoadSmallBuffer(t *testing.T) {
+	// Dedicated sidecar server booted with a 2048-byte receive buffer.
+	envOverrides := map[string]string{
+		"QDB_HTTP_RECV_BUFFER_SIZE": "2048",
+	}
+	server := bootSidecarServer(t, envOverrides)
+	rng := newFuzzRand(t)
+
+	// Default dials, untouched: Java's testLoadSmallBuffer reuses the
+	// testLoad load shape, and the property checked is that each wire
+	// frame fits in 2048 B when auto_flush_rows=3.
+	dials := defaultSenderFuzzFuzz()
+
+	load := senderFuzzLoad{
+		numLines:            50,
+		numIterations:       2,
+		numThreads:          3,
+		numTables:           4,
+		waitMs:              20,
+		clientAutoFlushRows: 3,
+	}
+	senderFuzzRunTest(t, server, load, dials, rng)
+}
+
+// TestQwpFuzzSenderReorderingSkipDuplicateColumnsWithNonAsciiNoSymbols
+// ports Java testReorderingSkipDuplicateColumnsWithNonAsciiNoSymbols:
+// duplicate, reorder, skip and non-ASCII dials all at 4, symbols off,
+// columnConvertProb=0.05.
+func TestQwpFuzzSenderReorderingSkipDuplicateColumnsWithNonAsciiNoSymbols(t *testing.T) {
+	server := fuzzServer(t)
+	rng := newFuzzRand(t)
+
+	dials := defaultSenderFuzzFuzz()
+	dials.duplicatesFactor = 4
+	dials.columnReorderingFactor = 4
+	dials.columnSkipFactor = 4
+	dials.nonAsciiValueFactor = 4
+	dials.diffCasesInColNames = true
+	dials.exerciseSymbols = false
+	dials.columnConvertProb = 0.05
+
+	load := senderFuzzLoad{
+		numLines:      50,
+		numIterations: 2,
+		numThreads:    3,
+		numTables:     4,
+		waitMs:        50,
+	}
+	senderFuzzRunTest(t, server, load, dials, rng)
+}
diff --git a/qwp_sender_handler_close_test.go b/qwp_sender_handler_close_test.go new file mode 100644 index 00000000..b0d80f7c --- /dev/null +++ b/qwp_sender_handler_close_test.go @@ -0,0 +1,242 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "context" + "fmt" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestQwpSfEngineCloseDuringBackpressuredAppendNoCrash is the C3 +// regression for the engine-level crash (Hazard A in the review). +// +// A SenderErrorHandler is documented as allowed to call Close(). When a +// HALT stalls the wire, the send loop stops draining, the cursor ring +// fills, and the producer parks in engineAppendBlocking's backpressure +// spin — calling appendOrFsn every park interval. Close() then tears +// the engine down on a different goroutine: segmentRingClose swaps the +// active segment to nil and munmaps it while the parked producer is +// still calling appendOrFsn. Pre-fix the producer dereferences the +// just-nil'd active segment. +// +// Memory mode is used deliberately: there the dangling access is a +// recoverable nil-pointer panic, so the failure is assertable rather +// than a process-killing SIGBUS (which is what the equivalent SF-mode +// race produces against the munmapped pages). The same engine-level +// append/close serialization fixes both. 
+func TestQwpSfEngineCloseDuringBackpressuredAppendNoCrash(t *testing.T) {
+	const segSize int64 = 96 // 24-byte header + 72-byte payload region
+	// Cap total bytes at one segment so the manager never provisions a
+	// hot spare: once the active fills, every further append
+	// backpressures forever (nothing acks, so no trim frees space). Long
+	// append deadline so the producer stays parked until we close it.
+	e, err := qwpSfNewCursorEngine("", segSize, segSize, 30*time.Second)
+	require.NoError(t, err)
+
+	// Fill the active segment: capacity 72, each frame is 8-byte envelope
+	// + 16-byte payload = 24, so exactly 3 frames fit.
+	for i := 0; i < 3; i++ {
+		_, err := e.engineAppendBlocking(context.Background(), make([]byte, 16))
+		require.NoError(t, err, "fill frame %d", i)
+	}
+
+	// Park a producer on the 4th append. It spins in the backpressure
+	// loop until either the (30s) deadline or the engine is closed under
+	// it. Any panic is recovered so the test binary survives to assert.
+	//
+	// prodErr and prodPanic are written only by this goroutine, and
+	// close(done) is its last deferred action, so the test goroutine
+	// may read both once it has received from done.
+	var prodErr error
+	var prodPanic atomic.Value
+	done := make(chan struct{})
+	go func() {
+		defer close(done)
+		defer func() {
+			if r := recover(); r != nil {
+				prodPanic.Store(fmt.Sprintf("%v", r))
+			}
+		}()
+		_, prodErr = e.engineAppendBlocking(context.Background(), make([]byte, 16))
+	}()
+
+	// Wait until the producer is genuinely in the backpressure spin
+	// (stall counter bumps once on the first miss, before the spin).
+	require.Eventually(t, func() bool {
+		return e.engineTotalBackpressureStalls() >= 1
+	}, 2*time.Second, 50*time.Microsecond,
+		"producer never entered the backpressure spin")
+
+	// Close the engine out from under the parked producer — exactly what
+	// a SenderErrorHandler's Close() does on the dispatcher goroutine.
+	require.NoError(t, e.engineClose())
+
+	// Bounded wait: a producer that never unparks fails the test after
+	// 5s instead of blocking until the 30s append deadline.
+	select {
+	case <-done:
+	case <-time.After(5 * time.Second):
+		t.Fatal("parked producer never returned after engineClose")
+	}
+
+	// The producer must have returned normally (no recovered panic) and
+	// with the closed-engine sentinel error.
+	require.Nil(t, prodPanic.Load(),
+		"producer crashed dereferencing a torn-down segment: %v", prodPanic.Load())
+	require.ErrorIs(t, prodErr, qwpSfErrEngineClosed,
+		"a producer parked in backpressure must observe a clean closed-engine "+
+			"error once the engine is closed, got: %v", prodErr)
+}
+
+// TestQwpSenderCloseFromErrorHandlerSkipsProducerState is the C3
+// regression for the producer-state data race (Hazard B in the review).
+//
+// The SenderErrorHandler runs on the dispatcher goroutine. The producer
+// goroutine owns the table buffers, the encoder, hasTable and
+// pendingRowCount with no happens-before against the dispatcher. So
+// Close()/Flush() invoked from the handler must NOT flush producer-
+// buffered rows or range the tableBuffers map — doing so races a
+// producer mid-At(), up to Go's fatal "concurrent map iteration and map
+// write".
+//
+// This is the deterministic half: the producer stages rows and then
+// parks while the handler calls Flush() and Close() off the producer
+// goroutine. Pre-fix, those calls flush the staged rows (resetting
+// pendingRowCount and advancing publishedFsn); post-fix they leave
+// producer state untouched. The companion -race test below exercises
+// the same path with a genuinely concurrent producer.
+func TestQwpSenderCloseFromErrorHandlerSkipsProducerState(t *testing.T) {
+	// Drop-policy rejection: the handler fires but no terminal error is
+	// latched, so a handler-side Flush() would otherwise proceed into the
+	// pending-rows encode path (which ranges tableBuffers).
+	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{rejectStatus: QwpStatusSchemaMismatch})
+	defer srv.Close()
+
+	// NOTE(review): the third argument is presumably autoFlushRows (the
+	// concurrent-producer test passes 1 and labels it so); 0 would then
+	// leave rows buffered until an explicit Flush — confirm.
+	s, engine, loop, cleanup := newCursorSenderForTest(t, srv, 0)
+	defer cleanup()
+
+	ctx := context.Background()
+	producerReady := make(chan struct{})
+	handlerDone := make(chan struct{})
+	// sync.Once: the handler body runs at most once, however many
+	// times the handler is invoked.
+	var once sync.Once
+	// NOTE(review): the meaning of the trailing 16 is not visible here
+	// (possibly a notification queue capacity) — confirm against
+	// sendLoopSetErrorHandler.
+	loop.sendLoopSetErrorHandler(func(e *SenderError) {
+		once.Do(func() {
+			// Wait until the producer has staged its pending rows and
+			// parked, so this Flush+Close is the only thing touching the
+			// sender — the behavioral assertion is then race-free.
+			<-producerReady
+			// Both calls run on the dispatcher goroutine and must skip
+			// producer state. Pre-fix they flush the staged rows.
+			_, _ = s.FlushAndGetSequence(ctx)
+			_ = s.Close(ctx)
+			close(handlerDone)
+		})
+	}, 16)
+
+	// Batch 1: one row, flushed. The server drops it, scheduling the
+	// handler (which then blocks on producerReady).
+	require.NoError(t, s.Table("t").Int64Column("v", 1).AtNow(ctx))
+	require.NoError(t, s.Flush(ctx))
+
+	// Stage two more rows the handler-side Flush/Close must not touch.
+	require.NoError(t, s.Table("t").Int64Column("v", 2).AtNow(ctx))
+	require.NoError(t, s.Table("t").Int64Column("v", 3).AtNow(ctx))
+	// Baseline taken before the handler is released: two rows buffered,
+	// and the publish cursor at its pre-handler value.
+	require.Equal(t, 2, s.pendingRowCount)
+	fsnBefore := engine.enginePublishedFsn()
+
+	close(producerReady) // release the handler to Flush()+Close()
+
+	// Bounded wait for the handler to finish both calls.
+	select {
+	case <-handlerDone:
+	case <-time.After(5 * time.Second):
+		t.Fatal("handler never ran Flush()+Close() — drop notification not delivered?")
+	}
+
+	// Post-fix: the off-producer Flush()/Close() left the staged rows and
+	// the publish cursor exactly where the producer left them.
+	assert.Equal(t, 2, s.pendingRowCount,
+		"off-producer Flush()/Close() must not flush producer-buffered rows")
+	assert.Equal(t, fsnBefore, engine.enginePublishedFsn(),
+		"off-producer Flush()/Close() must not publish staged rows")
+}
+
+// TestQwpSenderCloseFromErrorHandlerConcurrentProducer drives the exact
+// documented scenario — the SenderErrorHandler calls Close() — with a
+// genuinely concurrent producer goroutine still building rows. It is the
+// -race companion to the deterministic tests above: under -race
+// (which CI runs) the pre-fix build reports the data race between the
+// dispatcher goroutine's closeCursor and the producer's table-buffer /
+// row-state mutations; either way the producer must not panic.
+func TestQwpSenderCloseFromErrorHandlerConcurrentProducer(t *testing.T) {
+	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{rejectStatus: QwpStatusSchemaMismatch})
+	defer srv.Close()
+
+	// autoFlushRows=1: every row is flushed, so frames keep reaching the
+	// (drop-policy) server and the handler keeps having reason to fire.
+	s, _, loop, cleanup := newCursorSenderForTest(t, srv, 1)
+	defer cleanup()
+
+	closed := make(chan struct{})
+	// Only the first handler invocation closes the sender; sync.Once
+	// makes later invocations no-ops.
+	var once sync.Once
+	loop.sendLoopSetErrorHandler(func(e *SenderError) {
+		once.Do(func() {
+			// Close's error is discarded on purpose: the assertions
+			// below only check that the producer stops without panicking.
+			_ = s.Close(context.Background())
+			close(closed)
+		})
+	}, 16)
+
+	// Producer goroutine: any panic is recovered and recorded so the
+	// test can report it as a failure instead of crashing the binary.
+	var prodPanic atomic.Value
+	prodDone := make(chan struct{})
+	go func() {
+		defer close(prodDone)
+		defer func() {
+			if r := recover(); r != nil {
+				prodPanic.Store(fmt.Sprintf("%v", r))
+			}
+		}()
+		ctx := context.Background()
+		for i := 0; i < 100000; i++ {
+			// A fresh table per row keeps the tableBuffers map churning,
+			// maximizing overlap with closeCursor's map range.
+			tbl := fmt.Sprintf("t%d", i)
+			if err := s.Table(tbl).Int64Column("v", int64(i)).AtNow(ctx); err != nil {
+				return // closed-sender or terminal error: producer stops cleanly
+			}
+		}
+	}()
+
+	// First wait for the handler to have called Close(), then for the
+	// producer to exit; each wait is capped at 10s.
+	select {
+	case <-closed:
+	case <-time.After(10 * time.Second):
+		t.Fatal("handler never fired / Close() never called")
+	}
+
+	select {
+	case <-prodDone:
+	case <-time.After(10 * time.Second):
+		t.Fatal("producer goroutine did not stop after Close()")
+	}
+	require.Nil(t, prodPanic.Load(),
+		"producer crashed racing a handler-invoked Close(): %v", prodPanic.Load())
+}
diff --git a/qwp_sender_test.go b/qwp_sender_test.go index c9a1786d..1705f79b 100644 --- a/qwp_sender_test.go +++ b/qwp_sender_test.go @@ -28,6 +28,7 @@ import ( "bytes" "context" "encoding/binary" + "errors" "fmt" "math/big" "net/http" @@ -71,13 +72,31 @@ func newQwpTestServer(t *testing.T) *httptest.Server { func newQwpSenderForTest(t *testing.T, serverURL string) *qwpLineSender { t.Helper() wsURL := "ws" + strings.TrimPrefix(serverURL, "http") - s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{}, 0, 0, 0, nil) + s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, nil) if err != nil { t.Fatalf("newQwpLineSender: %v", err) } return s } +// flushAndAwaitAck flushes pending rows and blocks until the server +// has ACKed them. Flush no longer waits for the ACK (Java decision +// #1 — see design/qwp-cursor-durability.md), so tests that assert +// server-side receipt must use this FlushAndGetSequence + +// AwaitAckedFsn barrier instead of relying on Flush alone.
+func flushAndAwaitAck(t *testing.T, s *qwpLineSender) { + t.Helper() + fsn, err := s.FlushAndGetSequence(context.Background()) + if err != nil { + t.Fatalf("FlushAndGetSequence: %v", err) + } + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + if err := s.AwaitAckedFsn(ctx, fsn); err != nil { + t.Fatalf("AwaitAckedFsn(fsn=%d): %v", fsn, err) + } +} + func TestQwpSenderBasicRow(t *testing.T) { srv := newQwpTestServer(t) defer srv.Close() @@ -107,10 +126,11 @@ func TestQwpSenderBasicRow(t *testing.T) { } } -// TestQwpSyncFlushAbsorbsStaleAck verifies that sync-mode flushSync -// ignores an ACK whose cumulative sequence is older than the batch it -// just sent and keeps reading until the matching ACK arrives. Matches -// Java's waitForAck, which tolerates stale ACKs on the same connection. +// TestQwpSyncFlushAbsorbsStaleAck verifies that the cursor send +// loop tolerates an ACK whose cumulative sequence is older than the +// most recent published batch and keeps making forward progress. +// engineAcknowledge is monotonic — it clamps to ackedFsn — so stale +// ACKs are absorbed without breaking the engine's drain accounting. func TestQwpSyncFlushAbsorbsStaleAck(t *testing.T) { srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.Header().Set(qwpHeaderVersion, "1") @@ -146,9 +166,90 @@ func TestQwpSyncFlushAbsorbsStaleAck(t *testing.T) { t.Fatalf("flush %d: %v", i, err) } } +} + +// TestQwpFlushRetainsRowsOnError is a regression test for the +// retain-on-error contract: when flushCursor fails before the rows +// are persisted to a segment (here: ctx cancelled, so +// engineAppendBlocking returns ctx.Err() before assigning an FSN), +// Flush must NOT reset the table buffers. A prior version registered +// `defer resetAfterFlush()` ahead of the flushCursor error check, +// silently destroying rows that were never sent anywhere. 
The buffer +// must survive so a subsequent flush delivers the data. +func TestQwpFlushRetainsRowsOnError(t *testing.T) { + var mu sync.Mutex + framesReceived := 0 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set(qwpHeaderVersion, "1") + conn, err := websocket.Accept(w, r, nil) + if err != nil { + return + } + defer conn.CloseNow() + var seq int64 + for { + if _, _, err := conn.Read(context.Background()); err != nil { + return + } + mu.Lock() + framesReceived++ + mu.Unlock() + conn.Write(context.Background(), websocket.MessageBinary, buildAckOK(seq)) + seq++ + } + })) + defer srv.Close() + + s := newQwpSenderForTest(t, srv.URL) + defer s.Close(context.Background()) + + if err := s.Table("t").Int64Column("x", 99).AtNow(context.Background()); err != nil { + t.Fatalf("AtNow: %v", err) + } + if s.pendingRowCount != 1 { + t.Fatalf("pendingRowCount before flush = %d, want 1", s.pendingRowCount) + } + + // Cancelled ctx → engineAppendBlocking returns early, nothing + // persisted. The flush must fail and the row must be retained. + cancelled, cancel := context.WithCancel(context.Background()) + cancel() + if err := s.Flush(cancelled); err == nil { + t.Fatal("Flush with cancelled ctx: want error, got nil") + } + if s.pendingRowCount != 1 { + t.Fatalf("pendingRowCount after failed flush = %d, want 1 "+ + "(row destroyed — retain-on-error contract violated)", s.pendingRowCount) + } + mu.Lock() + got := framesReceived + mu.Unlock() + if got != 0 { + t.Fatalf("server received %d frames from the failed flush, want 0", got) + } - if got := s.syncSequence; got != 3 { - t.Fatalf("syncSequence = %d, want 3", got) + // The retained row must be delivered by a subsequent good flush. + // Flush no longer blocks on the ACK (Java decision #1), so use + // FlushAndGetSequence + AwaitAckedFsn as the delivery barrier + // before asserting receipt. 
+ fsn, err := s.FlushAndGetSequence(context.Background()) + if err != nil { + t.Fatalf("retry Flush: %v", err) + } + if s.pendingRowCount != 0 { + t.Fatalf("pendingRowCount after retry flush = %d, want 0", s.pendingRowCount) + } + awaitCtx, awaitCancel := context.WithTimeout(context.Background(), 5*time.Second) + defer awaitCancel() + if err := s.AwaitAckedFsn(awaitCtx, fsn); err != nil { + t.Fatalf("AwaitAckedFsn: %v", err) + } + mu.Lock() + got = framesReceived + mu.Unlock() + if got != 1 { + t.Fatalf("server received %d frames total, want exactly 1 "+ + "(retained row not delivered, or duplicated)", got) } } @@ -431,6 +532,23 @@ func TestQwpSenderClose(t *testing.T) { } } +func TestQwpSenderCloseSurfacesLatchedFluentError(t *testing.T) { + srv := newQwpTestServer(t) + defer srv.Close() + s := newQwpSenderForTest(t, srv.URL) + + // Latch a validation error: '?' is an illegal column-name char. + s.Table("t").Symbol("bad?name", "v") + + err := s.Close(context.Background()) + if err == nil { + t.Fatalf("Close: nil, expected latched fluent-API error") + } + if !strings.Contains(err.Error(), "illegal character") { + t.Fatalf("Close: %v, want error mentioning illegal character", err) + } +} + func TestQwpSenderClosedOperations(t *testing.T) { srv := newQwpTestServer(t) defer srv.Close() @@ -451,9 +569,14 @@ func TestQwpSenderClosedOperations(t *testing.T) { } func TestQwpSenderAutoFlushRows(t *testing.T) { - // Mock server that counts received messages. + // Mock server that counts received messages and signals the + // test goroutine on every receive — cursor mode's auto-flush is + // asynchronous (send loop transmits in the background), so the + // test must wait for the server to observe the frame rather + // than poll on shared memory. 
var mu sync.Mutex msgCount := 0 + msgReceived := make(chan struct{}, 16) srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.Header().Set(qwpHeaderVersion, "1") conn, err := websocket.Accept(w, r, nil) @@ -473,12 +596,16 @@ func TestQwpSenderAutoFlushRows(t *testing.T) { mu.Unlock() conn.Write(context.Background(), websocket.MessageBinary, buildAckOK(seq)) seq++ + select { + case msgReceived <- struct{}{}: + default: + } } })) defer srv.Close() wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") - s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{}, 0, 3, 0, nil) + s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath}, 3, 0, nil) if err != nil { t.Fatal(err) } @@ -493,7 +620,13 @@ func TestQwpSenderAutoFlushRows(t *testing.T) { } } - // Auto-flush should have triggered at row 3. + // Auto-flush should have triggered at row 3. Block until the + // server signals it received that frame. + select { + case <-msgReceived: + case <-time.After(2 * time.Second): + t.Fatal("auto-flush frame did not reach the server within 2s") + } mu.Lock() gotMsgCount := msgCount mu.Unlock() @@ -506,7 +639,8 @@ func TestQwpSenderAutoFlushRows(t *testing.T) { } func TestQwpSenderAutoFlushTimeInterval(t *testing.T) { - // Mock server that counts received messages. + // Mock server that counts received messages and signals on + // every receive (see TestQwpSenderAutoFlushRows for rationale). 
var mu sync.Mutex msgCount := 0 readMsgCount := func() int { @@ -514,6 +648,7 @@ func TestQwpSenderAutoFlushTimeInterval(t *testing.T) { defer mu.Unlock() return msgCount } + msgReceived := make(chan struct{}, 16) srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.Header().Set(qwpHeaderVersion, "1") conn, err := websocket.Accept(w, r, nil) @@ -533,13 +668,17 @@ func TestQwpSenderAutoFlushTimeInterval(t *testing.T) { mu.Unlock() conn.Write(context.Background(), websocket.MessageBinary, buildAckOK(seq)) seq++ + select { + case msgReceived <- struct{}{}: + default: + } } })) defer srv.Close() wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") // autoFlushRows=0 (disabled), autoFlushInterval=10ms. - s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{}, 0, 0, 10*time.Millisecond, nil) + s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath}, 0, 10*time.Millisecond, nil) if err != nil { t.Fatal(err) } @@ -560,11 +699,17 @@ func TestQwpSenderAutoFlushTimeInterval(t *testing.T) { // Wait for the interval to expire. time.Sleep(20 * time.Millisecond) - // Second row: should trigger time-based auto-flush. + // Second row: triggers time-based auto-flush. Block until the + // server signals it received the frame. err = s.Table("t").Int64Column("x", int64(2)).AtNow(context.Background()) if err != nil { t.Fatalf("row 2: %v", err) } + select { + case <-msgReceived: + case <-time.After(2 * time.Second): + t.Fatal("time-based auto-flush did not reach the server within 2s") + } if got := readMsgCount(); got != 1 { t.Fatalf("after row 2: msgCount = %d, want 1 (time-based flush)", got) } @@ -579,7 +724,7 @@ func TestQwpSenderAutoFlushDisabled(t *testing.T) { wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") // Both autoFlushRows=0 and autoFlushInterval=0 (disabled). 
- s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{}, 0, 0, 0, nil) + s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, nil) if err != nil { t.Fatal(err) } @@ -1234,11 +1379,17 @@ func TestQwpSenderMethodChaining(t *testing.T) { // --- Integration test --- -func TestQwpSenderIntegration(t *testing.T) { +// Renamed from TestQwpSenderIntegration to TestQwpIntegrationSender +// so the qwp-fuzz.yml workflow pattern ^TestQwp(Fuzz|Integration) +// actually catches it. Used to hard-code "ws://localhost:9000" and +// silently skip in CI; now goes through the shared fuzz fixture. +func TestQwpIntegrationSender(t *testing.T) { + qwpEnsureServer(t) ctx := context.Background() - s, err := newQwpLineSender(ctx, "ws://localhost:9000", qwpTransportOpts{}, time.Second, 0, 0, nil) + s, err := newQwpLineSender(ctx, "ws://"+qwpTestAddr, + qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, nil) if err != nil { - t.Skipf("QuestDB not available: %v", err) + t.Fatalf("sender open against fixture %s: %v", qwpTestAddr, err) } defer s.Close(ctx) @@ -1260,7 +1411,10 @@ func TestQwpSenderIntegration(t *testing.T) { t.Fatalf("Flush: %v", err) } - // Second flush with same schema should use reference mode. + // Second flush against the same column set — cursor mode always + // emits FULL schema with schema_id=0 on every frame. The test + // exercises the steady-state flush path; the wire-format + // invariant is pinned in encoder tests. err = s.Table("qwp_sender_test"). Symbol("host", "test_host"). Int64Column("cpu", 99). @@ -1277,51 +1431,11 @@ func TestQwpSenderIntegration(t *testing.T) { t.Fatalf("Flush (row 2): %v", err) } - // Verify schema was registered (schema ID advanced past -1). 
- if s.maxSentSchemaId < 0 { - t.Fatal("maxSentSchemaId should have advanced after flush") - } - t.Log("QWP sender integration test passed") } // --- Validation tests --- -func TestQwpSenderSchemaIdCaching(t *testing.T) { - srv := newQwpTestServer(t) - defer srv.Close() - s := newQwpSenderForTest(t, srv.URL) - defer s.Close(context.Background()) - - // First flush: full schema; the table should be assigned - // schemaId 0 and it should be promoted to maxSentSchemaId. - s.Table("t").Int64Column("x", 1).AtNow(context.Background()) - s.Flush(context.Background()) - - tb := s.tableBuffers["t"] - if tb == nil || tb.schemaId != 0 { - t.Fatalf("first flush: table schemaId = %v, want 0", tb) - } - if s.maxSentSchemaId != 0 { - t.Fatalf("first flush: maxSentSchemaId = %d, want 0", s.maxSentSchemaId) - } - if s.nextSchemaId != 1 { - t.Fatalf("first flush: nextSchemaId = %d, want 1", s.nextSchemaId) - } - - // Second flush: same column set, should reuse schemaId and not - // allocate a new one. - s.Table("t").Int64Column("x", 2).AtNow(context.Background()) - s.Flush(context.Background()) - - if tb.schemaId != 0 { - t.Fatalf("second flush: schemaId = %d, want 0 (same column set)", tb.schemaId) - } - if s.nextSchemaId != 1 { - t.Fatalf("second flush: nextSchemaId = %d, want 1 (no new ID allocated)", s.nextSchemaId) - } -} - func TestQwpSenderSymbolDictAcrossFlushes(t *testing.T) { // Track sent messages to verify delta dict content. 
var messages [][]byte @@ -1346,7 +1460,7 @@ func TestQwpSenderSymbolDictAcrossFlushes(t *testing.T) { defer srv.Close() wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") - s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{}, 0, 0, 0, nil) + s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, nil) if err != nil { t.Fatal(err) } @@ -1361,9 +1475,12 @@ func TestQwpSenderSymbolDictAcrossFlushes(t *testing.T) { t.Fatalf("after flush 1: maxSentSymbolId = %d, want 1", s.maxSentSymbolId) } - // Flush 2: add symbol GOOG. + // Flush 2: add symbol GOOG. Await delivery — Flush no longer + // blocks on the ACK, and the message-bytes assertions below + // require both frames to have reached the server. Awaiting the + // second FSN implies the first is delivered too (FSN monotonic). s.Table("t").Symbol("sym", "GOOG").Int64Column("v", 3).AtNow(context.Background()) - s.Flush(context.Background()) + flushAndAwaitAck(t, s) if s.maxSentSymbolId != 2 { t.Fatalf("after flush 2: maxSentSymbolId = %d, want 2", s.maxSentSymbolId) @@ -1392,26 +1509,21 @@ func TestQwpSenderSymbolDictAcrossFlushes(t *testing.T) { t.Fatalf("msg1 deltaCount = %d, want 2", deltaCount) } - // Parse second message: delta should start at 2 with count 1. + // Cursor mode emits self-sufficient frames: every batch carries + // the full symbol dict from id 0. So the second message also + // has deltaStart=0 (NOT 2), with all three symbols repeated. + // This is the documented "self-sufficient frames" decision (see + // design/qwp-cursor-durability.md decision #14). 
msg2 := messages[1] off = qwpHeaderSize deltaStart2, n, _ := qwpReadVarint(msg2[off:]) off += n - if deltaStart2 != 2 { - t.Fatalf("msg2 deltaStart = %d, want 2", deltaStart2) - } - deltaCount2, n, _ := qwpReadVarint(msg2[off:]) - off += n - if deltaCount2 != 1 { - t.Fatalf("msg2 deltaCount = %d, want 1", deltaCount2) + if deltaStart2 != 0 { + t.Fatalf("msg2 deltaStart = %d, want 0 (cursor mode is self-sufficient)", deltaStart2) } - - // Verify the new symbol is "GOOG". - symLen, n, _ := qwpReadVarint(msg2[off:]) - off += n - sym := string(msg2[off : off+int(symLen)]) - if sym != "GOOG" { - t.Fatalf("msg2 delta symbol = %q, want %q", sym, "GOOG") + deltaCount2, _, _ := qwpReadVarint(msg2[off:]) + if deltaCount2 != 3 { + t.Fatalf("msg2 deltaCount = %d, want 3 (full dict re-sent)", deltaCount2) } } @@ -1428,10 +1540,12 @@ func TestQwpSenderServerError(t *testing.T) { if err != nil { return } - // Return WRITE_ERROR. - errMsg := "table error" + // Return PARSE_ERROR (default Halt). WRITE_ERROR is now + // default Drop and would not surface a terminal Flush + // error. 
+ errMsg := "bad message" ack := make([]byte, 11+len(errMsg)) - ack[0] = byte(qwpStatusWriteError) + ack[0] = byte(QwpStatusParseError) binary.LittleEndian.PutUint16(ack[9:11], uint16(len(errMsg))) copy(ack[11:], errMsg) conn.Write(context.Background(), websocket.MessageBinary, ack) @@ -1440,24 +1554,33 @@ func TestQwpSenderServerError(t *testing.T) { defer srv.Close() wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") - s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{}, 0, 0, 0, nil) + s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, nil) if err != nil { t.Fatal(err) } defer s.Close(context.Background()) s.Table("t").Int64Column("x", 1).AtNow(context.Background()) - err = s.Flush(context.Background()) + // Flush no longer waits for the ACK, so the server's PARSE_ERROR + // surfaces on the ACK-confirmation path (or, racily, already on + // FlushAndGetSequence). Accept it from either. + fsn, err := s.FlushAndGetSequence(context.Background()) + if err == nil { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + err = s.AwaitAckedFsn(ctx, fsn) + } if err == nil { t.Fatal("expected error from server") } - qErr, ok := err.(*QwpError) - if !ok { - t.Fatalf("expected *QwpError, got %T: %v", err, err) + var senderErr *SenderError + if !errors.As(err, &senderErr) { + t.Fatalf("expected *SenderError in chain, got %T: %v", err, err) } - if qErr.Status != qwpStatusWriteError { - t.Fatalf("status = %d, want %d", qErr.Status, qwpStatusWriteError) + if senderErr.ServerStatusByte != int(QwpStatusParseError) { + t.Fatalf("status = 0x%02X, want 0x%02X", + senderErr.ServerStatusByte, byte(QwpStatusParseError)) } } @@ -1490,14 +1613,14 @@ func TestQwpSenderAsyncBasic(t *testing.T) { defer srv.Close() wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") - s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{}, 0, 0, 0, nil, 2) + s, err := 
newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, nil, 2) if err != nil { t.Fatal(err) } - // Verify async mode is enabled. - if s.asyncState == nil { - t.Fatal("asyncState should not be nil for window=2") + // Verify the cursor engine is wired (memory-backed, no sf_dir). + if s.cursorEngine == nil || s.cursorSendLoop == nil { + t.Fatal("cursor engine and send loop must be wired for QWP sender") } // Send 5 rows. @@ -1508,10 +1631,9 @@ func TestQwpSenderAsyncBasic(t *testing.T) { } } - // Flush — waits for all batches to be ACKed. - if err := s.Flush(context.Background()); err != nil { - t.Fatalf("Flush: %v", err) - } + // Flush, then await ACK — Flush itself no longer blocks on the + // server round-trip; the msgCount assertion needs delivery. + flushAndAwaitAck(t, s) if s.pendingRowCount != 0 { t.Fatalf("pendingRowCount = %d, want 0", s.pendingRowCount) @@ -1555,7 +1677,7 @@ func TestQwpSenderAsyncMultipleFlushes(t *testing.T) { defer srv.Close() wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") - s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{}, 0, 0, 0, nil, 3) + s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, nil, 3) if err != nil { t.Fatal(err) } @@ -1569,13 +1691,13 @@ func TestQwpSenderAsyncMultipleFlushes(t *testing.T) { t.Fatalf("Flush 1: %v", err) } - // Flush 2: 3 rows. + // Flush 2: 3 rows. Await the second FSN — that implies the first + // flush's frame is delivered too (FSN monotonic), so both frames + // have reached the server before the msgCount assertion. 
for i := 0; i < 3; i++ { s.Table("t").Int64Column("x", int64(i+10)).AtNow(context.Background()) } - if err := s.Flush(context.Background()); err != nil { - t.Fatalf("Flush 2: %v", err) - } + flushAndAwaitAck(t, s) mu.Lock() if msgCount != 2 { @@ -1589,7 +1711,7 @@ func TestQwpSenderAsyncCloseAutoFlush(t *testing.T) { defer srv.Close() wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") - s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{}, 0, 0, 0, nil, 2) + s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, nil, 2) if err != nil { t.Fatal(err) } @@ -1603,176 +1725,14 @@ func TestQwpSenderAsyncCloseAutoFlush(t *testing.T) { } } -func TestQwpSenderSchemaIdPerTable(t *testing.T) { - // Verify that two tables with identical columns both get full - // schema mode on first flush (not schema reference mode). - var messages [][]byte - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set(qwpHeaderVersion, "1") - conn, err := websocket.Accept(w, r, nil) - if err != nil { - return - } - defer conn.CloseNow() - var seq int64 - for { - _, data, err := conn.Read(context.Background()) - if err != nil { - return - } - messages = append(messages, append([]byte(nil), data...)) - conn.Write(context.Background(), websocket.MessageBinary, buildAckOK(seq)) - seq++ - } - })) - defer srv.Close() - - wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") - s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{}, 0, 0, 0, nil) - if err != nil { - t.Fatal(err) - } - defer s.Close(context.Background()) - - // Insert one row into each of two tables with identical columns. - s.Table("alpha").Int64Column("x", 1).AtNow(context.Background()) - s.Table("beta").Int64Column("x", 2).AtNow(context.Background()) - s.Flush(context.Background()) - - // With multi-table batching, both tables are in 1 message. 
- if len(messages) != 1 { - t.Fatalf("messages = %d, want 1", len(messages)) - } - - // Both tables in the message must use full schema mode. - modes := extractAllSchemaModes(t, messages[0]) - if len(modes) != 2 { - t.Fatalf("tables in message = %d, want 2", len(modes)) - } - for i, mode := range modes { - if mode != byte(qwpSchemaModeFull) { - t.Fatalf("table %d: schemaMode = 0x%02X, want 0x%02X (full)", - i, mode, qwpSchemaModeFull) - } - } - - // After first flush, both tables should have distinct schema IDs - // and maxSentSchemaId should have advanced to cover both. - if s.tableBuffers["alpha"].schemaId == s.tableBuffers["beta"].schemaId { - t.Fatalf("tables must have distinct schema IDs, both = %d", - s.tableBuffers["alpha"].schemaId) - } - if s.maxSentSchemaId != 1 { - t.Fatalf("maxSentSchemaId = %d, want 1", s.maxSentSchemaId) - } - if s.nextSchemaId != 2 { - t.Fatalf("nextSchemaId = %d, want 2", s.nextSchemaId) - } - - // Second flush of both tables should now use schema reference. - messages = messages[:0] - s.Table("alpha").Int64Column("x", 3).AtNow(context.Background()) - s.Table("beta").Int64Column("x", 4).AtNow(context.Background()) - s.Flush(context.Background()) - - if len(messages) != 1 { - t.Fatalf("messages = %d, want 1", len(messages)) - } - modes = extractAllSchemaModes(t, messages[0]) - for i, mode := range modes { - if mode != byte(qwpSchemaModeReference) { - t.Fatalf("table %d (2nd flush): schemaMode = 0x%02X, want 0x%02X (ref)", - i, mode, qwpSchemaModeReference) - } - } -} - -// extractAllSchemaModes parses a multi-table QWP message and returns -// the schema mode byte for each table block. It skips the header, -// delta dict, and then for each table: extracts the schema mode and -// skips the rest of the table block. -// -// Precondition: every table in the message has exactly one non-null -// LONG column. 
In full mode the helper asserts the type byte; in -// reference mode the caller is responsible for maintaining the same -// shape across flushes. The only caller today is -// TestQwpSenderSchemaIdPerTable, which uses Int64Column("x", ...). -func extractAllSchemaModes(t *testing.T, msg []byte) []byte { - t.Helper() - if len(msg) < qwpHeaderSize { - t.Fatalf("message too short: %d", len(msg)) - } - - tableCount := binary.LittleEndian.Uint16(msg[6:8]) - off := qwpHeaderSize - flags := msg[qwpHeaderOffsetFlags] - - // Skip delta dict if present. - if flags&qwpFlagDeltaSymbolDict != 0 { - _, n, _ := qwpReadVarint(msg[off:]) - off += n - deltaCount, n, _ := qwpReadVarint(msg[off:]) - off += n - for i := uint64(0); i < deltaCount; i++ { - slen, n, _ := qwpReadVarint(msg[off:]) - off += n + int(slen) - } - } - - var modes []byte - for ti := uint16(0); ti < tableCount; ti++ { - // Skip table name. - nameLen, n, _ := qwpReadVarint(msg[off:]) - off += n + int(nameLen) - // Row count — needed to size the column data skip. - rowCount, n, _ := qwpReadVarint(msg[off:]) - off += n - // Column count. - colCount, n, _ := qwpReadVarint(msg[off:]) - off += n - if colCount != 1 { - t.Fatalf("table %d: colCount=%d, helper only supports 1 column", - ti, colCount) - } - // Schema mode byte. - schemaMode := msg[off] - modes = append(modes, schemaMode) - off++ - // Schema ID varint (both modes per QWP spec §9). - _, n, _ = qwpReadVarint(msg[off:]) - off += n - - if schemaMode == byte(qwpSchemaModeFull) { - // Full schema: name string + type byte. - slen, n, _ := qwpReadVarint(msg[off:]) - off += n + int(slen) - if tc := qwpTypeCode(msg[off]); tc != qwpTypeLong { - t.Fatalf("table %d: column type=0x%02X, helper only supports qwpTypeLong", - ti, tc) - } - off++ - } - - // Column data: null bitmap flag (1 byte, asserted 0x00 = no - // nulls) followed by rowCount × 8 bytes for the LONG values. 
- if msg[off] != 0x00 { - t.Fatalf("table %d: null bitmap flag=0x%02X, helper requires non-null values", - ti, msg[off]) - } - off += 1 + int(rowCount)*8 - } - - return modes -} - func TestQwpAsyncSenderTerminalOnFlushFailure(t *testing.T) { - // In async mode the sender matches the Java client's - // flushPendingRows() semantics: schema and symbol IDs are - // advanced immediately after enqueue, not after ACK. If a batch - // later fails, the sender is poisoned via asyncState.ioErr and - // every subsequent user-facing call returns that error — so - // stale cache state can never reach the wire on a live - // connection. This test pins that invariant. + // The cursor sender matches the Java client's flushPendingRows() + // semantics: schema and symbol IDs are advanced immediately at + // enqueue, not after ACK. If a batch later fails, the send loop + // latches the terminal error (surfaced via sendLoopCheckError) and + // every subsequent user-facing call returns it — so stale cache + // state can never reach the wire on a live connection. This test + // pins that invariant. srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.Header().Set(qwpHeaderVersion, "1") @@ -1782,18 +1742,20 @@ func TestQwpAsyncSenderTerminalOnFlushFailure(t *testing.T) { } defer conn.CloseNow() - // Read the first message, then return a WRITE_ERROR. + // Read the first message, then return a PARSE_ERROR + // (default Halt). WRITE_ERROR is now default Drop and would + // not poison the sender. 
_, _, err = conn.Read(context.Background()) if err != nil { return } - ack := buildAckError(qwpStatusWriteError, 0, "write failed") + ack := buildAckError(QwpStatusParseError, 0, "bad message") conn.Write(context.Background(), websocket.MessageBinary, ack) })) defer srv.Close() wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") - s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{}, 0, 0, 0, nil, 2) + s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, nil, 2) if err != nil { t.Fatal(err) } @@ -1802,8 +1764,15 @@ func TestQwpAsyncSenderTerminalOnFlushFailure(t *testing.T) { // Insert a row with a symbol (to exercise both schema and symbol paths). s.Table("t").Symbol("sym", "AAPL").Int64Column("x", 1).AtNow(context.Background()) - // Flush returns the WRITE_ERROR from the server. - flushErr := s.Flush(context.Background()) + // Flush no longer waits for the ACK, so the server's PARSE_ERROR + // surfaces on the ACK-confirmation path (AwaitAckedFsn) or, racily, + // already on FlushAndGetSequence. Accept it from either. + fsn, flushErr := s.FlushAndGetSequence(context.Background()) + if flushErr == nil { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + flushErr = s.AwaitAckedFsn(ctx, fsn) + } if flushErr == nil { t.Fatal("expected flush error, got nil") } @@ -1851,7 +1820,7 @@ func TestQwpAsyncAutoFlushNonBlocking(t *testing.T) { wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") // window=4, autoFlushRows=10 - s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{}, 0, 10, 0, nil, 4) + s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath}, 10, 0, nil, 4) if err != nil { t.Fatal(err) } @@ -1868,13 +1837,13 @@ func TestQwpAsyncAutoFlushNonBlocking(t *testing.T) { // All 30 rows have been inserted. The user goroutine returned // from AtNow without blocking. 
Verify that multiple batches are - // in-flight (enqueued but not yet ACKed). - s.asyncState.mu.Lock() - count := s.asyncState.inFlightCount - s.asyncState.mu.Unlock() - - if count < 2 { - t.Fatalf("expected at least 2 batches in-flight concurrently, got %d", count) + // in-flight (published into the engine but not yet ACKed). + pub := s.cursorEngine.enginePublishedFsn() + acked := s.cursorEngine.engineAckedFsn() + inFlight := pub - acked + if inFlight < 2 { + t.Fatalf("expected at least 2 batches in-flight concurrently, got %d (published=%d acked=%d)", + inFlight, pub, acked) } // Release the gate so the server can ACK all batches. @@ -1951,8 +1920,9 @@ func TestQwpAuthHeaderFormat(t *testing.T) { wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") opts := qwpTransportOpts{ authorization: "Bearer my_token", + endpointPath: qwpWritePath, } - s, err := newQwpLineSender(context.Background(), wsURL, opts, 0, 0, 0, nil) + s, err := newQwpLineSender(context.Background(), wsURL, opts, 0, 0, nil) if err != nil { t.Fatal(err) } @@ -1988,8 +1958,9 @@ func TestQwpAuthHeaderFormat(t *testing.T) { wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") opts := qwpTransportOpts{ authorization: "Basic YWRtaW46cXVlc3Q=", // base64("admin:quest") + endpointPath: qwpWritePath, } - s, err := newQwpLineSender(context.Background(), wsURL, opts, 0, 0, 0, nil) + s, err := newQwpLineSender(context.Background(), wsURL, opts, 0, 0, nil) if err != nil { t.Fatal(err) } @@ -2099,7 +2070,7 @@ func TestQwpMaxBufSizeTriggersFlush(t *testing.T) { defer srv.Close() wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") - s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{}, 0, 0, 0, nil) + s, err := newQwpLineSender(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, nil) if err != nil { t.Fatal(err) } @@ -2119,10 +2090,10 @@ func TestQwpMaxBufSizeTriggersFlush(t *testing.T) { } } - // Explicit flush for remaining rows. 
- if err := s.Flush(context.Background()); err != nil { - t.Fatalf("Flush: %v", err) - } + // Explicit flush for remaining rows, then await delivery — the + // messageCount assertion needs the frames on the wire, and Flush + // no longer blocks on the ACK. + flushAndAwaitAck(t, s) // We should have received at least 2 messages: one from the // maxBufSize-triggered flush and one from the explicit Flush. @@ -2378,3 +2349,62 @@ func TestQwpSenderAtAndAtNanoConflict(t *testing.T) { t.Fatalf("unexpected error: %v", err) } } + +// TestQwpSenderObservabilityCounters verifies the spec §20 counter +// accessors are wired through the QwpSender interface to the +// underlying send loop / engine / drainer pool. A fresh sender on a +// happy-path test server should report zero on every counter both +// before and after a successful flush, and BackgroundDrainers() +// should be nil on a memory-backed sender (no SF, no orphan +// adoption). +func TestQwpSenderObservabilityCounters(t *testing.T) { + srv := newQwpTestServer(t) + defer srv.Close() + s := newQwpSenderForTest(t, srv.URL) + defer s.Close(context.Background()) + + // Reach the accessors through the interface to lock the public + // surface in place — a missing method would fail to compile. 
+ var qs QwpSender = s + + if got := qs.TotalReconnectAttempts(); got != 0 { + t.Fatalf("TotalReconnectAttempts on fresh sender = %d, want 0", got) + } + if got := qs.TotalReconnectsSucceeded(); got != 0 { + t.Fatalf("TotalReconnectsSucceeded on fresh sender = %d, want 0", got) + } + if got := qs.TotalFramesReplayed(); got != 0 { + t.Fatalf("TotalFramesReplayed on fresh sender = %d, want 0", got) + } + if got := qs.TotalBackpressureStalls(); got != 0 { + t.Fatalf("TotalBackpressureStalls on fresh sender = %d, want 0", got) + } + if got := qs.BackgroundDrainers(); got != nil { + t.Fatalf("BackgroundDrainers on memory-backed sender = %v, want nil", got) + } + + if err := qs.Table("t").Int64Column("v", 1).AtNow(context.Background()); err != nil { + t.Fatalf("AtNow: %v", err) + } + if err := qs.Flush(context.Background()); err != nil { + t.Fatalf("Flush: %v", err) + } + + // A clean flush against the happy-path server must not have + // triggered any reconnects, replays, or backpressure stalls. 
+ if got := qs.TotalReconnectAttempts(); got != 0 { + t.Fatalf("TotalReconnectAttempts after clean flush = %d, want 0", got) + } + if got := qs.TotalReconnectsSucceeded(); got != 0 { + t.Fatalf("TotalReconnectsSucceeded after clean flush = %d, want 0", got) + } + if got := qs.TotalFramesReplayed(); got != 0 { + t.Fatalf("TotalFramesReplayed after clean flush = %d, want 0", got) + } + if got := qs.TotalBackpressureStalls(); got != 0 { + t.Fatalf("TotalBackpressureStalls after clean flush = %d, want 0", got) + } + if got := qs.BackgroundDrainers(); got != nil { + t.Fatalf("BackgroundDrainers after clean flush = %v, want nil", got) + } +} diff --git a/qwp_server_info.go b/qwp_server_info.go new file mode 100644 index 00000000..161e62e1 --- /dev/null +++ b/qwp_server_info.go @@ -0,0 +1,234 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import "fmt" + +// QwpServerInfo is the decoded SERVER_INFO frame the server emits as +// the first WebSocket frame after the upgrade handshake. 
The +// QwpQueryClient.ServerInfo() accessor returns nil when the client did +// not consume it (serverInfoTimeout disabled) or the server sent no +// parseable frame. +// +// All fields are populated from a single decode pass; the struct is +// immutable from the user's perspective and safe to share across +// goroutines once published. +type QwpServerInfo struct { + // Role is the server's replication role byte. Compare against the + // QwpRole* constants or feed to RoleName for a human-readable form. + // Drives target= filtering on multi-endpoint connections. + Role byte + // Epoch is a monotonic counter that advances across role + // transitions on the same node (replica → primary, primary → + // replica). Clients tracking a specific primary use it to refuse a + // stale reconnect that lands on a node which no longer believes it + // is primary at the current cluster epoch. 0 on releases without + // fencing wired up; treat as a hint. + Epoch uint64 + // Capabilities is the server capability bitfield from SERVER_INFO. + // The only bit currently defined is CAP_ZONE (QwpCapZone): when + // set, the frame carries a zone_id trailer after node_id. + Capabilities uint32 + // ServerWallNs is the server wall-clock at the time SERVER_INFO was + // emitted, in nanoseconds since the Unix epoch. + ServerWallNs int64 + // ClusterId is a free-form identifier supplied by the server + // operator. Surfaced in error messages and diagnostics. + ClusterId string + // NodeId is a free-form per-node identifier supplied by the server + // operator. Distinct nodes in the same cluster carry distinct + // values; surfaced in error messages and diagnostics. + NodeId string + // ZoneId is the server's zone identifier, populated when + // Capabilities & QwpCapZone is set (failover.md §2). The + // comparison against the client's configured zone= is + // case-insensitive. Empty when the server did not opt into + // CAP_ZONE; in that case the host's tracker tier stays Unknown. 
+ ZoneId string +} + +// RoleName returns the human-readable name for the role byte. Unknown +// values surface as "UNKNOWN(0xNN)" so diagnostics never lose information. +// Mirrors Java QwpServerInfo.roleName. +func (s *QwpServerInfo) RoleName() string { + return qwpRoleName(s.Role) +} + +// qwpRoleName is the package-internal helper that powers RoleName and +// the role-mismatch error formatter. Lives at package scope so callers +// without a populated *QwpServerInfo (e.g. role-mismatch error paths +// that have only the byte) can reuse the same names. +func qwpRoleName(role byte) string { + switch role { + case qwpRoleStandalone: + return "STANDALONE" + case qwpRolePrimary: + return "PRIMARY" + case qwpRoleReplica: + return "REPLICA" + case qwpRolePrimaryCatchup: + return "PRIMARY_CATCHUP" + default: + return fmt.Sprintf("UNKNOWN(0x%02X)", role) + } +} + +// String returns a human-readable summary of the SERVER_INFO contents. +// Used by diagnostics and error messages; not parsed. +func (s *QwpServerInfo) String() string { + return fmt.Sprintf( + "QwpServerInfo{role=%s, epoch=%d, clusterId=%q, nodeId=%q, capabilities=0x%X, serverWallNs=%d}", + s.RoleName(), s.Epoch, s.ClusterId, s.NodeId, s.Capabilities, s.ServerWallNs, + ) +} + +// decodeServerInfo parses the SERVER_INFO frame off the wire. The +// payload is the full QWP message (12-byte header + msg_kind + body) +// as delivered by the WebSocket transport. The decoder validates the +// magic / version / table_count / msg_kind, then reads the body fields little-endian +// with bounds checks on every length-prefixed string so a hostile +// u16 length cannot drag bytes from outside the frame. +// +// negotiatedVersion is the QWP wire-protocol version selected by the +// HTTP upgrade handshake. The frame's header version byte must equal +// it exactly — spec §3 forbids a version-byte mismatch on any +// server-to-client frame. +// +// Mirrors Java QwpServerInfoDecoder.decode. 
+func decodeServerInfo(payload []byte, negotiatedVersion byte) (*QwpServerInfo, error) { + if len(payload) < qwpHeaderSize+1 { + return nil, newQwpDecodeError(fmt.Sprintf( + "SERVER_INFO frame too short: %d bytes (need >= %d)", + len(payload), qwpHeaderSize+1)) + } + // Validate the QWP header before trusting any of the body bytes. + // Mirrors parseFrameHeader's guards in qwp_query_decoder.go but + // avoids the decoder's per-frame state writes (deltaOn / gorillaOn + // / zstdOn) since SERVER_INFO carries none of those flags. + if magic := uint32(payload[0]) | uint32(payload[1])<<8 | + uint32(payload[2])<<16 | uint32(payload[3])<<24; magic != qwpMagic { + return nil, newQwpDecodeError(fmt.Sprintf( + "SERVER_INFO bad magic 0x%08X", magic)) + } + if payload[4] != negotiatedVersion { + return nil, newQwpDecodeError(fmt.Sprintf( + "SERVER_INFO frame version %d does not match negotiated version %d", + payload[4], negotiatedVersion)) + } + // Spec §4: table_count is 0 on every non-RESULT_BATCH frame. 
+ tableCount := uint16(payload[qwpHeaderOffsetTableCount]) | + uint16(payload[qwpHeaderOffsetTableCount+1])<<8 + if tableCount != 0 { + return nil, newQwpDecodeError(fmt.Sprintf( + "SERVER_INFO frame table_count = %d, expected 0", tableCount)) + } + + br := qwpByteReader{} + br.reset(payload[qwpHeaderSize:]) + kindByte, err := br.readByte() + if err != nil { + return nil, err + } + if qwpMsgKind(kindByte) != qwpMsgKindServerInfo { + return nil, newQwpDecodeError(fmt.Sprintf( + "expected SERVER_INFO msg_kind 0x%02X, got 0x%02X", + byte(qwpMsgKindServerInfo), kindByte)) + } + role, err := br.readByte() + if err != nil { + return nil, err + } + epoch, err := br.readUint64LE() + if err != nil { + return nil, err + } + capabilities, err := br.readUint32LE() + if err != nil { + return nil, err + } + serverWallNs, err := br.readInt64LE() + if err != nil { + return nil, err + } + clusterId, err := readUtf8U16(&br, "cluster_id") + if err != nil { + return nil, err + } + nodeId, err := readUtf8U16(&br, "node_id") + if err != nil { + return nil, err + } + // Optional zone_id, gated by CAP_ZONE in capabilities. Servers + // that haven't opted into CAP_ZONE end the frame at node_id; + // servers that have opted in append a u16-length-prefixed UTF-8 + // zone identifier (failover.md §5). The reader's bounds checks + // in readUtf8U16 guard against a hostile length declaring more + // bytes than the frame contains. + var zoneId string + if capabilities&qwpCapZone != 0 { + zoneId, err = readUtf8U16(&br, "zone_id") + if err != nil { + return nil, err + } + } + return &QwpServerInfo{ + Role: role, + Epoch: epoch, + Capabilities: capabilities, + ServerWallNs: serverWallNs, + ClusterId: clusterId, + NodeId: nodeId, + ZoneId: zoneId, + }, nil +} + +// readUtf8U16 reads a u16-length-prefixed UTF-8 string from the +// reader. The length is bounds-checked against the reader's remaining +// bytes before allocation so a hostile length cannot trigger an +// out-of-bounds slice. 
fieldName is woven into the error for +// diagnostic clarity. +func readUtf8U16(br *qwpByteReader, fieldName string) (string, error) { + n, err := br.readUint16LE() + if err != nil { + return "", wrapQwpDecodeError( + fmt.Sprintf("SERVER_INFO truncated reading %s length", fieldName), + err) + } + if int(n) > br.remaining() { + return "", newQwpDecodeError(fmt.Sprintf( + "SERVER_INFO %s length %d exceeds frame remainder %d", + fieldName, n, br.remaining())) + } + if n == 0 { + return "", nil + } + bytes, err := br.slice(int(n)) + if err != nil { + return "", err + } + // Copy out of the aliasing slice so the returned string survives + // the recv buffer's lifecycle. + return string(bytes), nil +} diff --git a/qwp_server_info_test.go b/qwp_server_info_test.go new file mode 100644 index 00000000..2c76d887 --- /dev/null +++ b/qwp_server_info_test.go @@ -0,0 +1,295 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ******************************************************************************/ + +package questdb + +import ( + "strings" + "testing" +) + +func TestQwpServerInfoRoleName(t *testing.T) { + cases := []struct { + role byte + want string + }{ + {qwpRoleStandalone, "STANDALONE"}, + {qwpRolePrimary, "PRIMARY"}, + {qwpRoleReplica, "REPLICA"}, + {qwpRolePrimaryCatchup, "PRIMARY_CATCHUP"}, + {0xFF, "UNKNOWN(0xFF)"}, + {0x42, "UNKNOWN(0x42)"}, + } + for _, tc := range cases { + got := qwpRoleName(tc.role) + if got != tc.want { + t.Errorf("qwpRoleName(0x%02X) = %q, want %q", tc.role, got, tc.want) + } + s := &QwpServerInfo{Role: tc.role} + if s.RoleName() != tc.want { + t.Errorf("(*QwpServerInfo).RoleName() = %q, want %q", s.RoleName(), tc.want) + } + } +} + +// buildServerInfoFrame produces a full SERVER_INFO QWP message (12-byte +// header + body) for tests. flagBits is OR-ed onto the header flags so +// negative tests can craft hostile shapes; pass 0 for the conformant +// frame servers actually emit. +func buildServerInfoFrame(version byte, flagBits byte, role byte, epoch uint64, capabilities uint32, serverWallNs int64, clusterId, nodeId string) []byte { + body := []byte{} + body = append(body, byte(qwpMsgKindServerInfo)) + body = append(body, role) + body = appendUint64LE(body, epoch) + body = appendUint32LE(body, capabilities) + body = appendInt64LE(body, serverWallNs) + body = appendUint16LE(body, uint16(len(clusterId))) + body = append(body, clusterId...) + body = appendUint16LE(body, uint16(len(nodeId))) + body = append(body, nodeId...) + + header := make([]byte, qwpHeaderSize) + // magic + magic := uint32(qwpMagic) + header[0] = byte(magic) + header[1] = byte(magic >> 8) + header[2] = byte(magic >> 16) + header[3] = byte(magic >> 24) + header[4] = version + header[qwpHeaderOffsetFlags] = flagBits + // tableCount (uint16 LE) at offset 6. Spec §4 mandates 0 on every + // non-RESULT_BATCH kind, including SERVER_INFO; leaving the bytes + // zero satisfies that. 
// appendUint16LE appends v to buf as two bytes, least-significant byte
// first, and returns the extended slice.
func appendUint16LE(buf []byte, v uint16) []byte {
	le := [2]byte{byte(v), byte(v >> 8)}
	return append(buf, le[:]...)
}

// appendUint32LE appends v to buf as four bytes, least-significant byte
// first, and returns the extended slice.
func appendUint32LE(buf []byte, v uint32) []byte {
	le := [4]byte{byte(v), byte(v >> 8), byte(v >> 16), byte(v >> 24)}
	return append(buf, le[:]...)
}

// appendUint64LE appends v to buf as eight bytes, least-significant byte
// first, and returns the extended slice.
func appendUint64LE(buf []byte, v uint64) []byte {
	var le [8]byte
	for i := uint(0); i < 8; i++ {
		le[i] = byte(v >> (8 * i))
	}
	// One append of all eight bytes, matching a single variadic append.
	return append(buf, le[:]...)
}

// appendInt64LE appends the two's-complement bit pattern of v to buf in
// little-endian order by delegating to appendUint64LE.
func appendInt64LE(buf []byte, v int64) []byte {
	bits := uint64(v)
	return appendUint64LE(buf, bits)
}
%q, want empty", info.ClusterId) + } + if info.NodeId != "" { + t.Errorf("NodeId = %q, want empty", info.NodeId) + } +} + +func TestQwpServerInfoDecodeRejectsVersionMismatch(t *testing.T) { + // Spec §3 requires the SERVER_INFO header version byte to equal the + // version negotiated during the HTTP upgrade. QWP has a single + // version (qwpVersion), so the decoder rejects any frame whose + // header version byte differs from the negotiated value. + cases := []struct { + name string + frameVersion byte + negotiatedVersion byte + }{ + {"frame_v0_conn_v1", 0x00, qwpVersion}, + {"frame_v2_conn_v1", 0x02, qwpVersion}, + {"too_new_frame", 0xFF, qwpVersion}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + frame := buildServerInfoFrame(tc.frameVersion, 0, + qwpRoleStandalone, 0, 0, 0, "", "") + _, err := decodeServerInfo(frame, tc.negotiatedVersion) + if err == nil { + t.Fatalf("decoder accepted version=0x%02X on a "+ + "v0x%02X-negotiated connection", + tc.frameVersion, tc.negotiatedVersion) + } + if !strings.Contains(err.Error(), "does not match negotiated version") { + t.Errorf("error = %v, want version-mismatch message", err) + } + }) + } +} + +func TestQwpServerInfoDecodeRejectsBadMagic(t *testing.T) { + frame := buildServerInfoFrame(qwpVersion, 0, + qwpRoleStandalone, 0, 0, 0, "", "") + frame[0] = 0x00 // corrupt magic + _, err := decodeServerInfo(frame, qwpVersion) + if err == nil { + t.Fatal("decoder accepted bad magic") + } + if !strings.Contains(err.Error(), "bad magic") { + t.Errorf("error = %v, want bad magic", err) + } +} + +func TestQwpServerInfoDecodeRejectsNonZeroTableCount(t *testing.T) { + // Spec §4 mandates table_count = 0 on every non-RESULT_BATCH + // frame. SERVER_INFO is no exception — a server that smuggles a + // non-zero value here is malformed and must be rejected before any + // body bytes are trusted. 
+ frame := buildServerInfoFrame(qwpVersion, 0, + qwpRoleStandalone, 0, 0, 0, "", "") + frame[qwpHeaderOffsetTableCount] = 1 + _, err := decodeServerInfo(frame, qwpVersion) + if err == nil { + t.Fatal("decoder accepted non-zero table_count") + } + if !strings.Contains(err.Error(), "table_count") { + t.Errorf("error = %v, want table_count", err) + } +} + +func TestQwpServerInfoDecodeRejectsWrongMsgKind(t *testing.T) { + frame := buildServerInfoFrame(qwpVersion, 0, + qwpRoleStandalone, 0, 0, 0, "", "") + frame[qwpHeaderSize] = byte(qwpMsgKindResultBatch) + _, err := decodeServerInfo(frame, qwpVersion) + if err == nil { + t.Fatal("decoder accepted wrong msg_kind") + } + if !strings.Contains(err.Error(), "expected SERVER_INFO msg_kind") { + t.Errorf("error = %v, want expected SERVER_INFO msg_kind", err) + } +} + +func TestQwpServerInfoDecodeRejectsTruncatedFrame(t *testing.T) { + // Try truncating at every offset from 0 through one short of full + // frame length; every truncation should produce a decode error. + full := buildServerInfoFrame(qwpVersion, 0, + qwpRolePrimary, 5, 0, 1234, "abc", "n1") + for cut := 0; cut < len(full); cut++ { + _, err := decodeServerInfo(full[:cut], qwpVersion) + if err == nil { + t.Errorf("truncated frame of length %d decoded without error", cut) + } + } +} + +func TestQwpServerInfoDecodeRejectsOversizedClusterId(t *testing.T) { + // Hand-craft a frame whose cluster_id u16 length claims more + // bytes than the frame contains. + frame := buildServerInfoFrame(qwpVersion, 0, + qwpRolePrimary, 0, 0, 0, "abc", "node") + // cluster_id length lives at qwpHeaderSize + 1 (kind) + 1 (role) + // + 8 (epoch) + 4 (caps) + 8 (wallNs). 
+ clusterLenOffset := qwpHeaderSize + 1 + 1 + 8 + 4 + 8 + frame[clusterLenOffset] = 0xFF + frame[clusterLenOffset+1] = 0xFF + _, err := decodeServerInfo(frame, qwpVersion) + if err == nil { + t.Fatal("decoder accepted oversized cluster_id length") + } + if !strings.Contains(err.Error(), "exceeds frame remainder") { + t.Errorf("error = %v, want exceeds frame remainder", err) + } +} + +func TestQwpServerInfoDecodeRejectsOversizedNodeId(t *testing.T) { + frame := buildServerInfoFrame(qwpVersion, 0, + qwpRolePrimary, 0, 0, 0, "abc", "node") + // node_id length lives right after cluster_id bytes. cluster_id + // is "abc" (3 bytes) so node_id length offset = clusterLenOffset + // + 2 + 3. + nodeLenOffset := qwpHeaderSize + 1 + 1 + 8 + 4 + 8 + 2 + 3 + frame[nodeLenOffset] = 0xFF + frame[nodeLenOffset+1] = 0xFF + _, err := decodeServerInfo(frame, qwpVersion) + if err == nil { + t.Fatal("decoder accepted oversized node_id length") + } + if !strings.Contains(err.Error(), "exceeds frame remainder") { + t.Errorf("error = %v, want exceeds frame remainder", err) + } +} + +func TestQwpServerInfoStringContainsKeyFields(t *testing.T) { + info := &QwpServerInfo{ + Role: qwpRolePrimary, + Epoch: 42, + Capabilities: 0xCAFE, + ServerWallNs: 1234567890, + ClusterId: "alpha", + NodeId: "beta", + } + s := info.String() + for _, want := range []string{"PRIMARY", "epoch=42", "alpha", "beta", "0xCAFE"} { + if !strings.Contains(s, want) { + t.Errorf("String() = %q missing %q", s, want) + } + } +} diff --git a/qwp_serverinfo_probe_test.go b/qwp_serverinfo_probe_test.go new file mode 100644 index 00000000..c79fd047 --- /dev/null +++ b/qwp_serverinfo_probe_test.go @@ -0,0 +1,98 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright 
(c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +// TestQwpServerInfoIsEgressOnly pins a wire-protocol invariant: the +// server delivers an unsolicited SERVER_INFO frame as the first frame +// only on the egress endpoint (/read/v1). The ingest endpoint +// (/write/v4) sends no SERVER_INFO and the client never expects one — +// it sends data right after the upgrade and the first inbound frame is +// an ACK. This is why the ingest path leaves serverInfoTimeout=0 and +// readAck() reads the first frame as an ACK without skipping a +// SERVER_INFO. +// +// The test opts into a synchronous first-frame read (serverInfoTimeout +// > 0) on each endpoint: +// - egress MUST return a SERVER_INFO frame (control: proves the probe +// and server are healthy); +// - ingest MUST time out the post-upgrade read (the server sends +// nothing until the client speaks). +// +// If the ingest assertion ever fails, the server has started emitting +// SERVER_INFO on /write/v4, and the ingest read path must be changed to +// consume and discard it before the ACK loop. Source of truth: +// connect/wire-protocols/qwp-{ingress,egress}-websocket.md. 
+// +// Run against a live server, e.g.: +// +// QDB_FUZZ_ADDR=localhost:9000 go test -v -run TestQwpServerInfoIsEgressOnly . +func TestQwpServerInfoIsEgressOnly(t *testing.T) { + qwpEnsureServer(t) + + probe := func(label, path string) (*QwpServerInfo, error) { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + var tr qwpTransport + err := tr.connect(ctx, "ws://"+qwpTestAddr, qwpTransportOpts{ + endpointPath: path, + maxVersion: qwpVersion, + serverInfoTimeout: 3 * time.Second, + }) + defer tr.close() + + t.Logf("[%-7s %s]: serverInfo=%v err=%v", label, path, tr.serverInfo, err) + return tr.serverInfo, err + } + + // Control: egress must deliver SERVER_INFO as the first frame. + egInfo, egErr := probe("egress", qwpReadPath) + require.NoError(t, egErr, "egress control: SERVER_INFO read should succeed on /read/v1") + require.NotNil(t, egInfo, + "egress control: expected a SERVER_INFO frame on /read/v1 (if nil, the probe itself is broken)") + + // Invariant: ingest must NOT deliver SERVER_INFO. The upgrade + // succeeds, then the first-frame read times out because the server + // sends nothing until the client does. "SERVER_INFO read failed" in + // the error confirms the upgrade completed and it is the post-upgrade + // read that timed out (an upgrade reject would surface a different + // error before this point). + inInfo, inErr := probe("ingest", qwpWritePath) + require.Nil(t, inInfo, + "ingest must NOT receive SERVER_INFO (spec: ingress is role/zone-blind). 
"+ + "If non-nil, the server now emits SERVER_INFO on /write/v4 and the ingest "+ + "read path must consume/discard it before the ACK loop.") + require.Error(t, inErr, "ingest: the post-upgrade first-frame read must time out") + require.Contains(t, inErr.Error(), "SERVER_INFO read failed", + "ingest: upgrade should succeed, then the first-frame read should time out") +} diff --git a/qwp_sf_ack_watermark.go b/qwp_sf_ack_watermark.go new file mode 100644 index 00000000..01a81677 --- /dev/null +++ b/qwp_sf_ack_watermark.go @@ -0,0 +1,287 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "encoding/binary" + "io" + "math" + "os" + "path/filepath" + "sync" +) + +// qwpSfAckWatermark is the persisted high-water mark for the +// durably-acknowledged FSN. 
// qwpSfAckWatermark is the persisted high-water mark for the
// durably-acknowledged FSN. It lives in the file `.ack-watermark`
// inside the slot directory, alongside the segment files and the slot
// lock, and is read at engine startup to seed ackedFsn — eliminating
// the segment-granular re-replay of partially-acked sealed segments
// across process restarts and across orphan adoption by a different
// client.
//
// The on-disk format is normative and interchangeable with the Java
// client's AckWatermark.java (sf-client.md §5.4, §19): 16 bytes,
// little-endian.
//
//	offset 0: u32 magic = 0x31574B41 ('AKW1', stamped on first write)
//	offset 4: u32 reserved (zero)
//	offset 8: i64 fsn (cumulative durable-ack high-water mark)
//
// Durable acks are cumulative ("everything <= N is durable"), so a
// single monotonic FSN suffices; no per-frame bitmap is needed.
//
// Why the file is OPTIONAL but format normative: a missing or
// bad-magic file makes read() report qwpSfAckWatermarkInvalid and
// recovery falls back to the bare lowestBase-1 seed (no regression).
// A drainer adopting a slot another client populated MUST honour an
// existing watermark; ignoring it re-replays already-durable frames,
// producing row-level duplicates against a still-alive server.
//
// Why no CRC: a stale-low watermark only means more re-replay, and a
// stale-high watermark is rejected by the recovery path's
// max(lowestBase-1, watermark) clamp + publishedFsn bound. fsync is
// intentionally NOT performed — a host crash falls back to the
// segment-derived seed, same as before this feature (no regression).
//
// Concurrency: single-writer after construction (the segment-manager
// goroutine, via persistIfAdvanced). read() runs once at engine
// startup before the manager observes the entry. The mutex guards
// every access against close() so a manager tick that races a slow
// engine shutdown can never store into an unmapped region — Go can't
// lean on the JVM single-thread argument the Java reference uses, and
// an unguarded store-after-munmap is a hard SIGSEGV (and a -race
// failure). The lock is uncontended in the steady state and is off
// the producer hot path entirely (manager-tick cadence), so it does
// not affect BenchmarkQwpSenderSteadyState.
type qwpSfAckWatermark struct {
	// mu serialises read, persistIfAdvanced and close.
	mu sync.Mutex
	// file is the open watermark file; close() closes it and sets it
	// to nil.
	file *os.File
	// buf is the mapped 16-byte region backing the file; close()
	// unmaps it and sets it to nil.
	buf []byte
	// closed is set by close(); read and persistIfAdvanced check it
	// under mu and do not touch buf once it is true.
	closed bool

	// magicWritten flips once — at open() if a prior session already
	// stamped the magic, or on the first store that observes it unset.
	// After it flips, stores degenerate to a single 8-byte FSN put.
	magicWritten bool

	// lastPersistedAck is the highest FSN written so far this session.
	// Gates persistIfAdvanced so a steady ackedFsn doesn't dirty the
	// mapped page every manager tick. -1 until the first store.
	lastPersistedAck int64
}

// qwpSfAckWatermark on-disk constants. The magic and offsets are
// normative — they MUST match the Java client so a slot written by
// one client is honoured by a drainer from the other.
const (
	qwpSfAckWatermarkFileName       = ".ack-watermark"
	qwpSfAckWatermarkFileSize int64 = 16
	// qwpSfAckWatermarkMagic is 'AKW1' little-endian. A different
	// value at offset 0 means "no usable watermark" (freshly
	// zero-filled file, or corruption) and read() reports INVALID.
	qwpSfAckWatermarkMagic       uint32 = 0x31574B41
	qwpSfAckWatermarkMagicOffset int64  = 0
	qwpSfAckWatermarkFsnOffset   int64  = 8
)
+// math.MinInt64 so max(watermark, lowestBase-1) always picks the +// segment seed in that case. +const qwpSfAckWatermarkInvalid int64 = math.MinInt64 + +// qwpSfAckWatermarkOpen opens (creating if absent) the watermark file +// in slotDir and maps its 16 bytes for the engine's lifetime. Returns +// nil on any setup failure (empty dir, open/allocate/mmap error) — the +// caller falls back to the no-watermark behaviour, no error escapes +// (the watermark is an optimisation, never a correctness dependency). +// +// An existing, correctly-sized file is opened read-write WITHOUT +// truncation so the previous session's (or another client's) FSN +// survives — defeating which is the whole point of the feature. +// A missing or wrong-sized file is (re)created at FILE_SIZE with zero +// magic, so the first read() reports INVALID until the first store. +func qwpSfAckWatermarkOpen(slotDir string) *qwpSfAckWatermark { + if slotDir == "" { + return nil + } + path := filepath.Join(slotDir, qwpSfAckWatermarkFileName) + st, statErr := os.Stat(path) + var ( + f *os.File + err error + ) + if statErr == nil && st.Size() == qwpSfAckWatermarkFileSize { + // Preserve the existing watermark bytes, but force a real disk + // block under the mapping. A foreign watermark may be sparse + // (truncated to 16 bytes but never block-allocated, or copied + // sparse); mmap'ing it and later storing through it from the + // manager goroutine would SIGBUS on a full disk when the page + // fault cannot back the hole. qwpSfAllocate would no-op here (it + // reserves only the newly-extended range, and the file is + // already full size), so instead read the bytes and write them + // straight back: the write allocates the block (the whole file + // fits one block) and surfaces ENOSPC here, at open, where it + // degrades to the no-watermark fallback — rather than faulting + // the manager. A non-sparse foreign file just rewrites in place. 
+ f, err = os.OpenFile(path, os.O_RDWR, 0o644) + if err != nil { + return nil + } + var preserved [qwpSfAckWatermarkFileSize]byte + if _, err := io.ReadFull(f, preserved[:]); err != nil { + _ = f.Close() + return nil + } + if _, err := f.WriteAt(preserved[:], 0); err != nil { + _ = f.Close() + return nil + } + } else { + // Missing / wrong size: start clean and reserve a real disk + // block via the same allocate contract the segment create path + // uses, so a later store into the mapped region can't SIGBUS on + // a sparse hole. + f, err = os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0o644) + if err != nil { + return nil + } + if allocErr := qwpSfAllocate(f, qwpSfAckWatermarkFileSize); allocErr != nil { + _ = f.Close() + return nil + } + } + buf, mmapErr := qwpSfMmapRW(f, qwpSfAckWatermarkFileSize) + if mmapErr != nil { + _ = f.Close() + return nil + } + magic := binary.LittleEndian.Uint32(buf[qwpSfAckWatermarkMagicOffset : qwpSfAckWatermarkMagicOffset+4]) + return &qwpSfAckWatermark{ + file: f, + buf: buf, + magicWritten: magic == qwpSfAckWatermarkMagic, + lastPersistedAck: -1, + } +} + +// qwpSfAckWatermarkRemoveOrphan best-effort removes a stale watermark +// file. Used by the engine when no segments are recovered (a fresh +// disk slot, or after a clean fully-drained shutdown) — a watermark +// with no segments behind it refers to a lifecycle now gone and would +// only confuse the next session's seed. No-op for memory mode. +func qwpSfAckWatermarkRemoveOrphan(slotDir string) { + if slotDir == "" { + return + } + _ = os.Remove(filepath.Join(slotDir, qwpSfAckWatermarkFileName)) +} + +// read returns the persisted FSN, or qwpSfAckWatermarkInvalid if the +// file has never been written (magic field zero) or has been closed. +// Called once at engine startup before the manager observes the entry. 
+func (w *qwpSfAckWatermark) read() int64 { + if w == nil { + return qwpSfAckWatermarkInvalid + } + w.mu.Lock() + defer w.mu.Unlock() + if w.closed { + return qwpSfAckWatermarkInvalid + } + magic := binary.LittleEndian.Uint32(w.buf[qwpSfAckWatermarkMagicOffset : qwpSfAckWatermarkMagicOffset+4]) + if magic != qwpSfAckWatermarkMagic { + // Freshly created (all zeros) or corrupt — fall back to the + // segment-derived seed. + return qwpSfAckWatermarkInvalid + } + return int64(binary.LittleEndian.Uint64(w.buf[qwpSfAckWatermarkFsnOffset : qwpSfAckWatermarkFsnOffset+8])) +} + +// storeLocked writes fsn into the mapped region. Caller MUST hold +// w.mu and have checked !w.closed. FSN is stored before the magic so +// that a reader which observes the magic (stamped second, in program +// order) also observes a valid FSN — no memory fence is needed +// because the same goroutine performs both stores and crash recovery +// resumes a fresh process that sees whatever the kernel flushed. +func (w *qwpSfAckWatermark) storeLocked(fsn int64) { + binary.LittleEndian.PutUint64(w.buf[qwpSfAckWatermarkFsnOffset:qwpSfAckWatermarkFsnOffset+8], uint64(fsn)) + if !w.magicWritten { + binary.LittleEndian.PutUint32(w.buf[qwpSfAckWatermarkMagicOffset:qwpSfAckWatermarkMagicOffset+4], qwpSfAckWatermarkMagic) + w.magicWritten = true + } +} + +// persistIfAdvanced stores fsn iff it advanced past the last value +// persisted this session, returning true if it wrote. The gate keeps +// the dirty-page footprint minimal under steady-state load with no +// new acks arriving. No-op after close. This is the segment manager's +// entry point, called once per maintenance tick BEFORE trim so the +// on-disk ordering recovery's max() clamp relies on holds across a +// crash in either order (sf-client.md §5.4). 
+func (w *qwpSfAckWatermark) persistIfAdvanced(fsn int64) bool { + if w == nil { + return false + } + w.mu.Lock() + defer w.mu.Unlock() + if w.closed || fsn <= w.lastPersistedAck { + return false + } + w.storeLocked(fsn) + w.lastPersistedAck = fsn + return true +} + +// close unmaps the region and closes the fd. Idempotent and +// safe to call concurrently with a manager-tick persistIfAdvanced: +// the mutex serialises them, and a store that loses the race observes +// closed==true and returns without touching the (now unmapped) buffer. +func (w *qwpSfAckWatermark) close() error { + if w == nil { + return nil + } + w.mu.Lock() + defer w.mu.Unlock() + if w.closed { + return nil + } + w.closed = true + var firstErr error + if w.buf != nil { + if err := qwpSfMunmap(w.buf); err != nil { + firstErr = err + } + w.buf = nil + } + if w.file != nil { + if err := w.file.Close(); err != nil && firstErr == nil { + firstErr = err + } + w.file = nil + } + return firstErr +} diff --git a/qwp_sf_ack_watermark_test.go b/qwp_sf_ack_watermark_test.go new file mode 100644 index 00000000..b1925507 --- /dev/null +++ b/qwp_sf_ack_watermark_test.go @@ -0,0 +1,295 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "context" + "encoding/binary" + "os" + "path/filepath" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// writeForeignAckWatermark hand-writes the 16 normative bytes a +// different client (e.g. the Java reference's AckWatermark.java) would +// leave on disk: magic 'AKW1' little-endian at offset 0, reserved 0 +// at offset 4, the FSN little-endian at offset 8. Used to prove the Go +// client honours a watermark it did not itself write (sf-client.md +// §19 interop). +func writeForeignAckWatermark(t *testing.T, slotDir string, fsn int64) { + t.Helper() + buf := make([]byte, qwpSfAckWatermarkFileSize) + binary.LittleEndian.PutUint32(buf[0:4], qwpSfAckWatermarkMagic) + // bytes[4:8] reserved == 0 + binary.LittleEndian.PutUint64(buf[8:16], uint64(fsn)) + path := filepath.Join(slotDir, qwpSfAckWatermarkFileName) + require.NoError(t, os.WriteFile(path, buf, 0o644)) +} + +func readAckWatermarkFileBytes(t *testing.T, slotDir string) []byte { + t.Helper() + b, err := os.ReadFile(filepath.Join(slotDir, qwpSfAckWatermarkFileName)) + require.NoError(t, err) + return b +} + +func TestQwpSfAckWatermarkFreshFileIsInvalid(t *testing.T) { + dir := t.TempDir() + w := qwpSfAckWatermarkOpen(dir) + require.NotNil(t, w) + defer func() { _ = w.close() }() + + assert.Equal(t, qwpSfAckWatermarkInvalid, w.read(), + "a freshly created (zero-filled) watermark must read INVALID") + + b := readAckWatermarkFileBytes(t, dir) + require.Len(t, b, int(qwpSfAckWatermarkFileSize)) + assert.Equal(t, make([]byte, qwpSfAckWatermarkFileSize), b, + "open() must not stamp anything until the first persist") +} + +func TestQwpSfAckWatermarkPersistGateAndFormat(t *testing.T) { + dir := t.TempDir() + w := 
qwpSfAckWatermarkOpen(dir) + require.NotNil(t, w) + + assert.True(t, w.persistIfAdvanced(7), "first advance writes") + assert.False(t, w.persistIfAdvanced(7), "same value does not re-write") + assert.False(t, w.persistIfAdvanced(3), "a regression never writes") + assert.True(t, w.persistIfAdvanced(9), "a higher value writes") + assert.Equal(t, int64(9), w.read()) + require.NoError(t, w.close()) + + // On-disk bytes must match the normative little-endian layout so a + // Java drainer can read them. + b := readAckWatermarkFileBytes(t, dir) + require.Len(t, b, 16) + assert.Equal(t, qwpSfAckWatermarkMagic, binary.LittleEndian.Uint32(b[0:4])) + assert.Equal(t, uint32(0), binary.LittleEndian.Uint32(b[4:8]), "reserved must be zero") + assert.Equal(t, int64(9), int64(binary.LittleEndian.Uint64(b[8:16]))) + + // Reopen preserves the value (magic already stamped). + w2 := qwpSfAckWatermarkOpen(dir) + require.NotNil(t, w2) + defer func() { _ = w2.close() }() + assert.Equal(t, int64(9), w2.read()) + // lastPersistedAck resets per session, but the gate still honours + // the on-disk value's monotonicity once we advance past it. 
+ assert.False(t, w2.persistIfAdvanced(-1)) + assert.True(t, w2.persistIfAdvanced(10)) + assert.Equal(t, int64(10), w2.read()) +} + +func TestQwpSfAckWatermarkHonoursForeignBytes(t *testing.T) { + dir := t.TempDir() + writeForeignAckWatermark(t, dir, 42) + + w := qwpSfAckWatermarkOpen(dir) + require.NotNil(t, w) + defer func() { _ = w.close() }() + assert.Equal(t, int64(42), w.read(), + "a watermark written by another client must be read byte-for-byte") +} + +func TestQwpSfAckWatermarkBadMagicIsInvalid(t *testing.T) { + dir := t.TempDir() + buf := make([]byte, qwpSfAckWatermarkFileSize) + binary.LittleEndian.PutUint32(buf[0:4], 0xDEADBEEF) + binary.LittleEndian.PutUint64(buf[8:16], uint64(123)) + require.NoError(t, os.WriteFile(filepath.Join(dir, qwpSfAckWatermarkFileName), buf, 0o644)) + + w := qwpSfAckWatermarkOpen(dir) + require.NotNil(t, w) + defer func() { _ = w.close() }() + assert.Equal(t, qwpSfAckWatermarkInvalid, w.read(), + "a wrong-magic file must read INVALID so recovery falls back") +} + +func TestQwpSfAckWatermarkWrongSizeRecreated(t *testing.T) { + dir := t.TempDir() + // A truncated/garbage 4-byte file: mmapping its full 16 bytes would + // SIGBUS, so open() must recreate it at FILE_SIZE. 
+ require.NoError(t, os.WriteFile(filepath.Join(dir, qwpSfAckWatermarkFileName), + []byte{1, 2, 3, 4}, 0o644)) + + w := qwpSfAckWatermarkOpen(dir) + require.NotNil(t, w) + defer func() { _ = w.close() }() + assert.Equal(t, qwpSfAckWatermarkInvalid, w.read()) + + st, err := os.Stat(filepath.Join(dir, qwpSfAckWatermarkFileName)) + require.NoError(t, err) + assert.Equal(t, qwpSfAckWatermarkFileSize, st.Size()) +} + +func TestQwpSfAckWatermarkClosedAndNilSafe(t *testing.T) { + dir := t.TempDir() + w := qwpSfAckWatermarkOpen(dir) + require.NotNil(t, w) + require.True(t, w.persistIfAdvanced(5)) + require.NoError(t, w.close()) + + assert.Equal(t, qwpSfAckWatermarkInvalid, w.read(), "read after close is INVALID") + assert.False(t, w.persistIfAdvanced(99), "persist after close is a no-op") + assert.NoError(t, w.close(), "close is idempotent") + + var nilW *qwpSfAckWatermark + assert.Equal(t, qwpSfAckWatermarkInvalid, nilW.read()) + assert.False(t, nilW.persistIfAdvanced(1)) + assert.NoError(t, nilW.close()) + + assert.Nil(t, qwpSfAckWatermarkOpen(""), "empty slot dir yields no watermark") +} + +func TestQwpSfAckWatermarkRemoveOrphan(t *testing.T) { + dir := t.TempDir() + w := qwpSfAckWatermarkOpen(dir) + require.NotNil(t, w) + require.True(t, w.persistIfAdvanced(1)) + require.NoError(t, w.close()) + + path := filepath.Join(dir, qwpSfAckWatermarkFileName) + _, err := os.Stat(path) + require.NoError(t, err) + + qwpSfAckWatermarkRemoveOrphan(dir) + _, err = os.Stat(path) + assert.True(t, os.IsNotExist(err), "removeOrphan must unlink the file") + + // Best-effort: must not panic on a missing file or empty dir. 
+ qwpSfAckWatermarkRemoveOrphan(dir) + qwpSfAckWatermarkRemoveOrphan("") +} + +// TestQwpSfEngineRecoveryHonoursForeignWatermark is the regression +// test for the review: a Go engine (the same path a drainer uses to +// adopt an orphan slot) recovering a slot whose .ack-watermark was +// written by another client MUST seed ackedFsn from it, so replay +// resumes past the already-durable prefix instead of re-sending every +// frame in the lowest surviving segment (row-level duplicates against +// a still-alive server). +func TestQwpSfEngineRecoveryHonoursForeignWatermark(t *testing.T) { + dir := t.TempDir() + const segSize int64 = 4096 + + // Session 1: write 6 frames, close with no acks. Files survive; + // the manager never advanced the watermark (no acks), so it is + // present but zero-magic. + { + e, err := qwpSfNewCursorEngine(dir, segSize, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + for i := 0; i < 6; i++ { + _, err := e.engineAppendBlocking(context.Background(), []byte{byte(i)}) + require.NoError(t, err) + } + require.Equal(t, int64(5), e.enginePublishedFsn()) + require.NoError(t, e.engineClose()) + } + + // A prior client (e.g. the Java reference) received cumulative + // durable acks through FSN 3 and persisted that watermark. + writeForeignAckWatermark(t, dir, 3) + + // Session 2 (== the drainer-adoption code path): the seed must be + // the watermark, not lowestBase-1 (= -1 here). 
+ e2, err := qwpSfNewCursorEngine(dir, segSize, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + defer func() { _ = e2.engineClose() }() + assert.True(t, e2.engineWasRecoveredFromDisk()) + assert.Equal(t, int64(5), e2.enginePublishedFsn()) + assert.Equal(t, int64(3), e2.engineAckedFsn(), + "recovery must honour the foreign .ack-watermark; replay resumes at FSN 4") +} + +// TestQwpSfEngineRecoveryRejectsCorruptWatermark covers the +// sf-client.md §5.4 / §18.1 bound: a watermark above publishedFsn is +// corruption and MUST be ignored, falling back to the segment-derived +// seed so the un-acked tail still replays (no silent data loss). +func TestQwpSfEngineRecoveryRejectsCorruptWatermark(t *testing.T) { + dir := t.TempDir() + const segSize int64 = 4096 + { + e, err := qwpSfNewCursorEngine(dir, segSize, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + for i := 0; i < 4; i++ { + _, err := e.engineAppendBlocking(context.Background(), []byte{byte(i)}) + require.NoError(t, err) + } + require.Equal(t, int64(3), e.enginePublishedFsn()) + require.NoError(t, e.engineClose()) + } + // Watermark FSN 99 >> publishedFsn 3 — bit-rot / torn write. + writeForeignAckWatermark(t, dir, 99) + + e2, err := qwpSfNewCursorEngine(dir, segSize, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + defer func() { _ = e2.engineClose() }() + assert.Equal(t, int64(3), e2.enginePublishedFsn()) + assert.Equal(t, int64(-1), e2.engineAckedFsn(), + "a watermark past publishedFsn must be rejected; tail still replays") +} + +// TestQwpSfEngineWatermarkPersistedByManager proves the write half: +// the segment manager persists ackedFsn so a later Go session (or a +// Go→Go drainer adoption) resumes past the durable prefix too. 
+func TestQwpSfEngineWatermarkPersistedByManager(t *testing.T) { + dir := t.TempDir() + const segSize int64 = 4096 + { + e, err := qwpSfNewCursorEngine(dir, segSize, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + for i := 0; i < 8; i++ { + _, err := e.engineAppendBlocking(context.Background(), []byte{byte(i)}) + require.NoError(t, err) + } + // Ack a prefix only — the slot is NOT fully drained, so the + // files + watermark survive engineClose. + e.engineAcknowledge(4) + + // The manager polls on a ~1ms tick; wait for it to flush the + // watermark through to disk in the normative format. + require.Eventually(t, func() bool { + b, err := os.ReadFile(filepath.Join(dir, qwpSfAckWatermarkFileName)) + if err != nil || len(b) != 16 { + return false + } + return binary.LittleEndian.Uint32(b[0:4]) == qwpSfAckWatermarkMagic && + int64(binary.LittleEndian.Uint64(b[8:16])) == 4 + }, 2*time.Second, 5*time.Millisecond) + + require.NoError(t, e.engineClose()) + } + + e2, err := qwpSfNewCursorEngine(dir, segSize, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + defer func() { _ = e2.engineClose() }() + assert.Equal(t, int64(7), e2.enginePublishedFsn()) + assert.Equal(t, int64(4), e2.engineAckedFsn(), + "the manager-persisted watermark must seed the next session's ackedFsn") +} diff --git a/qwp_sf_allocate.go b/qwp_sf_allocate.go new file mode 100644 index 00000000..721c3c29 --- /dev/null +++ b/qwp_sf_allocate.go @@ -0,0 +1,107 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "fmt" + "os" +) + +// qwpSfReserveNewBlocksFn is the indirection qwpSfAllocate calls to +// reserve real disk blocks. In production it points at the +// platform-specific qwpSfReserveNewBlocks; tests swap it to fault-inject +// a reservation failure (e.g. ENOSPC) without having to actually fill a +// filesystem, then restore the original in a t.Cleanup. Mirrors the Java +// client's FilesFacade seam, where ENOSPC at allocate is fault-injected +// through a test facade (see MmapSegment.create's facade overload). +var qwpSfReserveNewBlocksFn = qwpSfReserveNewBlocks + +// qwpSfAllocate extends f to at least size bytes and reserves real +// disk blocks for the newly-extended range. Mirrors the Java client's +// Files.allocate contract (see java-questdb-client core/src/main/java +// /io/questdb/client/std/Files.java#allocate) so the two implementations +// agree on what an `allocate(fd, size)` call observably does. +// +// Cross-platform contract — identical observable behaviour on Linux, +// macOS, Windows, and the "other unix" stub for any caller that does +// not deliberately produce sparse files: +// +// 1. Never shrinks. Let currentSize be f's current logical size and +// target = max(size, currentSize). Requests where +// size <= currentSize short-circuit as a no-op success — f is +// left exactly as it was, no syscall reaches the kernel. +// 2. Reserves blocks for [currentSize, target). 
Pre-existing sparse +// holes inside [0, currentSize) are not retroactively filled +// (Linux and macOS anchor the reservation at currentSize; Windows' +// FileAllocationInfo is file-scope and will re-reserve the +// existing range too, but a caller relying on hole-filling is +// writing non-portable code). +// 3. Real errors surface as a wrapped error — notably ENOSPC, EFBIG, +// EIO (POSIX) or ERROR_DISK_FULL (Windows). The caller is +// responsible for closing the fd and unlinking the partial file. +// 4. Sparse fallback (Linux / macOS only). When the reservation +// primitive itself reports the filesystem doesn't support it +// (EOPNOTSUPP / EINVAL on Linux; EOPNOTSUPP / ENOTSUP on macOS), +// the call still extends the logical size via ftruncate but +// leaves blocks sparse — the SIGBUS risk re-emerges for that +// filesystem only. Windows has no equivalent fallback; any +// failure is fatal. +// +// Implementation split: this function owns the cross-platform +// invariants (fstat, target computation, short-circuit, post-reserve +// ftruncate). The platform-specific qwpSfReserveNewBlocks owns the +// single concern of "reserve real disk blocks for [currentSize, +// currentSize+newBytes)" on its OS. +func qwpSfAllocate(f *os.File, size int64) error { + st, err := f.Stat() + if err != nil { + return fmt.Errorf("qwp/sf: stat %s: %w", f.Name(), err) + } + currentSize := st.Size() + target := size + if currentSize > target { + target = currentSize + } + if target == currentSize { + // Never-shrinks short-circuit: nothing to extend, nothing to + // reserve. Returning here is what makes the property hold — + // without it the ftruncate below would shrink files when + // size < currentSize. + return nil + } + newBytes := target - currentSize + if err := qwpSfReserveNewBlocksFn(f, currentSize, newBytes); err != nil { + return err + } + // Unified EOF advancement. 
On Linux when fallocate succeeded the + // file is already at target and this is a no-op; on the Linux + // sparse-fallback path and on macOS / Windows it is the call that + // grows the file. Never shrinks because target > currentSize by + // the time we reach here (the short-circuit above covered equal). + if err := f.Truncate(target); err != nil { + return fmt.Errorf("qwp/sf: truncate %s to %d bytes: %w", f.Name(), target, err) + } + return nil +} diff --git a/qwp_sf_allocate_test.go b/qwp_sf_allocate_test.go new file mode 100644 index 00000000..969852c0 --- /dev/null +++ b/qwp_sf_allocate_test.go @@ -0,0 +1,155 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "fmt" + "os" + "path/filepath" + "syscall" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestQwpSfAllocateNeverShrinks pins the cross-platform contract +// documented on qwpSfAllocate: never shrinks, short-circuits on +// size <= currentSize, extends on size > currentSize. 
Mirrors the +// Java client's testAllocateNeverShrinks (FilesTest) so the two +// implementations stay in lockstep. +func TestQwpSfAllocateNeverShrinks(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "allocate-shrink.bin") + f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0o644) + require.NoError(t, err) + defer func() { _ = f.Close() }() + + requireSize := func(want int64) { + t.Helper() + st, err := f.Stat() + require.NoError(t, err) + assert.Equal(t, want, st.Size()) + } + + // Grow to 64 KiB. + require.NoError(t, qwpSfAllocate(f, 64*1024)) + requireSize(64 * 1024) + + // Smaller request: must not shrink the file. + require.NoError(t, qwpSfAllocate(f, 4096)) + requireSize(64 * 1024) + + // Equal request: no-op success, size unchanged. + require.NoError(t, qwpSfAllocate(f, 64*1024)) + requireSize(64 * 1024) + + // Larger request: extends to the new target. + require.NoError(t, qwpSfAllocate(f, 128*1024)) + requireSize(128 * 1024) +} + +// TestQwpSfAllocateZeroOnFreshFile exercises the no-op short-circuit +// on a brand-new (size=0) file — no reservation syscall should reach +// the kernel, the file stays at size 0. +func TestQwpSfAllocateZeroOnFreshFile(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "allocate-zero.bin") + f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0o644) + require.NoError(t, err) + defer func() { _ = f.Close() }() + + require.NoError(t, qwpSfAllocate(f, 0)) + st, err := f.Stat() + require.NoError(t, err) + assert.Equal(t, int64(0), st.Size()) +} + +// withInjectedReserveFailure swaps the block-reservation primitive for +// one that always fails with a wrapped ENOSPC, and restores the original +// on cleanup. Lets the durability-layer ENOSPC tests run without having +// to actually fill a filesystem. Tests run sequentially within a package +// so the package-level swap is race-free. 
+func withInjectedReserveFailure(t *testing.T) { + t.Helper() + orig := qwpSfReserveNewBlocksFn + t.Cleanup(func() { qwpSfReserveNewBlocksFn = orig }) + qwpSfReserveNewBlocksFn = func(_ *os.File, _, _ int64) error { + return fmt.Errorf("qwp/sf: fallocate fault-injected: %w", syscall.ENOSPC) + } +} + +// TestQwpSfAllocateSurfacesReserveFailure pins item 3 of qwpSfAllocate's +// cross-platform contract: a real reservation failure (ENOSPC, EFBIG, +// EIO) surfaces as an error and the file is NOT extended. There is no +// silent sparse fallback for those errnos — that path is reserved for +// "filesystem cannot reserve" (EOPNOTSUPP/EINVAL), which the platform +// helper absorbs internally. A sparse extension here would defer ENOSPC +// to an mmap-store SIGBUS that tears down the whole process. +func TestQwpSfAllocateSurfacesReserveFailure(t *testing.T) { + withInjectedReserveFailure(t) + + dir := t.TempDir() + path := filepath.Join(dir, "enospc.bin") + f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0o644) + require.NoError(t, err) + defer func() { _ = f.Close() }() + + err = qwpSfAllocate(f, 64*1024) + require.Error(t, err, "reserve failure must surface, not silently fall back to sparse") + assert.ErrorIs(t, err, syscall.ENOSPC) + + // The post-reserve ftruncate that advances EOF is only reached on + // reservation success, so a failed reservation must leave the file at + // its pre-call size (0). That is exactly what prevents a + // logically-sized-but-sparse mapping. 
+ st, statErr := f.Stat() + require.NoError(t, statErr) + assert.Equal(t, int64(0), st.Size(), + "a failed reservation must not extend the file (no sparse mapping)") +} + +// TestQwpSfCreateSegmentRemovesPartialFileOnReserveFailure pins the +// create-path cleanup contract: when pre-allocation fails (ENOSPC), +// qwpSfCreateSegment returns the error AND unlinks the partially-created +// file, so a sustained disk-full burst with the segment manager polling +// does not litter the slot directory with full-size empty .sfa files. +// Mirrors the Java MmapSegment.create() ff.remove() on allocate failure. +func TestQwpSfCreateSegmentRemovesPartialFileOnReserveFailure(t *testing.T) { + withInjectedReserveFailure(t) + + dir := t.TempDir() + path := filepath.Join(dir, "sf-initial.sfa") + + seg, err := qwpSfCreateSegment(path, 0, 256*1024) + require.Error(t, err, "create must fail when pre-allocation fails") + assert.Nil(t, seg) + assert.ErrorIs(t, err, syscall.ENOSPC) + + _, statErr := os.Stat(path) + assert.Truef(t, os.IsNotExist(statErr), + "the partially-created segment file must be unlinked on pre-allocation "+ + "failure; stat err = %v", statErr) +} diff --git a/qwp_sf_classify.go b/qwp_sf_classify.go new file mode 100644 index 00000000..7df4ed03 --- /dev/null +++ b/qwp_sf_classify.go @@ -0,0 +1,175 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "log" + + "github.com/coder/websocket" +) + +// qwpSfClassify maps a QWP server response status byte to a Category. +// Wire codes are 1:1 with the categories the server distinguishes; +// unknown bytes fall through to CategoryUnknown so the client never +// silently drops a rejection it cannot interpret. +// +// Mirror of Java CursorWebSocketSendLoop.classify (always keep these +// two in sync — categories are part of the public cross-language +// surface). +func qwpSfClassify(status QwpStatusCode) Category { + switch status { + case QwpStatusSchemaMismatch: + return CategorySchemaMismatch + case QwpStatusParseError: + return CategoryParseError + case QwpStatusInternalError: + return CategoryInternalError + case QwpStatusSecurityError: + return CategorySecurityError + case QwpStatusWriteError: + return CategoryWriteError + default: + return CategoryUnknown + } +} + +// qwpSfDefaultPolicyFor is the spec default Policy for each Category, +// used when the user has not overridden the slot via builder option or +// connect-string. CategoryProtocolViolation and CategoryUnknown are +// forced HALT and the resolver enforces this independent of user +// overrides. +// +// Reasoning per the spec § "Default category → policy": +// - SchemaMismatch / WriteError → DropAndContinue: replay reproduces +// the same rejection; halting blocks unrelated tables on the same +// connection. 
+// - ParseError → Halt: almost certainly a client bug; halt preserves +// the on-disk frames for postmortem. +// - InternalError / SecurityError → Halt: catch-all server fault or +// misconfig; loud failure wanted, no retryable bit available. +// - ProtocolViolation / Unknown → Halt: connection is gone or the +// status byte is not one we can interpret — never silently drop. +func qwpSfDefaultPolicyFor(c Category) Policy { + switch c { + case CategorySchemaMismatch, CategoryWriteError: + return PolicyDropAndContinue + case CategoryParseError, CategoryInternalError, CategorySecurityError: + return PolicyHalt + case CategoryProtocolViolation, CategoryUnknown: + return PolicyHalt + default: + return PolicyHalt + } +} + +// qwpSfIsTerminalCloseCode reports whether a WebSocket close code is +// terminal — replaying the same bytes will produce the same close, so +// reconnect cannot fix it. Translates to CategoryProtocolViolation. +// +// Reserved codes 1004/1005/1006/1015 are deliberately not classified +// terminal: when they arrive in practice they signal abnormal +// disconnect rather than the server's reasoned rejection of payload +// bytes, so reconnect is the right reaction. +// +// Mirror of Java CursorWebSocketSendLoop.isTerminalCloseCode. +func qwpSfIsTerminalCloseCode(code websocket.StatusCode) bool { + switch code { + case websocket.StatusProtocolError, + websocket.StatusUnsupportedData, + websocket.StatusInvalidFramePayloadData, + websocket.StatusPolicyViolation, + websocket.StatusMessageTooBig, + websocket.StatusMandatoryExtension: + return true + default: + return false + } +} + +// qwpSfPolicyResolver composes the precedence chain for resolving a +// Category to a concrete Policy: +// +// 1. resolver (programmatic, full control via WithErrorPolicyResolver) +// 2. perCat[c] (builder WithErrorPolicy or connect-string on_*_error) +// 3. global (connect-string on_server_error) +// 4. 
spec default (qwpSfDefaultPolicyFor) +// +// CategoryProtocolViolation and CategoryUnknown bypass user overrides +// and always resolve to PolicyHalt — silently ignoring user-set +// non-Halt slots for those two categories. +type qwpSfPolicyResolver struct { + resolver func(Category) Policy + perCat [numCategories]Policy + global Policy +} + +// callResolver invokes the user-supplied resolver under a panic guard. +// The resolver runs on the receiver goroutine, so a panicking +// WithErrorPolicyResolver callback would otherwise crash the host. A +// panic is treated as a user bug: recover, log, and fall back to the +// spec default for the category. That default is always a concrete +// Halt / DropAndContinue (never PolicyAuto), so resolve's +// `!= PolicyAuto` check short-circuits the rest of the precedence chain +// — a broken resolver yields the safe spec policy rather than silently +// deferring to lower-precedence slots. A clean return is propagated +// verbatim, including PolicyAuto, which lets resolve fall through. +// +// Mirrors the handler panic guard in qwpSfErrorDispatcher.deliver. +func (r *qwpSfPolicyResolver) callResolver(c Category) (pol Policy) { + defer func() { + if rec := recover(); rec != nil { + log.Printf("[ERROR] qwp/sf: error policy resolver panicked on category %s: %v", c, rec) + pol = qwpSfDefaultPolicyFor(c) + } + }() + return r.resolver(c) +} + +// resolve returns the Policy to apply for the given Category. +// PolicyAuto is never returned — every category resolves to a concrete +// Halt or DropAndContinue choice. +func (r *qwpSfPolicyResolver) resolve(c Category) Policy { + // Forced HALT for unknown / protocol-violation regardless of user + // configuration — silence forbidden, no DROP for the unintelligible. 
+ if c == CategoryProtocolViolation || c == CategoryUnknown { + return PolicyHalt + } + if r != nil { + if r.resolver != nil { + if p := r.callResolver(c); p != PolicyAuto { + return p + } + } + if int(c) < len(r.perCat) { + if p := r.perCat[c]; p != PolicyAuto { + return p + } + } + if r.global != PolicyAuto { + return r.global + } + } + return qwpSfDefaultPolicyFor(c) +} diff --git a/qwp_sf_classify_test.go b/qwp_sf_classify_test.go new file mode 100644 index 00000000..6e2274cc --- /dev/null +++ b/qwp_sf_classify_test.go @@ -0,0 +1,204 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ******************************************************************************/ + +package questdb + +import ( + "testing" + + "github.com/coder/websocket" +) + +func TestQwpSfClassify(t *testing.T) { + tests := []struct { + status QwpStatusCode + want Category + }{ + {QwpStatusSchemaMismatch, CategorySchemaMismatch}, + {QwpStatusParseError, CategoryParseError}, + {QwpStatusInternalError, CategoryInternalError}, + {QwpStatusSecurityError, CategorySecurityError}, + {QwpStatusWriteError, CategoryWriteError}, + // OK / DurableAck never reach classify in production but they + // fall through to Unknown defensively. + {QwpStatusOK, CategoryUnknown}, + {QwpStatusDurableAck, CategoryUnknown}, + // Forward-compat: server adds a new status byte we do not + // understand. + {QwpStatusCode(0xFE), CategoryUnknown}, + } + for _, tc := range tests { + if got := qwpSfClassify(tc.status); got != tc.want { + t.Errorf("qwpSfClassify(0x%02X) = %s, want %s", + byte(tc.status), got, tc.want) + } + } +} + +func TestQwpSfDefaultPolicyFor(t *testing.T) { + tests := []struct { + c Category + want Policy + }{ + {CategorySchemaMismatch, PolicyDropAndContinue}, + {CategoryWriteError, PolicyDropAndContinue}, + {CategoryParseError, PolicyHalt}, + {CategoryInternalError, PolicyHalt}, + {CategorySecurityError, PolicyHalt}, + {CategoryProtocolViolation, PolicyHalt}, + {CategoryUnknown, PolicyHalt}, + } + for _, tc := range tests { + if got := qwpSfDefaultPolicyFor(tc.c); got != tc.want { + t.Errorf("qwpSfDefaultPolicyFor(%s) = %s, want %s", + tc.c, got, tc.want) + } + } +} + +func TestQwpSfIsTerminalCloseCode(t *testing.T) { + tests := []struct { + code websocket.StatusCode + want bool + name string + }{ + {websocket.StatusProtocolError, true, "PROTOCOL_ERROR"}, + {websocket.StatusUnsupportedData, true, "UNSUPPORTED_DATA"}, + {websocket.StatusInvalidFramePayloadData, true, "INVALID_PAYLOAD_DATA"}, + {websocket.StatusPolicyViolation, true, "POLICY_VIOLATION"}, + 
{websocket.StatusMessageTooBig, true, "MESSAGE_TOO_BIG"}, + {websocket.StatusMandatoryExtension, true, "MANDATORY_EXTENSION"}, + {websocket.StatusNormalClosure, false, "NormalClosure"}, + {websocket.StatusGoingAway, false, "GoingAway"}, + {websocket.StatusAbnormalClosure, false, "AbnormalClosure"}, + {websocket.StatusInternalError, false, "InternalError"}, + {websocket.StatusCode(-1), false, "non-CloseError sentinel"}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + if got := qwpSfIsTerminalCloseCode(tc.code); got != tc.want { + t.Errorf("qwpSfIsTerminalCloseCode(%d) = %v, want %v", + tc.code, got, tc.want) + } + }) + } +} + +// TestQwpSfPolicyResolverPrecedence checks the four-layer override +// stack: programmatic resolver > per-category map > global default > +// spec default. ProtocolViolation and Unknown ignore overrides. +func TestQwpSfPolicyResolverPrecedence(t *testing.T) { + t.Run("nil resolver falls through to spec defaults", func(t *testing.T) { + var r *qwpSfPolicyResolver + if got := r.resolve(CategorySchemaMismatch); got != PolicyDropAndContinue { + t.Errorf("nil resolver SchemaMismatch = %s, want DropAndContinue", got) + } + }) + + t.Run("zero resolver falls through to spec defaults", func(t *testing.T) { + r := &qwpSfPolicyResolver{} + if got := r.resolve(CategoryParseError); got != PolicyHalt { + t.Errorf("zero resolver ParseError = %s, want Halt", got) + } + }) + + t.Run("global override beats spec default", func(t *testing.T) { + r := &qwpSfPolicyResolver{global: PolicyHalt} + if got := r.resolve(CategorySchemaMismatch); got != PolicyHalt { + t.Errorf("global=Halt SchemaMismatch = %s, want Halt", got) + } + }) + + t.Run("per-category beats global", func(t *testing.T) { + r := &qwpSfPolicyResolver{global: PolicyHalt} + r.perCat[CategorySchemaMismatch] = PolicyDropAndContinue + if got := r.resolve(CategorySchemaMismatch); got != PolicyDropAndContinue { + t.Errorf("per-cat beats global = %s, want DropAndContinue", got) + } + 
}) + + t.Run("programmatic resolver beats per-category", func(t *testing.T) { + r := &qwpSfPolicyResolver{} + r.perCat[CategoryParseError] = PolicyDropAndContinue + r.resolver = func(c Category) Policy { + if c == CategoryParseError { + return PolicyHalt + } + return PolicyAuto + } + if got := r.resolve(CategoryParseError); got != PolicyHalt { + t.Errorf("programmatic beats per-cat = %s, want Halt", got) + } + }) + + t.Run("programmatic resolver returning Auto falls through", func(t *testing.T) { + r := &qwpSfPolicyResolver{} + r.perCat[CategoryWriteError] = PolicyHalt + r.resolver = func(Category) Policy { return PolicyAuto } + if got := r.resolve(CategoryWriteError); got != PolicyHalt { + t.Errorf("programmatic Auto + per-cat=Halt = %s, want Halt", got) + } + }) + + t.Run("panicking resolver falls back to spec default", func(t *testing.T) { + // A per-category override is set, but the panic short-circuits + // to the spec default rather than falling through to it: a + // broken resolver must not silently defer to lower-precedence + // slots. SchemaMismatch's spec default is DropAndContinue. + r := &qwpSfPolicyResolver{} + r.perCat[CategorySchemaMismatch] = PolicyHalt + r.resolver = func(Category) Policy { panic("boom") } + if got := r.resolve(CategorySchemaMismatch); got != PolicyDropAndContinue { + t.Errorf("panicking resolver SchemaMismatch = %s, want DropAndContinue (spec default)", got) + } + }) + + t.Run("panicking resolver does not crash the caller", func(t *testing.T) { + // The receiver goroutine invokes resolve directly; a panic that + // escapes would take down the host process. ParseError's spec + // default is Halt. 
+ r := &qwpSfPolicyResolver{} + r.resolver = func(Category) Policy { panic("boom") } + if got := r.resolve(CategoryParseError); got != PolicyHalt { + t.Errorf("panicking resolver ParseError = %s, want Halt (spec default)", got) + } + }) + + t.Run("ProtocolViolation forced Halt regardless", func(t *testing.T) { + r := &qwpSfPolicyResolver{global: PolicyDropAndContinue} + r.perCat[CategoryProtocolViolation] = PolicyDropAndContinue + r.resolver = func(Category) Policy { return PolicyDropAndContinue } + if got := r.resolve(CategoryProtocolViolation); got != PolicyHalt { + t.Errorf("ProtocolViolation = %s, want Halt (forced)", got) + } + }) + + t.Run("Unknown forced Halt regardless", func(t *testing.T) { + r := &qwpSfPolicyResolver{global: PolicyDropAndContinue} + r.perCat[CategoryUnknown] = PolicyDropAndContinue + if got := r.resolve(CategoryUnknown); got != PolicyHalt { + t.Errorf("Unknown = %s, want Halt (forced)", got) + } + }) +} diff --git a/qwp_sf_close_frame_test.go b/qwp_sf_close_frame_test.go new file mode 100644 index 00000000..192785ae --- /dev/null +++ b/qwp_sf_close_frame_test.go @@ -0,0 +1,265 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "context" + "errors" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "github.com/coder/websocket" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// closeFrameTestServer accepts the WS upgrade, reads one frame, then +// closes the connection with the configured terminal close code. +func closeFrameTestServer(t *testing.T, code websocket.StatusCode, reason string) *httptest.Server { + t.Helper() + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set(qwpHeaderVersion, "1") + conn, err := websocket.Accept(w, r, nil) + if err != nil { + return + } + _, _, _ = conn.Read(context.Background()) + _ = conn.Close(code, reason) + })) +} + +// closeAfterNFramesServer accepts the WS upgrade, reads exactly n +// frames (never ACKing any), then closes with the given terminal +// code. Consuming every frame the producer sends before closing +// keeps senderLoop from producing a write error that would race the +// receiver's close-frame error in runOneConnection's first-error +// aggregation — so the resulting terminal SenderError is always the +// close-code one, with a deterministic [ackedFsn+1, publishedFsn] +// FSN span. 
+func closeAfterNFramesServer(t *testing.T, n int, code websocket.StatusCode, reason string) *httptest.Server { + t.Helper() + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set(qwpHeaderVersion, "1") + conn, err := websocket.Accept(w, r, nil) + if err != nil { + return + } + for i := 0; i < n; i++ { + if _, _, err := conn.Read(context.Background()); err != nil { + return + } + } + _ = conn.Close(code, reason) + })) +} + +// TestQwpSfTerminalCloseCodeProducesProtocolViolation drives the send +// loop against a server that closes with each terminal code; asserts +// the loop produces a CategoryProtocolViolation+Halt SenderError and +// does not enter reconnect. +func TestQwpSfTerminalCloseCodeProducesProtocolViolation(t *testing.T) { + codes := []struct { + code websocket.StatusCode + reason string + }{ + {websocket.StatusProtocolError, "bad framing"}, + {websocket.StatusUnsupportedData, "frame type unsupported"}, + {websocket.StatusInvalidFramePayloadData, "bad payload"}, + {websocket.StatusPolicyViolation, "policy reject"}, + {websocket.StatusMessageTooBig, "frame oversized"}, + {websocket.StatusMandatoryExtension, "extension required"}, + } + for _, c := range codes { + t.Run(c.code.String(), func(t *testing.T) { + httpSrv := closeFrameTestServer(t, c.code, c.reason) + defer httpSrv.Close() + + engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + defer func() { _ = engine.engineClose() }() + + factory := qwpSfDialAt(httpSrv.URL) + transport, err := factory(context.Background(), 0) + require.NoError(t, err) + + loop := qwpSfNewSendLoop(engine, transport, factory, + 100*time.Microsecond, time.Second, 10*time.Millisecond, 100*time.Millisecond) + loop.sendLoopStart() + defer func() { _ = loop.sendLoopClose() }() + + _, err = engine.engineAppendBlocking(context.Background(), []byte("frame")) + require.NoError(t, err) + + require.Eventually(t, func() bool 
{ + return loop.sendLoopCheckError() != nil + }, 3*time.Second, 1*time.Millisecond, + "loop did not record terminal error for close code %d", c.code) + + gotErr := loop.sendLoopCheckError() + var senderErr *SenderError + require.True(t, errors.As(gotErr, &senderErr), + "expected *SenderError, got %T: %v", gotErr, gotErr) + assert.Equal(t, CategoryProtocolViolation, senderErr.Category) + assert.Equal(t, PolicyHalt, senderErr.AppliedPolicy) + assert.Equal(t, NoStatusByte, senderErr.ServerStatusByte) + assert.Contains(t, senderErr.ServerMessage, "ws-close[") + + // The loop did not enter reconnect — the close code is + // terminal. Reconnect counter stays at zero. + assert.Equal(t, int64(0), loop.sendLoopTotalReconnects()) + }) + } +} + +// TestQwpSfTerminalCloseMultiFrameFsnSpan pins the non-degenerate +// SenderError FSN span. Every other terminal-path test publishes a +// single unacked frame, so FromFsn == ToFsn and the span is never +// actually exercised. Here several frames are published and none are +// ACKed when a terminal close arrives, so qwpSfBuildProtocolViolationSE +// must report [FromFsn, ToFsn] = [ackedFsn+1, publishedFsn] with +// FromFsn strictly < ToFsn — the multi-frame correlation window that +// dead-lettering and AwaitAckedFsn callers rely on. +func TestQwpSfTerminalCloseMultiFrameFsnSpan(t *testing.T) { + const nFrames = 4 + httpSrv := closeAfterNFramesServer(t, nFrames, + websocket.StatusProtocolError, "bad framing") + defer httpSrv.Close() + + engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + defer func() { _ = engine.engineClose() }() + + // Publish every frame BEFORE the loop starts: publishedFsn is then + // a stable nFrames-1 by the time the close-frame SE is built, and + // the server reads exactly the nFrames the producer will send. 
+ for i := 0; i < nFrames; i++ { + _, err := engine.engineAppendBlocking(context.Background(), []byte{byte(i)}) + require.NoError(t, err) + } + require.Equal(t, int64(nFrames-1), engine.enginePublishedFsn()) + require.Equal(t, int64(-1), engine.engineAckedFsn(), + "precondition: nothing ACKed, so FromFsn must come out as 0") + + factory := qwpSfDialAt(httpSrv.URL) + transport, err := factory(context.Background(), 0) + require.NoError(t, err) + + loop := qwpSfNewSendLoop(engine, transport, factory, + 100*time.Microsecond, time.Second, 10*time.Millisecond, 100*time.Millisecond) + loop.sendLoopStart() + defer func() { _ = loop.sendLoopClose() }() + + require.Eventually(t, func() bool { + return loop.sendLoopCheckError() != nil + }, 3*time.Second, 1*time.Millisecond, + "loop did not record the terminal close error") + + var se *SenderError + require.True(t, errors.As(loop.sendLoopCheckError(), &se), + "expected *SenderError, got %v", loop.sendLoopCheckError()) + assert.Equal(t, CategoryProtocolViolation, se.Category) + assert.Equal(t, PolicyHalt, se.AppliedPolicy) + assert.Contains(t, se.ServerMessage, "ws-close[") + // The point of the test: a real multi-frame span. + assert.Equal(t, int64(0), se.FromFsn, + "FromFsn = ackedFsn+1 = 0 (nothing ACKed)") + assert.Equal(t, int64(nFrames-1), se.ToFsn, + "ToFsn = publishedFsn = nFrames-1") + assert.Less(t, se.FromFsn, se.ToFsn, + "multi-frame span: FromFsn must be strictly < ToFsn (not the "+ + "degenerate single-frame FromFsn == ToFsn case)") + assert.Equal(t, int64(0), loop.sendLoopTotalReconnects(), + "terminal close must not trigger reconnect") +} + +// Non-terminal close-code reconnect is already covered by +// TestQwpSfSendLoopReconnectAfterServerClose at qwp_sf_send_loop_test.go; +// no need to duplicate here. The point of this file is the new +// terminal-close-code path. 
+ +// runUpgradeFailureScenario drives the send loop against an +// initially-working server that ACKs frame 1 and drops on frame 2, +// with reconnect pointing at a server that rejects the upgrade with +// the given HTTP status. Returns the latched terminal SenderError. +func runUpgradeFailureScenario(t *testing.T, upgradeStatus int) *SenderError { + t.Helper() + failSrv := newQwpSfTestServer(t, qwpSfTestServerOpts{upgradeStatus: upgradeStatus}) + t.Cleanup(failSrv.Close) + + // Data server ACKs the first frame and closes on the second: + // frame 1 advances totalAcks, so the silent-drop guard (which + // is gated on totalAcks == 0) won't fire when the connection + // breaks. + dataSrv := newQwpSfTestServer(t, qwpSfTestServerOpts{closeAfterFrames: 2}) + t.Cleanup(dataSrv.Close) + + engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + t.Cleanup(func() { _ = engine.engineClose() }) + + transport, err := qwpSfDialFor(dataSrv)(context.Background(), 0) + require.NoError(t, err) + + loop := qwpSfNewSendLoop(engine, transport, qwpSfDialFor(failSrv), + 100*time.Microsecond, 200*time.Millisecond, 10*time.Millisecond, 50*time.Millisecond) + loop.sendLoopStart() + t.Cleanup(func() { _ = loop.sendLoopClose() }) + + for i := 0; i < 2; i++ { + _, err := engine.engineAppendBlocking(context.Background(), []byte{byte(i)}) + require.NoError(t, err) + } + + require.Eventually(t, func() bool { + return loop.sendLoopLastTerminalServerError() != nil + }, 3*time.Second, 1*time.Millisecond, + "loop did not record terminal SenderError for upgrade %d", upgradeStatus) + + se := loop.sendLoopLastTerminalServerError() + require.NotNil(t, se) + return se +} + +// TestQwpSfAuthFailureProducesSecurityError: 401 (auth) → +// CategorySecurityError. 
+func TestQwpSfAuthFailureProducesSecurityError(t *testing.T) { + se := runUpgradeFailureScenario(t, 401) + assert.Equal(t, CategorySecurityError, se.Category) + assert.Equal(t, PolicyHalt, se.AppliedPolicy) + assert.Equal(t, NoStatusByte, se.ServerStatusByte) + assert.True(t, strings.Contains(se.ServerMessage, "ws-upgrade-failed"), + "expected ws-upgrade-failed in message, got %q", se.ServerMessage) +} + +// TestQwpSfProtocolUpgradeFailureProducesProtocolViolation: 426 +// (Upgrade Required) → CategoryProtocolViolation, not SecurityError. +func TestQwpSfProtocolUpgradeFailureProducesProtocolViolation(t *testing.T) { + se := runUpgradeFailureScenario(t, 426) + assert.Equal(t, CategoryProtocolViolation, se.Category) + assert.Equal(t, PolicyHalt, se.AppliedPolicy) +} diff --git a/qwp_sf_conf_test.go b/qwp_sf_conf_test.go new file mode 100644 index 00000000..f086390e --- /dev/null +++ b/qwp_sf_conf_test.go @@ -0,0 +1,750 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ******************************************************************************/ + +package questdb + +import ( + "context" + "fmt" + "net" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestSfConfParseAcceptsAllKnobs(t *testing.T) { + conf, err := confFromStr(strings.Join([]string{ + "ws::addr=localhost:9000", + "sf_dir=/tmp/sf", + "sender_id=my-sender", + "sf_max_bytes=8388608", + "sf_max_total_bytes=21474836480", + "sf_durability=memory", + "sf_append_deadline_millis=20000", + "reconnect_max_duration_millis=120000", + "reconnect_initial_backoff_millis=200", + "reconnect_max_backoff_millis=10000", + "initial_connect_retry=on", + "close_flush_timeout_millis=2500", + "drain_orphans=on", + "max_background_drainers=2", + "request_durable_ack=off", + "durable_ack_keepalive_interval_millis=200;", + }, ";")) + require.NoError(t, err) + assert.Equal(t, "/tmp/sf", conf.sfDir) + assert.Equal(t, "my-sender", conf.senderId) + assert.Equal(t, int64(8388608), conf.sfMaxBytes) + assert.Equal(t, int64(21474836480), conf.sfMaxTotalBytes) + assert.Equal(t, "memory", conf.sfDurability) + assert.Equal(t, 20000, conf.sfAppendDeadlineMillis) + assert.Equal(t, 120000, conf.reconnectMaxDurationMillis) + assert.Equal(t, 200, conf.reconnectInitialBackoffMillis) + assert.Equal(t, 10000, conf.reconnectMaxBackoffMillis) + assert.Equal(t, InitialConnectSync, conf.initialConnectMode) + assert.Equal(t, 2500, conf.closeFlushTimeoutMillis) + assert.True(t, conf.closeFlushTimeoutSet) + assert.True(t, conf.drainOrphans) + assert.Equal(t, 2, conf.maxBackgroundDrainers) +} + +func TestSfConfRejectsNonQwpSchema(t *testing.T) { + for _, schema := range []string{"http", "https", "tcp", "tcps"} { + t.Run(schema, func(t *testing.T) { + _, err := confFromStr(schema + "::addr=localhost:9000;sf_dir=/tmp/sf;") + require.Error(t, err) + assert.Contains(t, err.Error(), "QWP") + }) + } +} + +func 
TestSfConfRejectsBadSenderId(t *testing.T) { + _, err := confFromStr("ws::addr=localhost:9000;sf_dir=/tmp/sf;sender_id=bad/id;") + require.Error(t, err) + assert.Contains(t, err.Error(), "invalid character") +} + +func TestSfConfRejectsBadDurability(t *testing.T) { + _, err := confFromStr("ws::addr=localhost:9000;sf_dir=/tmp/sf;sf_durability=bogus;") + require.Error(t, err) + assert.Contains(t, err.Error(), "memory") +} + +func TestSfConfRejectsDeferredDurabilityModes(t *testing.T) { + for _, v := range []string{"flush", "append"} { + _, err := confFromStr("ws::addr=localhost:9000;sf_dir=/tmp/sf;sf_durability=" + v + ";") + require.Error(t, err) + assert.Contains(t, err.Error(), "deferred") + } +} + +// WithSfDurability is the functional-option analogue of the +// sf_durability connect-string key. The parser rejects flush/append +// and bogus values up front; the option path is a thin setter, so the +// equivalent gate lives in sanitizeQwpConf via the shared +// validateSfDurability helper. These tests pin that parity (SSOT for +// the value space) — see TestSfConfRejectsDeferredDurabilityModes / +// TestSfConfRejectsBadDurability for the connect-string side. 
+func TestSfDurabilityOptionRejectsDeferredModes(t *testing.T) { + for _, v := range []string{"flush", "append"} { + t.Run(v, func(t *testing.T) { + conf := newLineSenderConfig(qwpSenderType) + WithSfDir("/tmp/sf")(conf) + WithSfDurability(v)(conf) + err := sanitizeQwpConf(conf) + require.Error(t, err) + assert.Contains(t, err.Error(), "deferred") + }) + } +} + +func TestSfDurabilityOptionRejectsBogus(t *testing.T) { + conf := newLineSenderConfig(qwpSenderType) + WithSfDir("/tmp/sf")(conf) + WithSfDurability("bogus")(conf) + err := sanitizeQwpConf(conf) + require.Error(t, err) + assert.Contains(t, err.Error(), "memory") +} + +func TestSfDurabilityOptionMemoryAccepted(t *testing.T) { + conf := newLineSenderConfig(qwpSenderType) + WithSfDir("/tmp/sf")(conf) + WithSfDurability("memory")(conf) + require.NoError(t, sanitizeQwpConf(conf)) +} + +// WithSenderId is the functional-option analogue of the sender_id +// connect-string key. The parser rejects '.', '/', '\' and other +// out-of-charset bytes (TestSfConfRejectsBadSenderId pins that), but +// the option path used to assign the raw string straight to +// conf.senderId. The unsanitized value is then joined into the slot +// path under sfDir, so values like "../etc" would let a caller +// escape the sf_dir root. sanitizeQwpConf must apply the same charset +// gate the parser does — these tests pin parity. 
+func TestSenderIdOptionRejectsPathTraversal(t *testing.T) { + for _, id := range []string{"../etc", "..", "a/b", `a\b`, "foo.bar"} { + t.Run(id, func(t *testing.T) { + conf := newLineSenderConfig(qwpSenderType) + WithSfDir("/tmp/sf")(conf) + WithSenderId(id)(conf) + err := sanitizeQwpConf(conf) + require.Error(t, err) + assert.Contains(t, err.Error(), "sender_id") + }) + } +} + +func TestSenderIdOptionAcceptsValid(t *testing.T) { + for _, id := range []string{"default", "ingest-1", "slot_42", "ABCxyz"} { + t.Run(id, func(t *testing.T) { + conf := newLineSenderConfig(qwpSenderType) + WithSfDir("/tmp/sf")(conf) + WithSenderId(id)(conf) + require.NoError(t, sanitizeQwpConf(conf)) + }) + } +} + +// Durable-ack mode is a deferred opt-in feature, but sf-client.md §19 +// makes its connect-string keys normative: the parser MUST recognise +// request_durable_ack / durable_ack_keepalive_interval_millis so a +// user porting a Java connect string gets a clear deferred-feature +// message, not the generic "unsupported option". +func TestSfConfDurableAckOffParses(t *testing.T) { + for _, v := range []string{"off", "false"} { + t.Run(v, func(t *testing.T) { + _, err := confFromStr("ws::addr=localhost:9000;sf_dir=/tmp/sf;request_durable_ack=" + v + ";") + require.NoError(t, err) + }) + } +} + +func TestSfConfRejectsDurableAckOptIn(t *testing.T) { + for _, v := range []string{"on", "true"} { + t.Run(v, func(t *testing.T) { + _, err := confFromStr("ws::addr=localhost:9000;sf_dir=/tmp/sf;request_durable_ack=" + v + ";") + require.Error(t, err) + // Must name the feature and that it is deferred -- not the + // generic "unsupported option" the review flagged. 
+ assert.Contains(t, err.Error(), "not implemented") + assert.Contains(t, err.Error(), "deferred") + assert.NotContains(t, err.Error(), "unsupported option") + }) + } +} + +func TestSfConfRejectsBadDurableAckValue(t *testing.T) { + _, err := confFromStr("ws::addr=localhost:9000;sf_dir=/tmp/sf;request_durable_ack=maybe;") + require.Error(t, err) + for _, want := range []string{"on", "off", "true", "false"} { + assert.Contains(t, err.Error(), want) + } +} + +func TestSfConfRejectsDurableAckKeysOnNonQwp(t *testing.T) { + cases := []string{ + "request_durable_ack=off", + "durable_ack_keepalive_interval_millis=200", + } + for _, schema := range []string{"http", "tcp"} { + for _, c := range cases { + t.Run(schema+"/"+c, func(t *testing.T) { + _, err := confFromStr(schema + "::addr=localhost:9000;" + c + ";") + require.Error(t, err) + assert.Contains(t, err.Error(), "QWP") + }) + } + } +} + +func TestSfConfDurableAckKeepaliveParses(t *testing.T) { + // 0 and negative mean "disabled" per sf-client.md §4.3, so any + // int is in range; only a non-int is rejected. 
+ for _, v := range []string{"200", "0", "-1"} { + t.Run(v, func(t *testing.T) { + _, err := confFromStr("ws::addr=localhost:9000;sf_dir=/tmp/sf;durable_ack_keepalive_interval_millis=" + v + ";") + require.NoError(t, err) + }) + } + _, err := confFromStr("ws::addr=localhost:9000;sf_dir=/tmp/sf;durable_ack_keepalive_interval_millis=soon;") + require.Error(t, err) + assert.Contains(t, err.Error(), "int") +} + +func TestSfConfRejectsNegativeNumbers(t *testing.T) { + cases := []string{ + "sf_max_bytes=-1", + "sf_max_total_bytes=-1", + "sf_append_deadline_millis=0", + "reconnect_initial_backoff_millis=0", + "reconnect_max_backoff_millis=0", + "max_background_drainers=-1", + } + for _, c := range cases { + t.Run(c, func(t *testing.T) { + _, err := confFromStr("ws::addr=localhost:9000;sf_dir=/tmp/sf;" + c + ";") + require.Error(t, err) + }) + } +} + +// TestSfConfRejectsAutoFlushBytesAboveSfMaxBytes pins the sanitize-time +// validation: an explicitly-set auto_flush_bytes that exceeds an +// explicitly-set sf_max_bytes is rejected, because the byte trigger +// would let a batch grow until its encoded frame can no longer fit a +// single segment — an un-flushable pairing. The check is at sanitize, +// not parse, so it runs for both the connect-string and option paths. +func TestSfConfRejectsAutoFlushBytesAboveSfMaxBytes(t *testing.T) { + conf, err := confFromStr( + "ws::addr=localhost:9000;sf_dir=/tmp/sf;sf_max_bytes=1048576;auto_flush_bytes=2097152;") + require.NoError(t, err, "parser accepts both values; the contradiction is caught at sanitize") + require.True(t, conf.autoFlushBytesSet) + + err = sanitizeQwpConf(conf) + require.Error(t, err) + require.Contains(t, err.Error(), "auto_flush_bytes") + require.Contains(t, err.Error(), "sf_max_bytes") +} + +// TestSfConfRejectsAutoFlushBytesAboveSfMaxBytesViaOptions covers the +// functional-option set-site: WithAutoFlushBytes must record the +// explicit-set flag so the same sanitize guard fires. 
+func TestSfConfRejectsAutoFlushBytesAboveSfMaxBytesViaOptions(t *testing.T) { + conf := newLineSenderConfig(qwpSenderType) + for _, opt := range []LineSenderOption{ + WithAddress("localhost:9000"), + WithSfDir("/tmp/sf"), + WithSfMaxBytes(1 << 20), + WithAutoFlushBytes(2 << 20), + } { + opt(conf) + } + require.True(t, conf.autoFlushBytesSet) + + err := sanitizeQwpConf(conf) + require.Error(t, err) + require.Contains(t, err.Error(), "auto_flush_bytes") + require.Contains(t, err.Error(), "sf_max_bytes") +} + +// TestSfConfAcceptsDefaultedAutoFlushBytesOverSmallSegment is the +// no-footgun case: lowering sf_max_bytes while leaving auto_flush_bytes +// at its 8 MiB default is NOT a user-written contradiction, so sanitize +// must accept it — the runtime clamp lowers the effective trigger to +// fit the smaller segment. Rejecting here would force users to hand-tune +// auto_flush_bytes every time they shrink a segment. +func TestSfConfAcceptsDefaultedAutoFlushBytesOverSmallSegment(t *testing.T) { + conf, err := confFromStr("ws::addr=localhost:9000;sf_dir=/tmp/sf;sf_max_bytes=1048576;") + require.NoError(t, err) + require.False(t, conf.autoFlushBytesSet, "auto_flush_bytes left at default") + require.Equal(t, qwpDefaultAutoFlushBytes, conf.autoFlushBytes) + require.Greater(t, int64(conf.autoFlushBytes), conf.sfMaxBytes, + "precondition: the defaulted trigger exceeds the chosen segment") + + require.NoError(t, sanitizeQwpConf(conf), + "a defaulted trigger over a smaller segment is handled by the clamp, not rejected") +} + +// TestSfConfAcceptsAutoFlushBytesBelowSfMaxBytes pins that a valid +// explicit pairing (trigger at or below the segment) sanitizes cleanly. 
+func TestSfConfAcceptsAutoFlushBytesBelowSfMaxBytes(t *testing.T) { + conf, err := confFromStr( + "ws::addr=localhost:9000;sf_dir=/tmp/sf;sf_max_bytes=4194304;auto_flush_bytes=2097152;") + require.NoError(t, err) + require.NoError(t, sanitizeQwpConf(conf)) +} + +// TestSfConfInitialConnectRetryValues exercises every accepted spelling +// of `initial_connect_retry` (Java spec §4.2 / §13.4) and the rejected +// one. The legacy bool spellings (`on`/`true`/`off`/`false`) and the +// Java-aligned tri-state words (`sync`/`async`) must all parse; bogus +// values must be rejected with a message that names every accepted +// value so users know what to type. +func TestSfConfInitialConnectRetryValues(t *testing.T) { + cases := []struct { + raw string + want InitialConnectMode + }{ + {"on", InitialConnectSync}, + {"true", InitialConnectSync}, + {"sync", InitialConnectSync}, + {"off", InitialConnectOff}, + {"false", InitialConnectOff}, + {"async", InitialConnectAsync}, + } + for _, c := range cases { + t.Run(c.raw, func(t *testing.T) { + conf, err := confFromStr("ws::addr=localhost:9000;sf_dir=/tmp/sf;initial_connect_retry=" + c.raw + ";") + require.NoError(t, err) + assert.Equal(t, c.want, conf.initialConnectMode) + }) + } +} + +func TestSfConfInitialConnectRetryRejectsBogusValue(t *testing.T) { + _, err := confFromStr("ws::addr=localhost:9000;sf_dir=/tmp/sf;initial_connect_retry=maybe;") + require.Error(t, err) + // Error message must enumerate the accepted spellings so users + // porting from Java know `sync`/`async` are valid. 
+ for _, want := range []string{"sync", "async", "on", "off", "true", "false"} { + assert.Contains(t, err.Error(), want) + } +} + +// TestSfConfReconnectKeyPromotesInitialConnect pins the implicit +// promotion documented in the connect-string reference: if the user +// tuned any reconnect_* knob but did not pick an initial_connect_retry +// mode, sanitize promotes the mode to sync so the reconnect budget +// actually covers the *first* connect attempt. Mirrors Java's +// actualInitialConnectMode resolution in Sender.java. +// +// confFromStr alone returns the parser's raw view (mode stays unset); +// the promotion fires in sanitizeQwpConf. The assertions below +// exercise both layers so future refactors can't silently relocate the +// promotion to a layer the option-builder path bypasses. +func TestSfConfReconnectKeyPromotesInitialConnect(t *testing.T) { + cases := []string{ + "reconnect_max_duration_millis=120000", + "reconnect_initial_backoff_millis=200", + "reconnect_max_backoff_millis=10000", + } + for _, c := range cases { + t.Run(c, func(t *testing.T) { + conf, err := confFromStr("ws::addr=localhost:9000;sf_dir=/tmp/sf;" + c + ";") + require.NoError(t, err) + // Parser keeps the user's view raw: the mode is unset and + // the default-zero InitialConnectOff still reads. + assert.False(t, conf.initialConnectModeSet) + assert.Equal(t, InitialConnectOff, conf.initialConnectMode) + // Sanitize promotes when no explicit mode was chosen. + require.NoError(t, sanitizeQwpConf(conf)) + assert.Equal(t, InitialConnectSync, conf.initialConnectMode) + }) + } +} + +// Explicit initial_connect_retry=off paired with a tuned reconnect +// budget is a documented escape hatch: fail-fast on startup misconfig +// while still accepting a generous post-connect outage budget. The +// explicit choice must win over the promotion. 
+func TestSfConfInitialConnectRetryOffOverridesPromotion(t *testing.T) { + conf, err := confFromStr( + "ws::addr=localhost:9000;sf_dir=/tmp/sf;" + + "reconnect_max_duration_millis=120000;" + + "initial_connect_retry=off;") + require.NoError(t, err) + require.NoError(t, sanitizeQwpConf(conf)) + assert.Equal(t, InitialConnectOff, conf.initialConnectMode) +} + +// initial_connect_retry=async paired with a tuned reconnect budget +// also wins over the promotion — the explicit choice is preserved +// verbatim, not silently coerced to sync. +func TestSfConfInitialConnectRetryAsyncSurvivesPromotion(t *testing.T) { + conf, err := confFromStr( + "ws::addr=localhost:9000;sf_dir=/tmp/sf;" + + "reconnect_max_duration_millis=120000;" + + "initial_connect_retry=async;") + require.NoError(t, err) + require.NoError(t, sanitizeQwpConf(conf)) + assert.Equal(t, InitialConnectAsync, conf.initialConnectMode) +} + +// No reconnect_* knob set → no promotion. Defends against the +// promotion logic firing on the QWP defaults (which seed the +// reconnect fields lazily in the send loop, not at parse time). +func TestSfConfNoReconnectKeyNoPromotion(t *testing.T) { + conf, err := confFromStr("ws::addr=localhost:9000;sf_dir=/tmp/sf;") + require.NoError(t, err) + require.NoError(t, sanitizeQwpConf(conf)) + assert.Equal(t, InitialConnectOff, conf.initialConnectMode) +} + +// Functional-option parity for the promotion. WithReconnectPolicy on +// its own must promote to sync; an explicit WithInitialConnectRetry +// (or WithInitialConnectMode) must win over it. This is the option +// path the Go builder API exposes, separate from the connect string. 
+func TestSfOptionsWithReconnectPolicyPromotes(t *testing.T) { + conf := newLineSenderConfig(qwpSenderType) + WithSfDir("/tmp/sf")(conf) + WithReconnectPolicy(2*time.Minute, 100*time.Millisecond, 5*time.Second)(conf) + require.NoError(t, sanitizeQwpConf(conf)) + assert.Equal(t, InitialConnectSync, conf.initialConnectMode) +} + +func TestSfOptionsWithInitialConnectRetryOffOverridesPromotion(t *testing.T) { + conf := newLineSenderConfig(qwpSenderType) + WithSfDir("/tmp/sf")(conf) + WithReconnectPolicy(2*time.Minute, 100*time.Millisecond, 5*time.Second)(conf) + WithInitialConnectRetry(false)(conf) + require.NoError(t, sanitizeQwpConf(conf)) + assert.Equal(t, InitialConnectOff, conf.initialConnectMode) +} + +func TestSfOptionsWithInitialConnectModeAsyncSurvivesPromotion(t *testing.T) { + conf := newLineSenderConfig(qwpSenderType) + WithSfDir("/tmp/sf")(conf) + WithReconnectPolicy(2*time.Minute, 100*time.Millisecond, 5*time.Second)(conf) + WithInitialConnectMode(InitialConnectAsync)(conf) + require.NoError(t, sanitizeQwpConf(conf)) + assert.Equal(t, InitialConnectAsync, conf.initialConnectMode) +} + +// WithReconnectPolicy with non-positive durations must be a no-op for +// the corresponding *Set flags, so it does not register as an explicit +// reconnect tune and does not trigger the initial_connect_retry +// promotion. Zero / negative values fall back to the defaults at +// consumption time (qwp_sender_cursor.go), so the same applies here. 
+func TestSfOptionsWithReconnectPolicyZeroDoesNotPromote(t *testing.T) { + conf := newLineSenderConfig(qwpSenderType) + WithSfDir("/tmp/sf")(conf) + WithReconnectPolicy(0, 0, 0)(conf) + assert.False(t, conf.reconnectMaxDurationMillisSet) + assert.False(t, conf.reconnectInitialBackoffMillisSet) + assert.False(t, conf.reconnectMaxBackoffMillisSet) + require.NoError(t, sanitizeQwpConf(conf)) + assert.Equal(t, InitialConnectOff, conf.initialConnectMode) +} + +// Per-knob: only the positive arguments register as explicit user +// choices; the rest stay unset and continue to draw the default. +func TestSfOptionsWithReconnectPolicyMixedZeroOnlySetsPositive(t *testing.T) { + conf := newLineSenderConfig(qwpSenderType) + WithSfDir("/tmp/sf")(conf) + WithReconnectPolicy(0, 250*time.Millisecond, 0)(conf) + assert.False(t, conf.reconnectMaxDurationMillisSet) + assert.True(t, conf.reconnectInitialBackoffMillisSet) + assert.Equal(t, 250, conf.reconnectInitialBackoffMillis) + assert.False(t, conf.reconnectMaxBackoffMillisSet) + // One positive knob is enough to register as an explicit reconnect + // tune, so the promotion still fires here. 
+ require.NoError(t, sanitizeQwpConf(conf)) + assert.Equal(t, InitialConnectSync, conf.initialConnectMode) +} + +func TestSanitizeQwpConfRejectsSfKeysWithoutSfDir(t *testing.T) { + cases := []func(c *lineSenderConfig){ + func(c *lineSenderConfig) { c.senderId = "x" }, + func(c *lineSenderConfig) { c.sfMaxBytes = 1 << 20 }, + func(c *lineSenderConfig) { c.sfMaxTotalBytes = 1 << 30 }, + func(c *lineSenderConfig) { c.sfDurability = "memory" }, + func(c *lineSenderConfig) { c.sfAppendDeadlineMillis = 5000 }, + func(c *lineSenderConfig) { c.drainOrphans = true }, + func(c *lineSenderConfig) { c.maxBackgroundDrainers = 4 }, + } + for i, mut := range cases { + t.Run(fmt.Sprintf("case-%d", i), func(t *testing.T) { + conf := newLineSenderConfig(qwpSenderType) + conf.address = "localhost:9000" + mut(conf) + err := sanitizeQwpConf(conf) + require.Error(t, err) + assert.Contains(t, err.Error(), "sf_dir") + }) + } +} + +func TestSanitizeQwpConfRejectsTotalLessThanSegment(t *testing.T) { + conf := newLineSenderConfig(qwpSenderType) + conf.address = "localhost:9000" + conf.sfDir = "/tmp/sf" + conf.sfMaxBytes = 1 << 20 + conf.sfMaxTotalBytes = 1 << 18 + err := sanitizeQwpConf(conf) + require.Error(t, err) + assert.Contains(t, err.Error(), "sf_max_total_bytes") +} + +// TestSfConfEndToEnd builds a sender from a connect string with +// sf_dir set, sends rows through it, closes, and confirms the +// fake server saw the frames AND the slot dir was created on disk. 
+func TestSfConfEndToEnd(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer srv.Close() + + tmp := t.TempDir() + addr := strings.TrimPrefix(srv.URL, "http://") + confStr := strings.Join([]string{ + "ws::addr=" + addr, + "sf_dir=" + tmp, + "sender_id=test-slot", + "sf_max_bytes=4096", + "sf_max_total_bytes=" + fmt.Sprintf("%d", int64(64*1024)), + "close_flush_timeout_millis=5000;", + }, ";") + + ls, err := LineSenderFromConf(context.Background(), confStr) + require.NoError(t, err) + + for i := 0; i < 5; i++ { + require.NoError(t, ls.Table("t").Int64Column("v", int64(i)).AtNow(context.Background())) + } + require.NoError(t, ls.Close(context.Background())) + + // The slot dir must have been created. + st, err := os.Stat(filepath.Join(tmp, "test-slot")) + require.NoError(t, err) + assert.True(t, st.IsDir()) + // On clean drain, residual .sfa files are unlinked. The .lock + // file may remain (it's not unlinked on close). + entries, err := os.ReadDir(filepath.Join(tmp, "test-slot")) + require.NoError(t, err) + for _, e := range entries { + assert.NotEqual(t, ".sfa", filepath.Ext(e.Name()), + "unexpected leftover segment file %s", e.Name()) + } + // Server received at least one frame. + assert.GreaterOrEqual(t, srv.totalFramesReceived.Load(), int64(1)) +} + +// TestQwpIngressAcceptsTargetInert is the end-to-end M10 regression +// guard: a connect string with target=primary (or replica) must +// connect and deliver rows on the ingress path, not storm. The +// ingestion path does not route by server role (role-based selection +// is egress-only), so target= is accepted but inert — every reachable +// host binds, symmetric with zone=. The flush+ACK barrier is the +// assertion: it only completes once the send loop binds a host and the +// server ACKs. 
Were target= "enforced" on this path — which never +// evaluates the role — the round-walk would reject every upgrade and +// re-sweep until the reconnect budget expired, so this barrier would +// hang until timeout. +func TestQwpIngressAcceptsTargetInert(t *testing.T) { + for _, target := range []string{"primary", "replica"} { + t.Run("target="+target, func(t *testing.T) { + srv := newQwpTestServer(t) // ACKs every frame + defer srv.Close() + addr := strings.TrimPrefix(srv.URL, "http://") + + ls, err := LineSenderFromConf(context.Background(), + "ws::addr="+addr+";target="+target+";") + require.NoError(t, err) + defer ls.Close(context.Background()) + + s, ok := ls.(*qwpLineSender) + require.True(t, ok, "LineSenderFromConf must yield a *qwpLineSender") + + require.NoError(t, + s.Table("t").Int64Column("v", 1).AtNow(context.Background())) + flushAndAwaitAck(t, s) + }) + } +} + +func TestSfConfPicksDefaultSenderIdWhenUnset(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer srv.Close() + tmp := t.TempDir() + addr := strings.TrimPrefix(srv.URL, "http://") + ls, err := LineSenderFromConf(context.Background(), + "ws::addr="+addr+";sf_dir="+tmp+";close_flush_timeout_millis=2000;") + require.NoError(t, err) + require.NoError(t, ls.Close(context.Background())) + // Default sender_id is "default". 
+ st, err := os.Stat(filepath.Join(tmp, "default")) + require.NoError(t, err) + assert.True(t, st.IsDir()) +} + +func TestSfConfWithSfDirOptionBuilder(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer srv.Close() + tmp := t.TempDir() + addr := strings.TrimPrefix(srv.URL, "http://") + ls, err := NewLineSender(context.Background(), + WithQwp(), + WithAddress(addr), + WithSfDir(tmp), + WithSenderId("opt-builder"), + WithCloseFlushTimeout(2*time.Second), + ) + require.NoError(t, err) + require.NoError(t, ls.Table("t").Int64Column("v", 1).AtNow(context.Background())) + require.NoError(t, ls.Close(context.Background())) + st, err := os.Stat(filepath.Join(tmp, "opt-builder")) + require.NoError(t, err) + assert.True(t, st.IsDir()) +} + +// reserveLocalPort grabs a free TCP port and immediately releases it. +// The returned address is suitable for "no server is listening here" +// scenarios — between the release and the test using the address, +// another process *could* in principle grab the port, but for short- +// lived test windows on localhost this is reliable enough in practice. +func reserveLocalPort(t *testing.T) string { + t.Helper() + l, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + addr := l.Addr().String() + require.NoError(t, l.Close()) + return addr +} + +// TestSfConfInitialConnectAsyncReturnsImmediately is the headline +// behavior of `initial_connect_retry=async`: LineSenderFromConf must +// return immediately even when no server is reachable. The I/O +// goroutine retries connect in the background; the producer is +// unblocked. With `reconnect_max_duration_millis=60000`, anything +// that waited on connect would hang the test for a minute — assert a +// sub-second construction time instead. 
+func TestSfConfInitialConnectAsyncReturnsImmediately(t *testing.T) { + tmp := t.TempDir() + addr := reserveLocalPort(t) + cfg := strings.Join([]string{ + "ws::addr=" + addr, + "sf_dir=" + tmp, + "initial_connect_retry=async", + "reconnect_max_duration_millis=60000", + "reconnect_initial_backoff_millis=10", + "reconnect_max_backoff_millis=50", + // Fast close: don't block on a drain that can't complete + // without a server. + "close_flush_timeout_millis=0;", + }, ";") + + t0 := time.Now() + ls, err := LineSenderFromConf(context.Background(), cfg) + require.NoError(t, err) + elapsed := time.Since(t0) + assert.Less(t, elapsed, 2*time.Second, + "LineSenderFromConf must return immediately in async mode (took %s)", elapsed) + + // Producer-side calls work without a live wire — frames accumulate + // on the cursor SF engine while the I/O goroutine is still trying + // to connect. + require.NoError(t, ls.Table("foo").Int64Column("v", 1).AtNow(context.Background())) + require.NoError(t, ls.Close(context.Background())) +} + +// TestSfConfInitialConnectAsyncDeliversWhenServerComesUp covers the +// late-arrival flow: the sender opens before the server is listening, +// the producer publishes a row to the cursor SF engine, then the +// server starts. The buffered frame must be delivered and ACKed by +// the I/O goroutine once the wire is up. +// +// Also pins the post-v4.2.0 flush contract (Java decision #1): with +// the server still down, FlushAndGetSequence must NOT block on the +// ACK — it returns the published FSN immediately because the frame +// is already durable in the SF engine. AwaitAckedFsn is the +// dedicated barrier that blocks until the I/O loop delivers it and +// the server ACKs. +func TestSfConfInitialConnectAsyncDeliversWhenServerComesUp(t *testing.T) { + // Reserve a port and bind a listener on it that we'll later wrap + // with httptest. 
By holding the port across the gap we avoid the + // race where another process could steal it between reserve and + // re-bind. + listener, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + addr := listener.Addr().String() + + tmp := t.TempDir() + cfg := strings.Join([]string{ + "ws::addr=" + addr, + "sf_dir=" + tmp, + "initial_connect_retry=async", + "reconnect_max_duration_millis=10000", + "reconnect_initial_backoff_millis=20", + "reconnect_max_backoff_millis=200", + "close_flush_timeout_millis=5000;", + }, ";") + + ls, err := LineSenderFromConf(context.Background(), cfg) + require.NoError(t, err) + defer func() { _ = ls.Close(context.Background()) }() + + qs, ok := ls.(QwpSender) + require.True(t, ok, "QWP sender must satisfy QwpSender") + + // Append a row before the server is up. The frame lands in the + // cursor SF engine; the I/O goroutine is still retrying connect. + require.NoError(t, qs.Table("foo").Int64Column("v", 42).AtNow(context.Background())) + + // FlushAndGetSequence must return promptly even though the server + // is still down: the frame is durable in the SF engine and flush + // no longer blocks on the ACK. Bound it tightly so a regression + // back to ACK-barrier semantics fails loudly here. + flushCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + fsn, err := qs.FlushAndGetSequence(flushCtx) + require.NoError(t, err, "FlushAndGetSequence must not block on ACK while the server is down") + require.GreaterOrEqual(t, fsn, int64(0)) + + // Bring the server up on the held port. Use the same handler as + // the standard test server (just enough to ACK frames). + srv := newQwpSfTestServerOnListener(t, listener) + defer srv.Close() + + // AwaitAckedFsn is the delivery barrier: block until the I/O loop + // has delivered the buffered frame and the server ACKed it. 
+ awaitCtx, awaitCancel := context.WithTimeout(context.Background(), 10*time.Second) + defer awaitCancel() + require.NoError(t, qs.AwaitAckedFsn(awaitCtx, fsn), + "buffered frame must be delivered and ACKed once the server is up") + assert.GreaterOrEqual(t, srv.totalFramesReceived.Load(), int64(1)) +} diff --git a/qwp_sf_dispatcher.go b/qwp_sf_dispatcher.go new file mode 100644 index 00000000..dded8682 --- /dev/null +++ b/qwp_sf_dispatcher.go @@ -0,0 +1,425 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "log" + "runtime" + "strconv" + "sync" + "sync/atomic" + "time" +) + +// qwpSfDefaultErrorInboxCapacity is the default size of the bounded +// inbox connecting the I/O goroutine to the user-handler dispatcher +// goroutine. Java spec § "Configuration knobs" sets the same value. +const qwpSfDefaultErrorInboxCapacity = 256 + +// qwpSfMinErrorInboxCapacity is the floor enforced on user-supplied +// capacities by the connect-string sanitizer per the spec. 
const qwpSfMinErrorInboxCapacity = 16

// qwpSfDispatcherDrainTimeout is the budget drain() gets for
// delivering errors still queued at shutdown. drain() returns as soon
// as the inbox is empty; the timeout only cuts it short while a slow
// handler is still working through queued items, in which case
// whatever remains in the inbox is abandoned.
const qwpSfDispatcherDrainTimeout = 100 * time.Millisecond

// qwpSfDispatcherCloseJoinTimeout bounds how long close() waits to
// join the dispatch goroutine after signalling done. A healthy
// goroutine finishes its in-flight handler call and its own bounded
// drain() well within this budget. A user handler wedged in a
// never-returning call leaves the goroutine parked in deliver(), so
// it never observes done and never calls wg.Done(); the bound lets
// close() abandon that goroutine instead of blocking on it forever.
// Larger than qwpSfDispatcherDrainTimeout so a handler that is merely
// slow (not wedged) still joins cleanly.
const qwpSfDispatcherCloseJoinTimeout = 2 * qwpSfDispatcherDrainTimeout

// qwpSfErrorDispatcher is the off-I/O delivery channel for SenderError
// notifications. The I/O goroutine offers errors non-blockingly into a
// bounded channel; a dedicated goroutine drains the channel and
// invokes the user-supplied SenderErrorHandler. A slow handler does
// not stall publishing — overflow displaces the oldest queued entry
// (sf-client.md §14.6) and bumps droppedNotifications.
//
// The dispatcher goroutine is started lazily by the first offer that
// reaches an open dispatcher, so workloads that never see a server
// error pay zero goroutine cost.
type qwpSfErrorDispatcher struct {
	// handler is the callback invoked by deliver() for every dequeued
	// error. Set once at construction.
	handler SenderErrorHandler

	// inbox is the bounded delivery channel. Capacity is set at
	// construction; never resized.
	inbox chan *SenderError

	// done is closed by close() to signal the loop should drain and
	// exit. Closing the inbox would race with offer; instead the
	// loop selects on done.
	done chan struct{}

	// mu serializes offer vs close. offer holds it from the closed
	// check through the channel send; close holds it across the
	// CAS that flips closed=true and the close(done) call. This
	// makes the closed-flag check and the channel send atomic with
	// respect to close — a producer that read closed=false cannot
	// then have its send land after close has already drained.
	mu sync.Mutex

	// startMu serializes lazy-start. Combined with started.Load(),
	// it ensures the goroutine spawns exactly once.
	startMu sync.Mutex

	// started flips true after the dispatch goroutine is launched.
	started atomic.Bool

	// closed flips true on close(). offer() short-circuits to drop
	// when closed.
	closed atomic.Bool

	// dropped counts notifications displaced by inbox overflow plus
	// those swept (undelivered) by close(); read through
	// droppedNotifications().
	dropped atomic.Int64
	// delivered counts handler invocations attempted by deliver(),
	// including ones where the handler panicked; read through
	// totalDelivered().
	delivered atomic.Int64

	// loopGoid is the goroutine ID of loop(), stored when it starts
	// and cleared (back to 0) when it exits. close() compares the
	// caller's goid against it to detect a re-entrant shutdown: a
	// SenderErrorHandler that calls Close() — or swaps the handler,
	// routing through sendLoopSetErrorHandler -> old.close() — runs
	// inside deliver() *on this goroutine*. A wg.Wait() from there
	// would join the loop goroutine to itself and hang forever. 0
	// never matches a real goid, so a close() before loop() starts
	// (or after it exits) takes the normal waiting path.
	loopGoid atomic.Int64

	// wg waits for the dispatch goroutine to exit during close().
	wg sync.WaitGroup
}

// newQwpSfErrorDispatcher constructs a dispatcher with the given
// handler and inbox capacity. A nil handler is replaced by
// defaultSenderErrorHandler, and a capacity below 1 is replaced by
// qwpSfDefaultErrorInboxCapacity. (The connect-string sanitizer
// separately enforces ≥ 16 for user-supplied values, but internal
// callers like tests and the silent-default constructor are allowed
// smaller buffers.)
+func newQwpSfErrorDispatcher(handler SenderErrorHandler, capacity int) *qwpSfErrorDispatcher { + if handler == nil { + handler = defaultSenderErrorHandler + } + if capacity < 1 { + capacity = qwpSfDefaultErrorInboxCapacity + } + return &qwpSfErrorDispatcher{ + handler: handler, + inbox: make(chan *SenderError, capacity), + done: make(chan struct{}), + } +} + +// offer enqueues a SenderError for asynchronous delivery to the +// handler. Always admits the new entry unless the dispatcher is +// closed or e is nil. When the inbox is full, the oldest queued +// entry is displaced to make room (drop-oldest per sf-client.md +// §14.6 — watermarks are monotonic, so the newest entry is always +// the most informative). Each displacement bumps droppedNotifications. +// +// Holds mu across the closed-check, send, and any drop step so close +// cannot interleave. Lazy-starts the dispatch goroutine on the first +// call. Returns true when the new entry is queued, false only when +// the dispatcher is closed or e is nil. +func (d *qwpSfErrorDispatcher) offer(e *SenderError) bool { + if d == nil || e == nil { + return false + } + d.mu.Lock() + defer d.mu.Unlock() + if d.closed.Load() { + return false + } + if !d.started.Load() { + d.startIfNeeded() + } + // Drop-oldest overflow. We hold mu so no concurrent producer can + // run; only the consumer goroutine races with our receive step, + // and it can only remove items. The loop converges in ≤2 iters: + // either our receive drops the head and the retry send succeeds, + // or the consumer drained between the failed send and our receive + // (default fires) and the retry succeeds without counting a drop. + for { + select { + case d.inbox <- e: + return true + default: + } + select { + case <-d.inbox: + d.dropped.Add(1) + default: + } + } +} + +// startIfNeeded launches the dispatch goroutine if it hasn't been +// already. Idempotent under contention. 
+func (d *qwpSfErrorDispatcher) startIfNeeded() { + d.startMu.Lock() + defer d.startMu.Unlock() + if d.started.Load() || d.closed.Load() { + return + } + d.wg.Add(1) + d.started.Store(true) + go d.loop() +} + +// loop is the dispatch goroutine body. It ranges over the inbox +// until close() signals via done; on shutdown it drains any +// remaining queued errors with a short deadline before returning. +// +// Handler panics are recovered and logged; the dispatcher and +// sender continue running. +func (d *qwpSfErrorDispatcher) loop() { + defer d.wg.Done() + // Publish our goroutine identity before the first deliver() so a + // handler that re-enters close() on this goroutine is recognized. + // Cleared on exit so a later close() never matches a stale id. + d.loopGoid.Store(qwpGoid()) + defer d.loopGoid.Store(0) + for { + select { + case e := <-d.inbox: + if e == nil { + continue + } + d.deliver(e) + case <-d.done: + d.drain() + return + } + } +} + +// drain delivers any errors still in the inbox after close. Two +// exit paths: the inbox is empty (the common case — by the time +// drain runs, closed.Load() is true and producers stop offering), +// or qwpSfDispatcherDrainTimeout fires (a slow handler is still +// chewing through queued items). With offer/close serialized +// through mu, no new sends can land here once close has run, so +// the inbox is guaranteed to go quiet. +func (d *qwpSfErrorDispatcher) drain() { + deadline := time.NewTimer(qwpSfDispatcherDrainTimeout) + defer deadline.Stop() + for { + select { + case e := <-d.inbox: + if e == nil { + continue + } + d.deliver(e) + case <-deadline.C: + return + default: + return + } + } +} + +// deliver invokes the handler under a panic guard, bumping the +// delivered counter unconditionally — a handler panic still counts +// as "we attempted delivery" for ops visibility. 
+func (d *qwpSfErrorDispatcher) deliver(e *SenderError) { + d.delivered.Add(1) + defer func() { + if r := recover(); r != nil { + log.Printf("[ERROR] qwp/sf: error handler panicked on %s: %v", e, r) + } + }() + d.handler(e) +} + +// close stops the dispatch goroutine and joins it within a bounded +// budget. Idempotent — second and subsequent calls are no-ops. +// +// Acquires mu before flipping closed and closing done, so any +// in-flight offer either commits its send first (and gets handled +// below) or sees closed=true and returns false. +// +// Paths after signalling done: +// +// - Caller is the loop goroutine itself (a handler re-entering +// close): the re-entrant guard returns immediately. Joining here +// would self-deadlock; loop() unwinds the handler, observes done, +// and runs its own bounded drain(). +// +// - Goroutine never started (no offer ever succeeded, or only +// direct inbox injection in tests): drain() here delivers any +// queued items within the bounded budget. +// +// - Goroutine ran: join it, bounded by +// qwpSfDispatcherCloseJoinTimeout. A handler wedged in a +// never-returning call keeps loop() parked in deliver(), so the +// join times out and the goroutine is abandoned rather than hung +// on. Whatever is still queued — abandoned by drain()'s own +// timeout or never reached by a wedged handler — is counted as +// dropped, since re-delivering would defeat the bound. Together +// these make qwpSfDispatcherCloseJoinTimeout a hard ceiling on +// close() blocking time. +func (d *qwpSfErrorDispatcher) close() { + if d == nil { + return + } + d.mu.Lock() + if !d.closed.CompareAndSwap(false, true) { + d.mu.Unlock() + return + } + close(d.done) + started := d.started.Load() + d.mu.Unlock() + + // Re-entrant shutdown guard. A SenderErrorHandler invoked by + // deliver() on the loop goroutine is allowed to call Close() + // (or swap the handler, which routes through + // sendLoopSetErrorHandler -> old.close()). 
Both land here on + // this very goroutine. wg.Wait() would block until loop() calls + // wg.Done(), but loop() is the current goroutine, suspended in + // the handler frame below this call — a permanent self-join that + // no timeout escapes. done is already closed above, so once the + // handler stack unwinds, loop() observes done, runs its own + // bounded drain(), and exits cleanly. Skip the wait (and the + // post-wait inbox sweep, which would race loop()'s drain) and + // return. Non-loop callers fall through to the normal path. The + // g != 0 check keeps a goid parse failure (returns 0) from + // matching the loopGoid==0 "not running" sentinel. + if g := qwpGoid(); g != 0 && d.loopGoid.Load() == g { + return + } + + if !started { + // The dispatch goroutine never launched (no offer ever + // succeeded, or only direct inbox injection in tests). No + // loop/drain ran, so deliver any queued items here within the + // bounded drain budget. + d.drain() + return + } + + // Join the dispatch goroutine, bounded by + // qwpSfDispatcherCloseJoinTimeout. loop() observes done, runs its + // own bounded drain(), and calls wg.Done() — normally well within + // the budget. A handler wedged in a never-returning call keeps + // loop() parked in deliver() so wg.Done() never fires; the bound + // abandons that goroutine rather than inheriting its hang. + joined := make(chan struct{}) + go func() { + d.wg.Wait() + close(joined) + }() + timer := time.NewTimer(qwpSfDispatcherCloseJoinTimeout) + defer timer.Stop() + select { + case <-joined: + case <-timer.C: + log.Printf("[WARN] qwp/sf: error handler still running %s after close; "+ + "abandoning dispatcher goroutine and dropping queued notifications", + qwpSfDispatcherCloseJoinTimeout) + } + + // Sweep whatever remains queued — items drain() abandoned via its + // own timeout, or never reached because the handler is wedged. + // Re-delivering would defeat the close-time bound, so count them + // as dropped. 
+ for { + select { + case e := <-d.inbox: + if e != nil { + d.dropped.Add(1) + } + default: + return + } + } +} + +// droppedNotifications returns the cumulative count of inbox-overflow +// displacements (drop-oldest) plus any items abandoned at close(). +// Non-zero means the user's handler is slower than the error rate. +func (d *qwpSfErrorDispatcher) droppedNotifications() int64 { + if d == nil { + return 0 + } + return d.dropped.Load() +} + +// totalDelivered returns the cumulative count of errors delivered to +// the handler (including those where the handler panicked). +func (d *qwpSfErrorDispatcher) totalDelivered() int64 { + if d == nil { + return 0 + } + return d.delivered.Load() +} + +// defaultSenderErrorHandler is the loud-not-silent fallback used when +// the user has not registered a handler. ERROR for HALT, WARN for +// DROP — both with the full structured payload. Per Java spec +// § "Loud defaults — silence is forbidden". +func defaultSenderErrorHandler(e *SenderError) { + if e == nil { + return + } + level := "[ERROR]" + if e.AppliedPolicy == PolicyDropAndContinue { + level = "[WARN]" + } + log.Printf("%s qwp/sf: %s", level, e) +} + +// qwpGoid returns the numeric ID of the calling goroutine, or 0 if it +// cannot be parsed. Go exposes goroutine identity only through the +// runtime.Stack header ("goroutine []:"); there is no +// public accessor. This is used solely by the dispatcher's re-entrant +// close() guard — a SenderErrorHandler that calls Close() runs on the +// dispatcher loop goroutine and a blocking join from there would +// self-deadlock. The cost (one fixed-size runtime.Stack of the current +// goroutine only) is paid once at loop() start and on close(), never +// on the publish/encode hot path. 
+func qwpGoid() int64 { + var buf [64]byte + n := runtime.Stack(buf[:], false) + const prefix = "goroutine " + b := buf[:n] + if len(b) < len(prefix) { + return 0 + } + b = b[len(prefix):] + i := 0 + for i < len(b) && b[i] >= '0' && b[i] <= '9' { + i++ + } + id, err := strconv.ParseInt(string(b[:i]), 10, 64) + if err != nil { + return 0 + } + return id +} diff --git a/qwp_sf_dispatcher_test.go b/qwp_sf_dispatcher_test.go new file mode 100644 index 00000000..20eb36a3 --- /dev/null +++ b/qwp_sf_dispatcher_test.go @@ -0,0 +1,471 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "sync" + "sync/atomic" + "testing" + "time" +) + +// TestQwpSfDispatcherDeliversInOrder asserts the dispatcher delivers +// queued errors to the handler FIFO and counts each delivery. 
+func TestQwpSfDispatcherDeliversInOrder(t *testing.T) { + var got []*SenderError + var mu sync.Mutex + done := make(chan struct{}, 3) + d := newQwpSfErrorDispatcher(func(e *SenderError) { + mu.Lock() + got = append(got, e) + mu.Unlock() + done <- struct{}{} + }, 8) + defer d.close() + + es := []*SenderError{ + {Category: CategoryParseError}, + {Category: CategoryWriteError}, + {Category: CategorySchemaMismatch}, + } + for _, e := range es { + if !d.offer(e) { + t.Fatalf("offer dropped a non-full inbox") + } + } + for range es { + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("handler not invoked in time") + } + } + mu.Lock() + defer mu.Unlock() + if len(got) != len(es) { + t.Fatalf("got %d, want %d", len(got), len(es)) + } + for i := range es { + if got[i] != es[i] { + t.Errorf("got[%d]=%v, want %v", i, got[i], es[i]) + } + } + if d.totalDelivered() != int64(len(es)) { + t.Errorf("delivered = %d, want %d", d.totalDelivered(), len(es)) + } + if d.droppedNotifications() != 0 { + t.Errorf("dropped = %d, want 0", d.droppedNotifications()) + } +} + +// TestQwpSfDispatcherSlowHandlerDropsOldest asserts that when a slow +// handler causes the inbox to fill, the OLDEST queued entry is +// displaced to admit the new one (sf-client.md §14.6). Every offer +// must be admitted; only previously queued entries are displaced; +// the inbox at end-of-flood must contain the most recent items. 
+func TestQwpSfDispatcherSlowHandlerDropsOldest(t *testing.T) { + release := make(chan struct{}) + handlerStarted := make(chan struct{}) + var mu sync.Mutex + var delivered []*SenderError + var firstOnce sync.Once + d := newQwpSfErrorDispatcher(func(e *SenderError) { + firstOnce.Do(func() { close(handlerStarted) }) + mu.Lock() + delivered = append(delivered, e) + mu.Unlock() + <-release + }, 4) + + items := make([]*SenderError, 9) + for i := range items { + items[i] = &SenderError{Category: CategoryParseError, ToFsn: int64(i)} + } + + // First offer lazy-starts the dispatcher. Wait until the handler + // has actually pulled item 0 so the inbox is verifiably empty + // before we fill it. + if !d.offer(items[0]) { + t.Fatal("first offer rejected on empty inbox") + } + select { + case <-handlerStarted: + case <-time.After(2 * time.Second): + t.Fatal("handler did not start within timeout") + } + + // Fill the inbox to capacity (4) without overflowing. + for i := 1; i <= 4; i++ { + if !d.offer(items[i]) { + t.Fatalf("offer %d rejected on non-full inbox", i) + } + } + if got := d.droppedNotifications(); got != 0 { + t.Fatalf("dropped = %d before overflow, want 0", got) + } + + // Offer 4 more. Drop-oldest must admit each one and displace the + // oldest entry that was queued. + for i := 5; i <= 8; i++ { + if !d.offer(items[i]) { + t.Fatalf("offer %d rejected (drop-oldest must admit every offer)", i) + } + } + if got, want := d.droppedNotifications(), int64(4); got != want { + t.Errorf("dropped = %d, want %d (one per overflow offer)", got, want) + } + + // Release the handler and drain. Item 0 was already in the handler + // when the flood started; items 1-4 should have been displaced; + // items 5-8 should still be queued. Total delivered: 5. 
+ close(release) + d.close() + + mu.Lock() + defer mu.Unlock() + if len(delivered) != 5 { + t.Fatalf("delivered = %d, want 5 (item 0 + 4 newest)", len(delivered)) + } + wantFsns := []int64{0, 5, 6, 7, 8} + for i, want := range wantFsns { + if delivered[i].ToFsn != want { + t.Errorf("delivered[%d] ToFsn = %d, want %d (drop-oldest must preserve newest)", + i, delivered[i].ToFsn, want) + } + } +} + +// TestQwpSfDispatcherCloseIsIdempotent asserts close() can be called +// multiple times without panicking or leaking goroutines. +func TestQwpSfDispatcherCloseIsIdempotent(t *testing.T) { + d := newQwpSfErrorDispatcher(func(e *SenderError) {}, 4) + d.close() + d.close() // must not panic + if d.offer(&SenderError{}) { + t.Fatal("offer succeeded on closed dispatcher") + } +} + +// TestQwpSfDispatcherCloseDrainsLeftover asserts that an item in the +// inbox at close time is delivered even when the dispatcher goroutine +// never started. Reproduces the never-started race: in production +// offer's send-to-inbox can complete before its startIfNeeded call, +// and a close() that wins the closed flag between those two steps +// would otherwise strand the queued payload. 
+func TestQwpSfDispatcherCloseDrainsLeftover(t *testing.T) { + var got *SenderError + var mu sync.Mutex + d := newQwpSfErrorDispatcher(func(e *SenderError) { + mu.Lock() + got = e + mu.Unlock() + }, 4) + + want := &SenderError{Category: CategoryParseError, AppliedPolicy: PolicyHalt} + d.inbox <- want + if d.started.Load() { + t.Fatal("test setup: dispatcher unexpectedly started") + } + + d.close() + + mu.Lock() + defer mu.Unlock() + if got != want { + t.Fatalf("got = %v, want %v — close did not synchronously drain", got, want) + } + if d.totalDelivered() != 1 { + t.Errorf("delivered = %d, want 1", d.totalDelivered()) + } +} + +// TestQwpSfDispatcherOfferCloseRaceNoLoss stresses the offer/close +// serialization: every offer that returns true must result in a +// delivered handler invocation, even when close races with offers +// from many goroutines. Verifies mu prevents a producer's send from +// landing in an abandoned inbox after close has drained. +func TestQwpSfDispatcherOfferCloseRaceNoLoss(t *testing.T) { + const iterations = 200 + const offerers = 16 + for iter := 0; iter < iterations; iter++ { + var delivered atomic.Int64 + d := newQwpSfErrorDispatcher(func(e *SenderError) { + delivered.Add(1) + }, offerers*2) + + var accepted atomic.Int64 + var wg sync.WaitGroup + start := make(chan struct{}) + for k := 0; k < offerers; k++ { + wg.Add(1) + go func() { + defer wg.Done() + <-start + if d.offer(&SenderError{Category: CategoryParseError}) { + accepted.Add(1) + } + }() + } + wg.Add(1) + go func() { + defer wg.Done() + <-start + d.close() + }() + close(start) + wg.Wait() + + if got, want := delivered.Load(), accepted.Load(); got != want { + t.Fatalf("iter %d: delivered=%d, accepted=%d (lost %d)", + iter, got, want, want-got) + } + } +} + +// TestQwpSfDispatcherPanicCaught asserts a panicking handler is +// recovered and does not stop the dispatcher. 
+func TestQwpSfDispatcherPanicCaught(t *testing.T) { + var calls atomic.Int64 + d := newQwpSfErrorDispatcher(func(e *SenderError) { + calls.Add(1) + if calls.Load() == 1 { + panic("boom") + } + }, 4) + defer d.close() + + d.offer(&SenderError{Category: CategoryParseError}) + d.offer(&SenderError{Category: CategoryWriteError}) + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + if calls.Load() >= 2 { + break + } + time.Sleep(5 * time.Millisecond) + } + if calls.Load() < 2 { + t.Fatalf("dispatcher stopped after panic: calls=%d", calls.Load()) + } + if d.totalDelivered() < 2 { + t.Errorf("delivered = %d, want ≥ 2 (panic counts as delivery)", + d.totalDelivered()) + } +} + +// TestQwpSfDispatcherLazyStart asserts no goroutine is spawned until +// the first successful offer. +func TestQwpSfDispatcherLazyStart(t *testing.T) { + d := newQwpSfErrorDispatcher(func(e *SenderError) {}, 4) + if d.started.Load() { + t.Fatal("dispatcher started before any offer") + } + d.offer(&SenderError{Category: CategoryParseError}) + deadline := time.Now().Add(time.Second) + for time.Now().Before(deadline) { + if d.started.Load() { + break + } + time.Sleep(time.Millisecond) + } + if !d.started.Load() { + t.Fatal("dispatcher did not start after offer") + } + d.close() +} + +// TestQwpSfDispatcherNilHandlerUsesDefault asserts a nil handler +// falls through to the loud-not-silent default rather than panicking. +func TestQwpSfDispatcherNilHandlerUsesDefault(t *testing.T) { + d := newQwpSfErrorDispatcher(nil, 4) + defer d.close() + d.offer(&SenderError{ + Category: CategoryParseError, + AppliedPolicy: PolicyHalt, + }) + deadline := time.Now().Add(time.Second) + for time.Now().Before(deadline) { + if d.totalDelivered() >= 1 { + return + } + time.Sleep(time.Millisecond) + } + t.Fatalf("default handler not invoked: delivered=%d", d.totalDelivered()) +} + +// TestQwpSfDispatcherNilOfferIsNoop asserts that offer(nil) returns +// false without affecting counters. 
+func TestQwpSfDispatcherNilOfferIsNoop(t *testing.T) { + d := newQwpSfErrorDispatcher(func(e *SenderError) {}, 4) + defer d.close() + if d.offer(nil) { + t.Fatal("offer(nil) returned true") + } + if d.droppedNotifications() != 0 { + t.Errorf("nil offer should not bump dropped: %d", d.droppedNotifications()) + } +} + +// TestQwpSfDispatcherCloseFromHandlerNoSelfJoin is a regression test +// for the self-join deadlock: a SenderErrorHandler that calls the +// sender's Close() runs inside deliver() on the dispatcher loop +// goroutine, and Close() funnels into dispatcher.close(). Before the +// fix, close()'s unbounded wg.Wait() waited for loop() to exit while +// loop() was suspended in the handler frame beneath that wait — a +// permanent hang no timeout escaped. close() must recognize the +// re-entrant caller, return without waiting, and let loop() unwind +// itself once the handler stack returns. +func TestQwpSfDispatcherCloseFromHandlerNoSelfJoin(t *testing.T) { + var d *qwpSfErrorDispatcher + returned := make(chan struct{}) + d = newQwpSfErrorDispatcher(func(e *SenderError) { + d.close() // re-entrant: runs on the loop goroutine + close(returned) + }, 4) + + if !d.offer(&SenderError{Category: CategoryParseError, AppliedPolicy: PolicyHalt}) { + t.Fatal("offer rejected on a fresh dispatcher") + } + + select { + case <-returned: + // close() returned to the handler — no self-join. + case <-time.After(2 * time.Second): + t.Fatal("Close() from handler deadlocked (self-join on the dispatcher loop goroutine)") + } + + // Fully closed: further offers rejected, and the loop goroutine + // terminates (wg released) shortly after the handler unwinds. 
+ if d.offer(&SenderError{Category: CategoryParseError}) { + t.Fatal("offer accepted after re-entrant close") + } + loopExited := make(chan struct{}) + go func() { d.wg.Wait(); close(loopExited) }() + select { + case <-loopExited: + case <-time.After(2 * time.Second): + t.Fatal("loop goroutine did not exit after re-entrant close") + } + d.close() // idempotent re-close from the test goroutine must not hang +} + +// TestQwpSfDispatcherExternalCloseStillJoinsLoop guards against the +// re-entrancy fix over-firing: a close() from a goroutine other than +// the loop's must still block until the loop goroutine has exited, so +// callers that free resources after Close() returns stay safe. +func TestQwpSfDispatcherExternalCloseStillJoinsLoop(t *testing.T) { + release := make(chan struct{}) + var inHandler atomic.Bool + d := newQwpSfErrorDispatcher(func(e *SenderError) { + inHandler.Store(true) + <-release // pin the loop goroutine inside deliver() + }, 4) + + if !d.offer(&SenderError{Category: CategoryParseError}) { + t.Fatal("offer rejected on a fresh dispatcher") + } + deadline := time.Now().Add(2 * time.Second) + for !inHandler.Load() { + if time.Now().After(deadline) { + t.Fatal("handler never invoked") + } + time.Sleep(time.Millisecond) + } + + closeReturned := make(chan struct{}) + go func() { + d.close() // external goroutine: must wait for the loop + close(closeReturned) + }() + + select { + case <-closeReturned: + t.Fatal("external close() returned before the loop goroutine exited") + case <-time.After(100 * time.Millisecond): + } + close(release) // let the handler finish + select { + case <-closeReturned: + case <-time.After(2 * time.Second): + t.Fatal("external close() did not return after the loop drained") + } +} + +// TestQwpSfDispatcherCloseBoundedOnStuckHandler is a regression test +// for M15: a SenderErrorHandler that never returns must not make +// close() hang forever. 
The loop goroutine is parked inside deliver() +// and never calls wg.Done(); close() bounds its join by +// qwpSfDispatcherCloseJoinTimeout, abandons the wedged goroutine, and +// returns. Notifications it could not deliver are counted as dropped. +func TestQwpSfDispatcherCloseBoundedOnStuckHandler(t *testing.T) { + block := make(chan struct{}) + defer close(block) // release the wedged goroutine at test end + var inHandler atomic.Bool + d := newQwpSfErrorDispatcher(func(e *SenderError) { + inHandler.Store(true) + <-block // never returns until the test ends + }, 4) + + // First offer lazy-starts the loop and pins it in the handler. + if !d.offer(&SenderError{Category: CategoryParseError, ToFsn: 0}) { + t.Fatal("offer rejected on a fresh dispatcher") + } + deadline := time.Now().Add(2 * time.Second) + for !inHandler.Load() { + if time.Now().After(deadline) { + t.Fatal("handler never invoked") + } + time.Sleep(time.Millisecond) + } + // Queue more behind the wedged handler so close() has items to + // account as dropped (capacity is 4, so these three never overflow + // on the way in). + for i := 1; i <= 3; i++ { + if !d.offer(&SenderError{Category: CategoryParseError, ToFsn: int64(i)}) { + t.Fatalf("offer %d rejected on a non-full inbox", i) + } + } + + closeReturned := make(chan struct{}) + start := time.Now() + go func() { + d.close() + close(closeReturned) + }() + select { + case <-closeReturned: + case <-time.After(qwpSfDispatcherCloseJoinTimeout + 2*time.Second): + t.Fatal("close() hung on a never-returning handler") + } + // Must have waited at least the join budget before abandoning — a + // near-instant return would mean the bound was skipped. + if elapsed := time.Since(start); elapsed < qwpSfDispatcherCloseJoinTimeout { + t.Errorf("close() returned after %s, want ≥ join budget %s", + elapsed, qwpSfDispatcherCloseJoinTimeout) + } + // The three queued-but-undelivered items were abandoned as dropped. 
+ if got := d.droppedNotifications(); got != 3 { + t.Errorf("dropped = %d, want 3 (queued items abandoned at bounded close)", got) + } +} diff --git a/qwp_sf_drainer.go b/qwp_sf_drainer.go new file mode 100644 index 00000000..8fe970d3 --- /dev/null +++ b/qwp_sf_drainer.go @@ -0,0 +1,506 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "context" + "errors" + "fmt" + "log" + "strings" + "sync" + "sync/atomic" + "time" +) + +// qwpSfDrainOutcome is the terminal state of a drainer's run. +type qwpSfDrainOutcome int32 + +const ( + qwpSfDrainOutcomePending qwpSfDrainOutcome = iota + qwpSfDrainOutcomeLockedByOther + qwpSfDrainOutcomeSuccess + qwpSfDrainOutcomeFailed + qwpSfDrainOutcomeStopped +) + +// qwpSfDrainerPollInterval is how often the drainer wakes to +// re-check whether the slot is fully drained. 
const qwpSfDrainerPollInterval = 50 * time.Millisecond

// Close-path grace periods used by the drainer pool's close(). Both
// are vars (not consts) so package tests can dial them down.
var (
	// qwpSfDrainerPoolCloseGrace bounds how long the pool's close()
	// waits for active drainers to exit cleanly before cancelling the
	// pool's master ctx to forcibly unwind blocking dials. Mirrors the
	// Java 3-second grace; tests shorten it so they do not pay the
	// full 3 s.
	qwpSfDrainerPoolCloseGrace = 3 * time.Second

	// qwpSfDrainerPoolHardCloseGrace bounds how long the pool's close()
	// waits AFTER cancelling the master ctx. Cancellation unwinds
	// ctx-aware blocking (TCP dials, the drainer poll loop); a drainer
	// still alive past this second grace is wedged in I/O the ctx
	// cannot reach — drainerRun's engine-open phase (flock, mmap, full
	// CRC scan of a possibly-huge slot, hung NFS) makes no ctx checks.
	// Such a drainer is abandoned rather than blocking close() on
	// un-cancellable I/O; the slot it holds stays a valid orphan for a
	// future sender to re-adopt.
	qwpSfDrainerPoolHardCloseGrace = time.Second
)

// qwpSfOrphanDrainer empties one orphan slot and exits. Owned by
// qwpSfDrainerPool; one instance per slot.
//
// Lifecycle:
//  1. Open a cursor engine on the slot — recovery picks up every
//     .sfa file already on disk. The engine itself acquires the
//     slot lock; if it's held by someone else we exit silently.
//  2. Open a fresh transport via the supplied factory (separate
//     connection from the foreground sender).
//  3. Run a send loop until ackedFsn catches up to the snapshot of
//     publishedFsn taken at startup.
//  4. Close everything in reverse order; release the lock.
//
// On terminal failure (auth-rejection, reconnect-budget exhaustion,
// recovery error), the drainer drops a .failed sentinel into the
// slot before exiting. Future scans skip the slot until an operator
// clears the sentinel — bounded automatic retry, then human-in-
// the-loop.
+type qwpSfOrphanDrainer struct { + slotPath string + segmentSize int64 + sfMaxTotalBytes int64 + clientFactory qwpSfReconnectFactory + // tracker is the shared host-health tracker. When non-nil, the + // drainer participates in the same failover.md §2 model the + // foreground SF loop uses: PickNext observations from one loop + // inform the next. Each drainer's send loop owns a private + // previousIdx slot on the shared tracker per §2.3, so mid-stream + // demotions don't corrupt foreground's bookkeeping (or each + // other's). nil = synthesized 1-host implicit tracker (legacy + // single-host tests). + tracker *qwpHostTracker + reconnectMaxDuration time.Duration + reconnectInitialBackoff time.Duration + reconnectMaxBackoff time.Duration + stopRequested atomic.Bool + targetFsn atomic.Int64 // -1 until startup observes publishedFsn + ackedFsn atomic.Int64 // mirrors engine.ackedFsn for visibility + outcome atomic.Int32 + lastErrorMessage atomic.Pointer[string] +} + +// qwpSfNewOrphanDrainer constructs a drainer for the given slot. +// All knobs are required; pool defaults are not applied here so +// the caller (the drainer pool) can pass through user-configured +// values verbatim. +// +// tracker is the shared foreground host-health tracker (failover.md +// §2). Pass nil for legacy single-host tests; the drainer +// synthesizes a 1-host implicit tracker internally in that case. 
+func qwpSfNewOrphanDrainer( + slotPath string, + segmentSize, sfMaxTotalBytes int64, + clientFactory qwpSfReconnectFactory, + tracker *qwpHostTracker, + reconnectMaxDuration, reconnectInitialBackoff, reconnectMaxBackoff time.Duration, +) *qwpSfOrphanDrainer { + d := &qwpSfOrphanDrainer{ + slotPath: slotPath, + segmentSize: segmentSize, + sfMaxTotalBytes: sfMaxTotalBytes, + clientFactory: clientFactory, + tracker: tracker, + reconnectMaxDuration: reconnectMaxDuration, + reconnectInitialBackoff: reconnectInitialBackoff, + reconnectMaxBackoff: reconnectMaxBackoff, + } + d.targetFsn.Store(-1) + d.ackedFsn.Store(-1) + d.outcome.Store(int32(qwpSfDrainOutcomePending)) + return d +} + +// drainerOutcome returns the terminal state of the drainer's run, +// or qwpSfDrainOutcomePending while it's still running. +func (d *qwpSfOrphanDrainer) drainerOutcome() qwpSfDrainOutcome { + return qwpSfDrainOutcome(d.outcome.Load()) +} + +// drainerSlotPath returns the absolute path of the orphan slot +// the drainer adopted. +func (d *qwpSfOrphanDrainer) drainerSlotPath() string { + return d.slotPath +} + +// drainerLastError returns the latest error string the drainer +// recorded, or "" if no error has been recorded. +func (d *qwpSfOrphanDrainer) drainerLastError() string { + if p := d.lastErrorMessage.Load(); p != nil { + return *p + } + return "" +} + +// drainerTargetFsn returns the publishedFsn snapshot taken at +// startup, or -1 if the drainer hasn't started yet. +func (d *qwpSfOrphanDrainer) drainerTargetFsn() int64 { + return d.targetFsn.Load() +} + +// drainerAckedFsn returns the latest known ackedFsn for the slot. +func (d *qwpSfOrphanDrainer) drainerAckedFsn() int64 { + return d.ackedFsn.Load() +} + +// drainerRequestStop politely asks the drainer to exit at its next +// poll. Used by the pool's close path; drainers ALSO exit on their +// own when the slot fully drains. 
+func (d *qwpSfOrphanDrainer) drainerRequestStop() { + d.stopRequested.Store(true) +} + +func (d *qwpSfOrphanDrainer) recordFailure(reason string) { + d.lastErrorMessage.Store(&reason) + qwpSfMarkSlotFailed(d.slotPath, reason) + d.outcome.Store(int32(qwpSfDrainOutcomeFailed)) +} + +// drainerRun is the drainer goroutine entry point. Runs to +// completion (or terminal failure), then sets outcome and exits. +func (d *qwpSfOrphanDrainer) drainerRun(ctx context.Context) { + engine, err := qwpSfNewCursorEngine(d.slotPath, d.segmentSize, d.sfMaxTotalBytes, qwpSfEngineDefaultAppendDeadline) + if err != nil { + // Lock contention is expected (a sibling drainer or the + // foreground sender holds it) — exit silently, no .failed. + if errors.Is(err, qwpSfErrLockBusy) || strings.Contains(err.Error(), "slot already in use") { + d.outcome.Store(int32(qwpSfDrainOutcomeLockedByOther)) + return + } + // Recovery / disk error — surface as failure with sentinel. + msg := err.Error() + d.lastErrorMessage.Store(&msg) + qwpSfMarkSlotFailed(d.slotPath, "engine open: "+msg) + d.outcome.Store(int32(qwpSfDrainOutcomeFailed)) + return + } + defer func() { _ = engine.engineClose() }() + + target := engine.enginePublishedFsn() + d.targetFsn.Store(target) + if engine.engineAckedFsn() >= target { + // Slot is already drained — engineClose will unlink residual + // .sfa files in its own logic. + d.outcome.Store(int32(qwpSfDrainOutcomeSuccess)) + return + } + // Initial connect via the round-walk so the drainer immediately + // honours classifications the foreground tracker has already + // observed (e.g. host 0 is currently TopologyReject — start at + // host 1 instead). When d.tracker is nil, qwpSfConnectWithRetry + // synthesises a 1-host implicit tracker, matching the legacy + // behaviour single-host tests rely on. 
+ transport, boundIdx, err := qwpSfConnectWithRetry(ctx, d.clientFactory, d.tracker, + d.reconnectMaxDuration, d.reconnectInitialBackoff, d.reconnectMaxBackoff) + if err != nil { + // Pool close (or caller cancellation) during the dial: + // don't drop a .failed sentinel — the slot is still + // drainable on a future sender start. + if ctx.Err() != nil || d.stopRequested.Load() { + d.outcome.Store(int32(qwpSfDrainOutcomeStopped)) + return + } + msg := err.Error() + d.recordFailure("initial connect: " + msg) + return + } + loop := qwpSfNewSendLoop(engine, transport, d.clientFactory, + qwpSfDefaultParkInterval, + d.reconnectMaxDuration, d.reconnectInitialBackoff, d.reconnectMaxBackoff) + // Share the foreground tracker; the loop carries its OWN + // previousIdx slot (failover.md §2.3 "per-caller previousIdx, + // not shared") so a mid-stream demote here doesn't corrupt + // foreground's bookkeeping. + loop.sendLoopSetHostTracker(d.tracker, boundIdx) + engine.engineSetReconnectStatusGetter(loop.sendLoopReconnectStatus) + loop.sendLoopStart() + defer func() { _ = loop.sendLoopClose() }() + + timer := time.NewTicker(qwpSfDrainerPollInterval) + defer timer.Stop() + // No-progress watchdog. A server that completes the WS upgrade + // and accepts our frames but never ACKs and never drops the + // connection (wedged server, black-hole proxy, or a silently + // incompatible build that holds the socket open) keeps acked + // below target forever while sendLoopCheckError stays nil — the + // run()-level "frames sent, zero acks → terminal" heuristic only + // fires after the connection drops, which by definition never + // happens here. Without a bound the drainer spins on the poll + // interval forever and, on Close, exits Stopped (no .failed + // sentinel), so every future process start re-adopts the same + // wedged slot in full — an unbounded re-adoption livelock. 
+ // + // Bound it with the same reconnectMaxDuration budget that bounds + // the connect round-walk (this mirrors the Java drainer's + // connect-phase deadline semantics — "give the cluster a budget + // to settle before quarantining the slot"): if acked makes no + // forward progress for that long while we are NOT inside a + // (separately bounded) reconnect, drop a .failed sentinel so the + // design's "bounded automatic retry, then human-in-the-loop" + // promise holds. A reconnect exhausting its own budget still + // surfaces ahead of this via sendLoopCheckError. + noProgressBudget := d.reconnectMaxDuration + if noProgressBudget <= 0 { + noProgressBudget = qwpSfDefaultReconnectMaxDuration + } + lastProgressAcked := engine.engineAckedFsn() + lastProgressAt := time.Now() + for { + acked := engine.engineAckedFsn() + d.ackedFsn.Store(acked) + if acked >= target { + d.outcome.Store(int32(qwpSfDrainOutcomeSuccess)) + return + } + if err := loop.sendLoopCheckError(); err != nil { + d.recordFailure("wire: " + err.Error()) + return + } + if d.stopRequested.Load() { + d.outcome.Store(int32(qwpSfDrainOutcomeStopped)) + return + } + // Forward ACK progress, or being inside the separately + // bounded reconnect loop, resets the watchdog clock. A fresh + // connection thus always gets a full budget to produce its + // first ACK. 
+ now := time.Now() + reconnecting, _, _ := loop.sendLoopReconnectStatus() + switch { + case acked > lastProgressAcked || reconnecting: + lastProgressAcked = acked + lastProgressAt = now + case now.Sub(lastProgressAt) >= noProgressBudget: + d.recordFailure(fmt.Sprintf( + "no drain progress: ackedFsn stuck at %d (target %d) for %s "+ + "on a live connection — server accepted frames but is not "+ + "ACKing (wedged server or incompatible build)", + acked, target, now.Sub(lastProgressAt))) + return + } + select { + case <-ctx.Done(): + d.outcome.Store(int32(qwpSfDrainOutcomeStopped)) + return + case <-timer.C: + } + } +} + +// qwpSfDrainerPool is a bounded thread pool that runs orphan +// drainer tasks. One pool per foreground sender; size capped by +// max_background_drainers. +// +// Each drainer gets its own goroutine, throttled by a buffered +// semaphore channel. Idle pool (no orphans submitted) costs zero +// goroutines. Closing the pool requests every still-running +// drainer to stop and waits up to qwpSfDrainerPoolCloseGrace for +// them to exit cleanly; if any drainer is still alive after the +// grace (typically blocked in a TCP dial / WS upgrade), the pool +// cancels its master context so blocking I/O unwinds, then waits a +// further qwpSfDrainerPoolHardCloseGrace. A drainer wedged in +// un-cancellable I/O past that bound is abandoned (with a logged +// count) so close() never hangs. +type qwpSfDrainerPool struct { + maxConcurrent int + sem chan struct{} + closed atomic.Bool + wg sync.WaitGroup + + // ctx is the master context handed to every drainerRun call. + // Cancelled in drainerPoolClose so dials and other ctx-aware + // blocking calls unwind. Independent of the caller's setup + // ctx — drainers are long-lived and must outlive whatever + // transient ctx was used to construct the parent sender. 
+ ctx context.Context + cancel context.CancelFunc + + mu sync.Mutex + active []*qwpSfOrphanDrainer +} + +// qwpSfNewDrainerPool constructs a pool with the given concurrency +// cap. Panics on a non-positive cap. +func qwpSfNewDrainerPool(maxConcurrent int) *qwpSfDrainerPool { + if maxConcurrent <= 0 { + panic("qwp/sf: maxConcurrent must be > 0") + } + ctx, cancel := context.WithCancel(context.Background()) + return &qwpSfDrainerPool{ + maxConcurrent: maxConcurrent, + sem: make(chan struct{}, maxConcurrent), + ctx: ctx, + cancel: cancel, + } +} + +// drainerPoolSubmit launches the drainer in a managed goroutine. +// Returns an error if the pool has been closed. +// +// Drainers queue when the concurrency cap is reached: the +// goroutine takes a slot on the semaphore and proceeds. The +// caller's ctx only gates the semaphore wait — once the drainer +// is running, it observes the pool's master ctx instead, so +// drainers outlive the caller's (typically setup-only) ctx. +func (p *qwpSfDrainerPool) drainerPoolSubmit(ctx context.Context, d *qwpSfOrphanDrainer) error { + if p.closed.Load() { + return errors.New("qwp/sf: drainer pool closed") + } + p.mu.Lock() + if p.closed.Load() { + p.mu.Unlock() + return errors.New("qwp/sf: drainer pool closed") + } + p.active = append(p.active, d) + p.wg.Add(1) + p.mu.Unlock() + go func() { + defer p.wg.Done() + defer p.removeActive(d) + // Wait for a slot. The caller's ctx unblocks if the user + // gives up on setup; the pool's ctx unblocks on close. 
+ select { + case p.sem <- struct{}{}: + case <-ctx.Done(): + d.outcome.Store(int32(qwpSfDrainOutcomeStopped)) + return + case <-p.ctx.Done(): + d.outcome.Store(int32(qwpSfDrainOutcomeStopped)) + return + } + defer func() { <-p.sem }() + if p.closed.Load() { + d.outcome.Store(int32(qwpSfDrainOutcomeStopped)) + return + } + // Use the pool's ctx so the drainer is detached from the + // caller's setup ctx (its expected lifetime is far longer) + // but is forcibly cancellable when the pool is closing. + d.drainerRun(p.ctx) + }() + return nil +} + +// removeActive unlinks d from the active list when its goroutine +// exits. Called from a defer in drainerPoolSubmit's worker. +func (p *qwpSfDrainerPool) removeActive(d *qwpSfOrphanDrainer) { + p.mu.Lock() + defer p.mu.Unlock() + for i, x := range p.active { + if x == d { + n := len(p.active) + p.active[i] = p.active[n-1] + p.active[n-1] = nil + p.active = p.active[:n-1] + return + } + } +} + +// drainerPoolSnapshot returns a copy of the drainers currently +// running (or queued on the semaphore). Drainers that have run +// to completion are pruned. Useful for status accessors. +func (p *qwpSfDrainerPool) drainerPoolSnapshot() []*qwpSfOrphanDrainer { + p.mu.Lock() + defer p.mu.Unlock() + out := make([]*qwpSfOrphanDrainer, len(p.active)) + copy(out, p.active) + return out +} + +// activeCount returns the number of drainers still tracked as +// running or queued. drainerPoolClose reports it as the count of +// drainers abandoned at the hard-grace boundary. +func (p *qwpSfDrainerPool) activeCount() int { + p.mu.Lock() + defer p.mu.Unlock() + return len(p.active) +} + +// drainerPoolClose stops the pool. Sets closed=true so new submits +// fail; requests a polite stop on every tracked drainer; waits up +// to qwpSfDrainerPoolCloseGrace. 
If any drainer is still alive at +// the grace boundary it is most likely parked in a TCP dial / WS +// upgrade — cancel the master ctx to unwind those blocking calls, +// then wait a further qwpSfDrainerPoolHardCloseGrace. A drainer +// still running past that bound is wedged in I/O the ctx cannot +// reach (engine-open flock / mmap / CRC scan / hung NFS); it is +// abandoned with a logged count rather than hanging close() — its +// slot stays a valid orphan for a future sender. Idempotent. +func (p *qwpSfDrainerPool) drainerPoolClose() { + if !p.closed.CompareAndSwap(false, true) { + return + } + p.mu.Lock() + for _, d := range p.active { + d.drainerRequestStop() + } + p.mu.Unlock() + doneCh := make(chan struct{}) + go func() { + p.wg.Wait() + close(doneCh) + }() + graceTimer := time.NewTimer(qwpSfDrainerPoolCloseGrace) + defer graceTimer.Stop() + select { + case <-doneCh: + // Every drainer exited within the polite grace. + case <-graceTimer.C: + // A drainer outlived the polite grace — most likely parked in + // a TCP dial / WS upgrade. Cancel the master ctx to unwind + // those ctx-aware blocking calls, then wait a bounded second + // grace. + p.cancel() + hardTimer := time.NewTimer(qwpSfDrainerPoolHardCloseGrace) + defer hardTimer.Stop() + select { + case <-doneCh: + // Cancellation unwound the straggler(s). + case <-hardTimer.C: + // A drainer is wedged in I/O the ctx cannot reach + // (engine-open flock / mmap / CRC scan / hung NFS). + // Abandon it: its goroutine lives until the syscall + // returns, but close() must not block on un-cancellable + // I/O. The slot it holds stays a valid orphan a future + // sender re-adopts. Surface the abandoned count for ops. + log.Printf("[WARN] qwp/sf: %d orphan drainer(s) still running %s "+ + "after close; abandoning (wedged in un-cancellable disk I/O). 
"+ + "Their slots remain adoptable on a future sender start.", + p.activeCount(), qwpSfDrainerPoolCloseGrace+qwpSfDrainerPoolHardCloseGrace) + } + } + // Release the master ctx even on the clean-exit path so the + // underlying timer goroutine doesn't linger. + p.cancel() +} diff --git a/qwp_sf_engine.go b/qwp_sf_engine.go new file mode 100644 index 00000000..073e7f52 --- /dev/null +++ b/qwp_sf_engine.go @@ -0,0 +1,631 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "context" + "errors" + "fmt" + "os" + "path/filepath" + "strings" + "sync" + "sync/atomic" + "time" +) + +// qwpSfEngineDefaultAppendDeadline is the default backpressure +// deadline for appendBlocking. Mirrors Java's +// CursorSendEngine.DEFAULT_APPEND_DEADLINE_NANOS = 30s. +const qwpSfEngineDefaultAppendDeadline = 30 * time.Second + +// qwpSfEngineParkInterval is how long appendBlocking sleeps between +// retries while waiting for the manager to free space. Mirrors +// Java's 50µs LockSupport.parkNanos. 
+const qwpSfEngineParkInterval = 50 * time.Microsecond + +// ErrBackpressureTimeout is the sentinel a producer call +// (At / AtNow / Flush / FlushAndGetSequence) wraps when the +// store-and-forward append deadline (WithSfAppendDeadline / +// sf_append_deadline_millis) expires before the cursor engine frees +// space. The wire path is not draining — the server is slow or +// disconnected, or sf_max_total_bytes is too small. Match it with +// errors.Is; the wrapped error carries the deadline and reconnect +// diagnostics in its message. +var ErrBackpressureTimeout = errors.New( + "qwp/sf: cursor ring backpressured — wire path is not draining (server slow / disconnected, or sf_max_total_bytes too small)") + +// qwpSfErrEngineClosed is returned by engineAppendBlocking when the +// engine is closed underneath an in-flight or backpressure-parked +// append. The canonical trigger is a SenderErrorHandler calling +// Close() while the producer is stalled in the backpressure spin on a +// wedged wire (a HALT stops the send loop draining, so ackedFsn never +// advances and the ring stays full). The producer gets this clean +// error instead of dereferencing a segment that engineClose's +// segmentRingClose has just nil'd + munmapped. +// +//lint:ignore ST1012 prefix kept for grouping with other qwpSf* errors +var qwpSfErrEngineClosed = errors.New("qwp/sf: cursor engine closed") + +// qwpSfCursorEngine is the cursor-engine facade that bundles a +// qwpSfSegmentRing with a qwpSfSegmentManager and exposes the +// user-facing API the wire-send loop calls into. Keeps SF append +// work on the user goroutine (where it belongs) and segment +// lifecycle work on the manager goroutine (where it belongs). +// +// Responsibilities: +// - Owning the ring + manager lifecycle (open / close / startup +// recovery). +// - Providing a user-thread append path that handles backpressure. 
+// - Exposing read accessors for the I/O thread: +// enginePublishedFsn, engineActiveSegment, engineSealedSegments. +// - Routing server ACKs to the ring for trim. +// +// Not in scope: +// - Multi-producer support. Single producer (one user goroutine) +// only. +type qwpSfCursorEngine struct { + sfDir string + segmentSizeBytes int64 + + manager *qwpSfSegmentManager + ownsManager bool + slotLock *qwpSfSlotLock + ring *qwpSfSegmentRing + + // watermark is the engine-owned mmap'd .ack-watermark file + // (sf-client.md §5.4). nil in memory mode and when the file + // could not be opened (recovery then falls back to the + // segment-derived lowestBase-1 seed). Lifetime is tied to the + // engine: opened in the constructor after the slot lock is + // acquired, read once to refine the recovery seed, written + // through by the segment manager on every tick where ackedFsn + // advanced, closed in engineClose AFTER the manager (the sole + // writer) is gone. + watermark *qwpSfAckWatermark + + appendDeadline time.Duration + + // recoveredFromDisk is true when the constructor recovered an + // existing on-disk slot rather than starting fresh. Diagnostic + // accessor for tests and observability; cursor frames are + // self-sufficient (every frame carries full schema + full + // symbol-dict delta), so producer-side schema reset on recovery + // is not required at the engine level. + recoveredFromDisk bool + + // backpressureStalls counts how many times appendBlocking + // observed qwpSfBackpressureNoSpare on its first try and had to + // wait. One increment per blocking-call (not per spin). + backpressureStalls atomic.Int64 + + // reconnectStatus is the (optional) snapshot getter wired in by + // the I/O send loop after it is constructed. When nil (e.g. tests + // using the engine standalone) the backpressure-timeout error + // falls back to the loop-agnostic "wire path is not draining" + // wording. 
When non-nil, engineAppendBlocking checks it on + // deadline expiry to distinguish "publishing but slow" from + // "reconnecting" per spec §16, and includes attempt count + + // outage elapsed in the latter case. + reconnectStatus atomic.Pointer[func() (bool, int64, time.Time)] + + // closed is set by engineClose. atomic.Bool so tests / status + // accessors can sample it from any goroutine. + closed atomic.Bool + + // appendMu serializes the producer's ring-append path against + // engineClose's segment teardown. The producer's only entry into + // appendOrFsn is engineAppendBlocking, which takes this lock around + // each ring touch (initial try and every backpressure-spin retry) + // and re-checks closed under it; engineClose holds it across the + // manager + ring teardown. Together they guarantee no append is + // dereferencing the active segment while segmentRingClose nil's and + // munmaps it, and that every append after close observes closed and + // bails with qwpSfErrEngineClosed. Without it a Close() from a + // SenderErrorHandler (running on the dispatcher goroutine) while the + // producer is parked in the backpressure spin tears the segment down + // under the producer — a nil-pointer deref in memory mode, a SIGBUS + // on the munmapped pages in SF mode. Off the per-row hot path: + // appendOrFsn runs once per flush, not per row. + appendMu sync.Mutex +} + +// qwpSfNewCursorEngine creates an engine with a private +// qwpSfSegmentManager (owned by the engine, closed alongside it). +// Pass sfDir = "" for memory-mode (no disk involvement); a non-empty +// sfDir places the engine in store-and-forward mode against that +// slot directory. +// +// Returns an error if the slot lock can't be acquired (another +// process is using the slot), or if recovery encounters an +// inconsistent on-disk state. 
+func qwpSfNewCursorEngine(sfDir string, segmentSizeBytes, maxTotalBytes int64, appendDeadline time.Duration) (*qwpSfCursorEngine, error) { + mgr, err := qwpSfNewSegmentManager(segmentSizeBytes, qwpSfManagerDefaultPoll, maxTotalBytes) + if err != nil { + return nil, err + } + mgr.segmentManagerStart() + e, err := qwpSfNewCursorEngineWithManager(sfDir, segmentSizeBytes, mgr, appendDeadline) + if err != nil { + mgr.segmentManagerClose() + return nil, err + } + e.ownsManager = true + return e, nil +} + +// qwpSfNewCursorEngineWithManager creates an engine that shares the +// given segment manager (must already be started). The caller +// retains ownership of the manager; engineClose will not stop it. +func qwpSfNewCursorEngineWithManager(sfDir string, segmentSizeBytes int64, mgr *qwpSfSegmentManager, appendDeadline time.Duration) (*qwpSfCursorEngine, error) { + if appendDeadline <= 0 { + appendDeadline = qwpSfEngineDefaultAppendDeadline + } + memoryMode := sfDir == "" + var ( + lock *qwpSfSlotLock + ring *qwpSfSegmentRing + watermark *qwpSfAckWatermark + recoveredFromDisk bool + err error + ) + if !memoryMode { + // Acquire the slot lock BEFORE touching any *.sfa files. + // Two engines pointed at the same slot would otherwise race + // on recovery and create overlapping FSN ranges. + lock, err = qwpSfAcquireSlotLock(sfDir) + if err != nil { + return nil, err + } + } + // Release order on any failure mirrors the Java reference: the + // watermark (its own mmap + fd) is dropped before the slot lock, + // so the kernel-held flock outlives every other cleanup. + cleanup := func() { + if watermark != nil { + _ = watermark.close() + } + if lock != nil { + _ = lock.close() + } + } + // Disk mode: try to recover any *.sfa files left behind by a + // prior session before deciding to start fresh. Without this the + // engine would create a new sf-initial.sfa at baseSeq=0, + // overlapping FSNs already on disk and corrupting ACK + // translation, trim, and replay. 
+ if !memoryMode { + ring, err = qwpSfOpenRing(sfDir, segmentSizeBytes) + if err != nil { + cleanup() + return nil, err + } + recoveredFromDisk = ring != nil + if ring != nil { + // Seed ackedFsn to one below the lowest segment's baseSeq. + // We don't know what was actually acked before the prior + // session crashed, but anything trimmed off the ring's + // bottom must have been acked (trim is ack-driven). + // Without this seed, ackedFsn stays at -1 and the I/O + // loop's start-time positioning would walk to FSN 0 — + // which may not exist on disk if earlier segments have + // been trimmed, causing it to fall through to the active + // segment's tip and skip the unacked sealed segments + // entirely. + first := ring.firstSealed() + lowest := int64(0) + if first != nil { + lowest = first.segmentBaseSeq() + } else if a := ring.getActiveSegment(); a != nil { + lowest = a.segmentBaseSeq() + } + baseSeed := lowest - 1 + // Refine the seed with the persisted ack watermark + // (sf-client.md §5.4 / §6.5 / §18.1). It may carry + // durable-acks the previous sender — or another client + // whose orphan slot this drainer adopted — received for + // frames inside the lowest surviving sealed segment. + // Without honouring it those frames get re-replayed on a + // fresh connection, producing row-level duplicates against + // a still-alive server unless the table dedupes. + // + // max(watermark, lowestBase-1) absorbs both orderings of + // the manager's "persist then trim" tick: + // - persist crashed before trim: segments still on disk + // are >= lowest, watermark is correct; max picks it. + // - trim ran before persist: those segments are gone so + // lowestBase is higher, watermark is stale; max picks + // lowestBase-1. + // + // open() returns nil on any setup failure so a missing / + // unmappable file never takes the engine down — we just + // fall back to the bare lowestBase-1 seed. 
+ watermark = qwpSfAckWatermarkOpen(sfDir) + watermarkFsn := watermark.read() // nil-safe → INVALID + candidate := baseSeed + if watermarkFsn > candidate { + candidate = watermarkFsn + } + // Reject a watermark past publishedFsn: a correctly + // operating prior session cannot produce one, so an + // excess value is corruption (torn write on a non-atomic + // FS, bit-rot, manual edit). Trusting it would seed + // ackedFsn = publishedFsn after the ring's own clamp and + // position the cursor past every un-acked frame — silent + // loss of the un-acked tail. Fall back to the + // segment-derived seed so that tail still replays. + seed := candidate + if seed > ring.segmentRingPublishedFsn() { + seed = baseSeed + } + if seed >= 0 { + ring.acknowledge(seed) + } + } + } + if ring == nil { + var initial *qwpSfSegment + var initialPath string + if memoryMode { + initial, err = qwpSfCreateInMemorySegment(0, segmentSizeBytes) + } else { + // Fresh disk slot: any stale watermark refers to a + // fully-drained lifecycle now gone. Unlink it before + // opening so the new session's first read() correctly + // reports INVALID (magic=0 on a freshly zero-filled + // file) rather than honouring an FSN with no segments + // behind it. 
+ qwpSfAckWatermarkRemoveOrphan(sfDir) + watermark = qwpSfAckWatermarkOpen(sfDir) + initialPath = filepath.Join(sfDir, "sf-initial.sfa") + initial, err = qwpSfCreateSegment(initialPath, 0, segmentSizeBytes) + } + if err != nil { + cleanup() + return nil, err + } + ring = qwpSfNewSegmentRing(initial, segmentSizeBytes) + } + if err := mgr.segmentManagerRegisterWithWatermark(ring, sfDir, watermark); err != nil { + _ = ring.segmentRingClose() + cleanup() + return nil, err + } + e := &qwpSfCursorEngine{ + sfDir: sfDir, + segmentSizeBytes: segmentSizeBytes, + manager: mgr, + ownsManager: false, + slotLock: lock, + ring: ring, + watermark: watermark, + appendDeadline: appendDeadline, + recoveredFromDisk: recoveredFromDisk, + } + return e, nil +} + +// engineAcknowledge records a server ACK for cumulative FSN seq. +// Triggers background trim of any sealed segments whose every frame +// is now acknowledged. Idempotent and monotonic. +func (e *qwpSfCursorEngine) engineAcknowledge(seq int64) { + e.ring.acknowledge(seq) +} + +// engineAckedFsn returns the highest FSN safe to send. +func (e *qwpSfCursorEngine) engineAckedFsn() int64 { + return e.ring.segmentRingAckedFsn() +} + +// engineAckNotify returns a channel closed the next time ackedFsn +// advances. Lets AwaitAckedFsn block until a server ACK lands instead +// of polling. See qwpSfSegmentRing.segmentRingAckNotify for the +// subscribe-then-sample ordering callers must follow. +func (e *qwpSfCursorEngine) engineAckNotify() <-chan struct{} { + return e.ring.segmentRingAckNotify() +} + +// engineActiveSegment returns the current active mmap'd segment. +// I/O thread accessor. +func (e *qwpSfCursorEngine) engineActiveSegment() *qwpSfSegment { + return e.ring.getActiveSegment() +} + +// engineSfDir returns the slot directory ("" for memory-mode). 
+func (e *qwpSfCursorEngine) engineSfDir() string {
+	return e.sfDir
+}
+
+// engineMaxFrameBytes returns the largest frame payload a single
+// segment can hold: the segment size minus the file header and the
+// per-frame header. A payload above this can never be appended —
+// appendOrFsn returns qwpSfPayloadTooLarge for it even against a
+// freshly-rotated spare — so the producer uses this bound to (a)
+// clamp its byte-size auto-flush trigger and (b) drop, rather than
+// retain, an oversize batch at the flush boundary. Kept here so it
+// tracks the segment header layout automatically and cannot drift
+// from what tryAppend actually enforces.
+//
+// The result is a plain subtraction with no floor: it is zero or
+// negative whenever segmentSizeBytes does not exceed the two headers
+// combined. NOTE(review): presumably the constructor rejects such
+// sizes — confirm before relying on a positive value.
+func (e *qwpSfCursorEngine) engineMaxFrameBytes() int64 {
+	return e.segmentSizeBytes - qwpSfHeaderSize - qwpSfFrameHeaderSize
+}
+
+// engineWasRecoveredFromDisk reports whether the engine opened
+// against a pre-existing on-disk slot. Memory-mode engines and
+// fresh-disk engines return false. The value is the
+// recoveredFromDisk field captured at construction.
+func (e *qwpSfCursorEngine) engineWasRecoveredFromDisk() bool {
+	return e.recoveredFromDisk
+}
+
+// enginePublishedFsn returns the highest FSN whose frame is fully
+// written and visible to consumers (the I/O thread). -1 when nothing
+// has been appended yet. Delegates to the ring's
+// segmentRingPublishedFsn.
+func (e *qwpSfCursorEngine) enginePublishedFsn() int64 {
+	return e.ring.segmentRingPublishedFsn()
+}
+
+// engineNextSealedAfter walks one step forward in the sealed list.
+// Delegates to the ring's nextSealedAfter. NOTE(review): the
+// end-of-list result (presumably nil) is defined by the ring —
+// confirm there.
+func (e *qwpSfCursorEngine) engineNextSealedAfter(current *qwpSfSegment) *qwpSfSegment {
+	return e.ring.nextSealedAfter(current)
+}
+
+// engineFirstSealed returns the oldest sealed segment, or nil.
+// Delegates to the ring's firstSealed.
+func (e *qwpSfCursorEngine) engineFirstSealed() *qwpSfSegment {
+	return e.ring.firstSealed()
+}
+
+// engineFindSegmentContaining returns the segment whose published
+// frame range covers fsn, or nil. Used by the reconnect path to
+// position the I/O thread's cursor at the first unacked frame.
+func (e *qwpSfCursorEngine) engineFindSegmentContaining(fsn int64) *qwpSfSegment {
+	return e.ring.findSegmentContaining(fsn)
+}
+
+// engineAppendBlocking publishes payload into the ring and returns
+// the FSN assigned to it. When the ring has no room (memory/disk cap
+// reached) the call parks in qwpSfEngineParkInterval steps, retrying
+// after each park, until space appears or e.appendDeadline elapses.
+//
+// ctx is checked once up front and again on every park: as soon as
+// it is cancelled or its deadline passes the call returns ctx.Err(),
+// so a caller-supplied deadline tighter than e.appendDeadline wins.
+//
+// Outcomes:
+//   - (fsn, nil) once the frame is published.
+//   - qwpSfErrEngineClosed when the engine was closed before or
+//     while the call was waiting.
+//   - qwpSfErrPayloadTooLarge when the ring reports the payload can
+//     never fit.
+//   - the formatBackpressureTimeout error when the deadline expires;
+//     its text tells a wedged wire path apart from a slow one.
+//
+// engineTotalBackpressureStalls() grows by exactly one for each call
+// that had to wait, however many parks it took.
+func (e *qwpSfCursorEngine) engineAppendBlocking(ctx context.Context, payload []byte) (int64, error) {
+	if err := ctx.Err(); err != nil {
+		return 0, err
+	}
+	// Fast path: a single attempt with no timer and no stall record.
+	fsn, closed := e.tryAppendOrFsn(payload)
+	switch {
+	case closed:
+		return 0, qwpSfErrEngineClosed
+	case fsn >= 0:
+		return fsn, nil
+	case fsn == qwpSfPayloadTooLarge:
+		return 0, qwpSfErrPayloadTooLarge
+	}
+	// Slow path. Count the stall once for the whole call, then start
+	// the deadline clock from this first miss.
+	e.backpressureStalls.Add(1)
+	giveUpAt := time.Now().Add(e.appendDeadline)
+	park := time.NewTimer(qwpSfEngineParkInterval)
+	defer park.Stop()
+	for !time.Now().After(giveUpAt) {
+		select {
+		case <-ctx.Done():
+			return 0, ctx.Err()
+		case <-park.C:
+		}
+		// The timer has fired and been drained; re-arm it for the next
+		// round before retrying.
+		park.Reset(qwpSfEngineParkInterval)
+		fsn, closed = e.tryAppendOrFsn(payload)
+		switch {
+		case closed:
+			return 0, qwpSfErrEngineClosed
+		case fsn >= 0:
+			return fsn, nil
+		case fsn == qwpSfPayloadTooLarge:
+			return 0, qwpSfErrPayloadTooLarge
+		}
+	}
+	return 0, e.formatBackpressureTimeout()
+}
+
+// tryAppendOrFsn performs a single ring.appendOrFsn while holding
+// appendMu. The closed flag is re-read under the lock, so a
+// concurrent engineClose cannot tear the active segment down in the
+// middle of an append. It yields (result, false) carrying whatever
+// appendOrFsn returned (an FSN or one of its sentinels), or (0, true)
+// once the engine is closed — engineAppendBlocking maps that to
+// qwpSfErrEngineClosed so a parked producer unwinds instead of
+// touching a nil'd / munmapped segment. The lock covers only the
+// ring call; parking happens with it released, so engineClose waits
+// for at most one in-flight append.
+func (e *qwpSfCursorEngine) tryAppendOrFsn(payload []byte) (fsn int64, closed bool) {
+	e.appendMu.Lock()
+	defer e.appendMu.Unlock()
+	if !e.closed.Load() {
+		return e.ring.appendOrFsn(payload), false
+	}
+	return 0, true
+}
+
+// engineTotalBackpressureStalls reports how many engineAppendBlocking
+// calls so far had to wait for space. Each waiting call contributes
+// one, regardless of how many times it parked.
+func (e *qwpSfCursorEngine) engineTotalBackpressureStalls() int64 {
+	return e.backpressureStalls.Load()
+}
+
+// engineSetReconnectStatusGetter wires a snapshot accessor that
+// reports whether the I/O loop is currently inside its
+// reconnect-with-backoff phase. Called once by the QWP sender
+// constructor right after the send loop is created.
Pass nil to
+// detach (used by tests that tear down the loop independently).
+//
+// The getter is invoked only on the deadline-expiry path of
+// engineAppendBlocking, so the cost is paid only on a true
+// backpressure timeout — never on the steady-state hot path.
+func (e *qwpSfCursorEngine) engineSetReconnectStatusGetter(getter func() (bool, int64, time.Time)) {
+	// A nil getter is published as a nil pointer, which readers
+	// treat as "no getter wired".
+	var slot *func() (bool, int64, time.Time)
+	if getter != nil {
+		slot = &getter
+	}
+	e.reconnectStatus.Store(slot)
+}
+
+// engineSetSendLoopWakeup installs fn as the producer→send-loop
+// doorbell by handing it to the ring. Per the ring's contract
+// appendOrFsn calls fn after every publish, so an idle send loop
+// wakes at once rather than on its next parkInterval poll. Called
+// once by qwpSfNewSendLoop before producing starts.
+func (e *qwpSfCursorEngine) engineSetSendLoopWakeup(fn func()) {
+	e.ring.setSendLoopWakeup(fn)
+}
+
+// formatBackpressureTimeout builds the error engineAppendBlocking
+// returns when its deadline expires (the LineSenderException
+// equivalent). It always wraps ErrBackpressureTimeout. Per spec §16
+// the text MUST tell "publishing but slow" apart from
+// "reconnecting": when a getter is wired and reports an ongoing
+// reconnect, the message carries the per-outage attempt count, the
+// elapsed outage time and the wall-clock outage start; in every other
+// case it uses the slow-publish wording.
+func (e *qwpSfCursorEngine) formatBackpressureTimeout() error {
+	var (
+		reconnecting bool
+		attempts     int64
+		outageStart  time.Time
+	)
+	if getter := e.reconnectStatus.Load(); getter != nil {
+		reconnecting, attempts, outageStart = (*getter)()
+	}
+	if !reconnecting {
+		return fmt.Errorf("%w (deadline %s, wire publishing but slow)", ErrBackpressureTimeout, e.appendDeadline)
+	}
+	outageElapsed := time.Since(outageStart).Round(time.Millisecond)
+	return fmt.Errorf("%w (deadline %s, reconnecting: attempts=%d, outage-elapsed=%s, outage-start=%s)",
+		ErrBackpressureTimeout,
+		e.appendDeadline,
+		attempts,
+		outageElapsed,
+		outageStart.Format(time.RFC3339Nano))
+}
+
+// engineClose tears down the engine. Drains residual on-disk
+// segment files when the ring confirms every published FSN has been
+// acked — at that moment the slot has no recoverable work and the
+// files are pure noise that would mislead the next sender's
+// recovery.
Best-effort: logs (via returned error) and continues on
+// failures, since we're already on the close path.
+//
+// Order: deregister the ring from the manager (so no new spares
+// arrive), close the manager if we own it, close the ring (closes
+// its segments), close the ack-watermark mmap AFTER the manager (its
+// sole writer) is gone, unlink residual files + the now-meaningless
+// watermark if fully drained, release the slot lock LAST (so the
+// kernel-held flock outlives any other cleanup work).
+func (e *qwpSfCursorEngine) engineClose() error {
+	if swapped := e.closed.CompareAndSwap(false, true); !swapped {
+		// Someone else already closed (or is closing) the engine.
+		return nil
+	}
+	// Take appendMu for the whole manager + ring teardown. With
+	// closed already true, any tryAppendOrFsn that gets the lock after
+	// us returns before touching the ring, and acquiring it here waits
+	// out an append that is in flight right now. Holding it across
+	// segmentRingClose means the active segment is nil'd + munmapped
+	// while no producer can dereference it (C3: a
+	// SenderErrorHandler's Close() racing a producer parked in
+	// engineAppendBlocking's backpressure spin). The manager goroutine
+	// never takes appendMu, so joining it under the lock cannot
+	// deadlock.
+	e.appendMu.Lock()
+	defer e.appendMu.Unlock()
+
+	// Sample the drain state while the ring is still open — its
+	// accessors are not safe after segmentRingClose. drainTrimmable
+	// only ever removes sealed segments, never the active one, so a
+	// fully-acked slot still has .sfa files that must be unlinked
+	// below.
+	fullyDrained := false
+	if e.sfDir != "" {
+		if e.ring.segmentRingPublishedFsn() < 0 {
+			fullyDrained = true
+		} else {
+			fullyDrained = e.ring.segmentRingAckedFsn() >= e.ring.segmentRingPublishedFsn()
+		}
+	}
+
+	// Teardown is best-effort: every step runs, and only the first
+	// failure is reported.
+	var firstErr error
+	keepFirst := func(err error) {
+		if err != nil && firstErr == nil {
+			firstErr = err
+		}
+	}
+
+	e.manager.segmentManagerDeregister(e.ring)
+	if e.ownsManager {
+		e.manager.segmentManagerClose()
+	}
+	keepFirst(e.ring.segmentRingClose())
+
+	// The watermark mmap/fd goes after the manager (the only writer
+	// through it) and before the slot lock. When ownsManager is set,
+	// segmentManagerClose above has joined the worker goroutine, so no
+	// persistIfAdvanced can race this close; for the shared-manager
+	// (test-only) case the watermark's own mutex provides the cover.
+	if e.watermark != nil {
+		keepFirst(e.watermark.close())
+	}
+	if fullyDrained {
+		keepFirst(qwpSfUnlinkAllSegmentFiles(e.sfDir))
+		// With no segments left behind it, a surviving watermark could
+		// only mislead the next session's recovery seed. Remove it, in
+		// line with the .sfa unlink and the fresh-slot removeOrphan.
+		qwpSfAckWatermarkRemoveOrphan(e.sfDir)
+	}
+	if e.slotLock != nil {
+		keepFirst(e.slotLock.close())
+	}
+	return firstErr
+}
+
+// qwpSfUnlinkAllSegmentFiles unlinks every .sfa file under dir.
+// Called only on clean shutdown when the ring confirms every
+// published FSN has been acked. Best-effort: returns the first error
+// encountered but continues iterating.
+func qwpSfUnlinkAllSegmentFiles(dir string) error {
+	// A single ReadDir both lists the directory and detects a missing
+	// one. A separate Stat beforehand would add a syscall and a window
+	// in which the directory could disappear between the two calls.
+	entries, err := os.ReadDir(dir)
+	if err != nil {
+		if os.IsNotExist(err) {
+			// No directory means no segment files: nothing to do.
+			return nil
+		}
+		return err
+	}
+	var firstErr error
+	for _, e := range entries {
+		if !strings.HasSuffix(e.Name(), ".sfa") {
+			continue
+		}
+		path := filepath.Join(dir, e.Name())
+		// A file that vanished between the listing and the unlink is
+		// already in the desired state, so it is not reported. Any
+		// other failure is remembered (first one wins) and the sweep
+		// continues.
+		if rmErr := os.Remove(path); rmErr != nil && !os.IsNotExist(rmErr) && firstErr == nil {
+			firstErr = rmErr
+		}
+	}
+	return firstErr
+}
diff --git a/qwp_sf_engine_test.go b/qwp_sf_engine_test.go
new file mode 100644
index 00000000..b3fb3156
--- /dev/null
+++ b/qwp_sf_engine_test.go
@@ -0,0 +1,232 @@
+/*+*****************************************************************************
+ * ___ _ ____ ____
+ * / _ \ _ _ ___ ___| |_| _ \| __ )
+ * | | | | | | |/ _ \/ __| __| | | | _ \
+ * | |_| | |_| | __/\__ \ |_| |_| | |_) |
+ * \__\_\\__,_|\___||___/\__|____/|____/
+ *
+ * Copyright (c) 2014-2019 Appsicle
+ * Copyright (c) 2019-2026 QuestDB
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ ******************************************************************************/
+
+package questdb
+
+import (
+	"context"
+	"errors"
+	"os"
+	"path/filepath"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestQwpSfEngineMemoryModeAppend appends five frames to a
+// memory-mode engine (empty sfDir) and checks that FSNs are assigned
+// 0..4 in order and that the engine reports no disk recovery and an
+// empty slot directory.
+func TestQwpSfEngineMemoryModeAppend(t *testing.T) {
+	e, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second)
+	require.NoError(t, err)
+	defer func() { _ = e.engineClose() }()
+
+	for i := int64(0); i < 5; i++ {
+		fsn, err := e.engineAppendBlocking(context.Background(), []byte("frame"))
+		require.NoError(t, err)
+		assert.Equal(t, i, fsn)
+	}
+	assert.Equal(t, int64(4), e.enginePublishedFsn())
+	assert.False(t, e.engineWasRecoveredFromDisk())
+	assert.Equal(t, "", e.engineSfDir())
+}
+
+// TestQwpSfEngineDiskModeWritesAndRecovers writes five un-acked
+// frames through a disk-backed engine, closes it, and checks that a
+// second engine opened on the same directory reports recovery with
+// the same published FSN.
+func TestQwpSfEngineDiskModeWritesAndRecovers(t *testing.T) {
+	dir := t.TempDir()
+	const segSize int64 = 4096
+
+	{
+		e, err := qwpSfNewCursorEngine(dir, segSize, qwpSfUnlimitedTotalBytes, time.Second)
+		require.NoError(t, err)
+		assert.False(t, e.engineWasRecoveredFromDisk())
+
+		for i := 0; i < 5; i++ {
+			_, err := e.engineAppendBlocking(context.Background(), []byte{byte(i), byte(i + 1)})
+			require.NoError(t, err)
+		}
+		assert.Equal(t, int64(4), e.enginePublishedFsn())
+		require.NoError(t, e.engineClose())
+	}
+
+	// Files should still be on disk (no ACKs were processed).
+	entries, err := os.ReadDir(dir)
+	require.NoError(t, err)
+	sfaCount := 0
+	for _, en := range entries {
+		if filepath.Ext(en.Name()) == ".sfa" {
+			sfaCount++
+		}
+	}
+	assert.GreaterOrEqual(t, sfaCount, 1)
+
+	// Recover.
+	e2, err := qwpSfNewCursorEngine(dir, segSize, qwpSfUnlimitedTotalBytes, time.Second)
+	require.NoError(t, err)
+	defer func() { _ = e2.engineClose() }()
+	assert.True(t, e2.engineWasRecoveredFromDisk())
+	// publishedFsn must still be 4 (5 frames were written).
+	assert.Equal(t, int64(4), e2.enginePublishedFsn())
+}
+
+// TestQwpSfEngineSlotLockBlocksDouble checks that a second engine
+// cannot open a slot directory while the first engine still holds it.
+func TestQwpSfEngineSlotLockBlocksDouble(t *testing.T) {
+	dir := t.TempDir()
+	e1, err := qwpSfNewCursorEngine(dir, 4096, qwpSfUnlimitedTotalBytes, time.Second)
+	require.NoError(t, err)
+	defer func() { _ = e1.engineClose() }()
+
+	_, err = qwpSfNewCursorEngine(dir, 4096, qwpSfUnlimitedTotalBytes, time.Second)
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "slot already in use")
+}
+
+// TestQwpSfEngineFullDrainUnlinksFiles ACKs every appended frame and
+// checks that engineClose leaves no .sfa file in the slot directory.
+func TestQwpSfEngineFullDrainUnlinksFiles(t *testing.T) {
+	dir := t.TempDir()
+	const segSize int64 = 4096
+	e, err := qwpSfNewCursorEngine(dir, segSize, qwpSfUnlimitedTotalBytes, time.Second)
+	require.NoError(t, err)
+
+	for i := 0; i < 3; i++ {
+		fsn, err := e.engineAppendBlocking(context.Background(), []byte("hi"))
+		require.NoError(t, err)
+		// Immediately ACK each frame so the ring fully drains.
+		e.engineAcknowledge(fsn)
+	}
+	require.NoError(t, e.engineClose())
+
+	// On full drain, the engine unlinks residual .sfa files. Allow
+	// for a small window where the manager hasn't yet seen the trim;
+	// engineClose itself unlinks anything still on disk.
+	entries, err := os.ReadDir(dir)
+	require.NoError(t, err)
+	for _, en := range entries {
+		assert.NotEqual(t, ".sfa", filepath.Ext(en.Name()),
+			"unexpected leftover segment file %s", en.Name())
+	}
+}
+
+// TestQwpSfEngineBackpressureTimeout fills a single capped segment
+// and checks that the next append fails with ErrBackpressureTimeout
+// after roughly the configured deadline, bumps the stall counter, and
+// uses the slow-publish wording when no reconnect getter is wired.
+func TestQwpSfEngineBackpressureTimeout(t *testing.T) {
+	const segSize int64 = 96 // 24 header + 72 payload region
+	// Cap at one segment so the manager never provisions a spare:
+	// after the active fills, every append blocks until the deadline.
+	e, err := qwpSfNewCursorEngine("", segSize, segSize, 50*time.Millisecond)
+	require.NoError(t, err)
+	defer func() { _ = e.engineClose() }()
+
+	// Fill the active until the next append blocks. capacity = 96-24
+	// = 72; each frame uses 8+16 = 24, so 3 frames fit.
+	for i := 0; i < 3; i++ {
+		_, err := e.engineAppendBlocking(context.Background(), make([]byte, 16))
+		require.NoError(t, err, "iteration %d", i)
+	}
+	// The next append must time out.
+	start := time.Now()
+	_, err = e.engineAppendBlocking(context.Background(), make([]byte, 16))
+	elapsed := time.Since(start)
+	require.Error(t, err)
+	assert.True(t, errors.Is(err, ErrBackpressureTimeout))
+	assert.GreaterOrEqual(t, elapsed, 40*time.Millisecond)
+	// Backpressure stall counter incremented.
+	assert.GreaterOrEqual(t, e.engineTotalBackpressureStalls(), int64(1))
+	// Spec §16: with no loop wired (or loop reports "not
+	// reconnecting"), the message must say "publishing but slow".
+	assert.Contains(t, err.Error(), "wire publishing but slow")
+}
+
+// Spec §16 mandates the backpressure-timeout error distinguish
+// "publishing but slow" from "reconnecting", and the reconnecting
+// variant must include attempt count and outage start.
+func TestQwpSfEngineBackpressureTimeoutReconnecting(t *testing.T) {
+	const segSize int64 = 96
+	e, err := qwpSfNewCursorEngine("", segSize, segSize, 50*time.Millisecond)
+	require.NoError(t, err)
+	defer func() { _ = e.engineClose() }()
+
+	outageStart := time.Now().Add(-3 * time.Second)
+	e.engineSetReconnectStatusGetter(func() (bool, int64, time.Time) {
+		return true, 7, outageStart
+	})
+
+	for i := 0; i < 3; i++ {
+		_, err := e.engineAppendBlocking(context.Background(), make([]byte, 16))
+		require.NoError(t, err, "iteration %d", i)
+	}
+	_, err = e.engineAppendBlocking(context.Background(), make([]byte, 16))
+	require.Error(t, err)
+	assert.True(t, errors.Is(err, ErrBackpressureTimeout))
+	msg := err.Error()
+	assert.Contains(t, msg, "reconnecting")
+	assert.Contains(t, msg, "attempts=7")
+	assert.Contains(t, msg, "outage-elapsed=")
+	assert.Contains(t, msg, "outage-start=")
+
+	// After the loop reports "no longer reconnecting", the next
+	// timeout falls back to the slow-publish wording.
+	e.engineSetReconnectStatusGetter(func() (bool, int64, time.Time) {
+		return false, 0, time.Time{}
+	})
+	_, err = e.engineAppendBlocking(context.Background(), make([]byte, 16))
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "wire publishing but slow")
+	assert.NotContains(t, err.Error(), "reconnecting")
+
+	// Detaching the getter (nil) is also valid — same fallback wording.
+	e.engineSetReconnectStatusGetter(nil)
+	_, err = e.engineAppendBlocking(context.Background(), make([]byte, 16))
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "wire publishing but slow")
+}
+
+// TestQwpSfEnginePayloadTooLarge checks that a payload as large as
+// the whole segment is rejected with qwpSfErrPayloadTooLarge.
+func TestQwpSfEnginePayloadTooLarge(t *testing.T) {
+	const segSize int64 = 256
+	e, err := qwpSfNewCursorEngine("", segSize, segSize*4, time.Second)
+	require.NoError(t, err)
+	defer func() { _ = e.engineClose() }()
+
+	huge := make([]byte, segSize) // payload alone fills the segment, so payload + headers can never fit
+	_, err = e.engineAppendBlocking(context.Background(), huge)
+	require.Error(t, err)
+	assert.True(t, errors.Is(err, qwpSfErrPayloadTooLarge))
+}
+
+// TestQwpSfEngineSharedManager runs two memory-mode engines against
+// one externally owned segment manager and checks that both can
+// append and close cleanly.
+func TestQwpSfEngineSharedManager(t *testing.T) {
+	mgr, err := qwpSfNewSegmentManager(4096, 100*time.Microsecond, qwpSfUnlimitedTotalBytes)
+	require.NoError(t, err)
+	mgr.segmentManagerStart()
+	defer mgr.segmentManagerClose()
+
+	e1, err := qwpSfNewCursorEngineWithManager("", 4096, mgr, time.Second)
+	require.NoError(t, err)
+	e2, err := qwpSfNewCursorEngineWithManager("", 4096, mgr, time.Second)
+	require.NoError(t, err)
+
+	// Both engines should be able to append and have the manager
+	// supply spares to both rings.
+	for i := 0; i < 3; i++ {
+		_, err := e1.engineAppendBlocking(context.Background(), []byte("a"))
+		require.NoError(t, err)
+		_, err = e2.engineAppendBlocking(context.Background(), []byte("b"))
+		require.NoError(t, err)
+	}
+	require.NoError(t, e1.engineClose())
+	require.NoError(t, e2.engineClose())
+}
diff --git a/qwp_sf_fallocate_darwin.go b/qwp_sf_fallocate_darwin.go
new file mode 100644
index 00000000..c8fe470f
--- /dev/null
+++ b/qwp_sf_fallocate_darwin.go
@@ -0,0 +1,90 @@
+/*+*****************************************************************************
+ * ___ _ ____ ____
+ * / _ \ _ _ ___ ___| |_| _ \| __ )
+ * | | | | | | |/ _ \/ __| __| | | | _ \
+ * | |_| | |_| | __/\__ \ |_| |_| | |_) |
+ * \__\_\\__,_|\___||___/\__|____/|____/
+ *
+ * Copyright (c) 2014-2019 Appsicle
+ * Copyright (c) 2019-2026 QuestDB
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ ******************************************************************************/
+
+//go:build darwin
+
+package questdb
+
+import (
+	"errors"
+	"fmt"
+	"os"
+
+	"golang.org/x/sys/unix"
+)
+
+// qwpSfReserveNewBlocks reserves real disk blocks for f's range
+// [currentSize, currentSize+newBytes) via fcntl(F_PREALLOCATE), in
+// two phases — matching the Java reference's native allocate on macOS:
+//
+//  1. F_ALLOCATECONTIG | F_ALLOCATEALL: try for a single contiguous
+//     extent first.
Best for mmap streaming and least fragmentation,
+//     but can fail on a fragmented APFS even when free space is
+//     plentiful.
+//  2. On any failure, retry with just F_ALLOCATEALL (relaxed
+//     contiguity, still all-or-nothing). This is the path that
+//     surfaces ENOSPC.
+//  3. Only when the second attempt fails with ENOTSUP / EOPNOTSUPP do
+//     we accept a sparse fallback — those errnos indicate the
+//     filesystem doesn't implement F_PREALLOCATE at all (SMB,
+//     certain network mounts). Every other failure (notably ENOSPC,
+//     EFBIG, EIO) surfaces so the caller doesn't end up mmap'ing a
+//     sparse file that will SIGBUS on first write past the
+//     actually-allocated region.
+//
+// F_PEOFPOSMODE positions the allocation immediately after EOF, so
+// the caller MUST ensure f's EOF is at currentSize before invoking
+// this. qwpSfAllocate guarantees that by fstat'ing first; direct
+// callers must do the same. F_PREALLOCATE does NOT advance EOF —
+// qwpSfAllocate's ftruncate follow-up handles that.
+//
+// The currentSize parameter isn't needed by F_PREALLOCATE itself
+// (F_PEOFPOSMODE is implicit-from-EOF), but it's kept on the
+// signature for cross-platform symmetry and surfaces in error
+// messages.
+func qwpSfReserveNewBlocks(f *os.File, currentSize, newBytes int64) error {
+	fd := f.Fd()
+	req := unix.Fstore_t{
+		Flags:   unix.F_ALLOCATECONTIG | unix.F_ALLOCATEALL,
+		Posmode: unix.F_PEOFPOSMODE,
+		Offset:  0,
+		Length:  newBytes,
+	}
+	// Phase 1: one contiguous extent. Whatever error this returns is
+	// deliberately dropped, because phase 2 is the authoritative
+	// attempt.
+	if unix.FcntlFstore(fd, unix.F_PREALLOCATE, &req) == nil {
+		return nil
+	}
+	// Phase 2: give up contiguity but stay all-or-nothing. A genuine
+	// shortage of free space shows up here as ENOSPC. Bytesalloc is
+	// cleared so the retry starts from a clean request.
+	req.Flags = unix.F_ALLOCATEALL
+	req.Bytesalloc = 0
+	err := unix.FcntlFstore(fd, unix.F_PREALLOCATE, &req)
+	switch {
+	case err == nil:
+		return nil
+	case errors.Is(err, unix.EOPNOTSUPP), errors.Is(err, unix.ENOTSUP):
+		// The filesystem has no F_PREALLOCATE: accept a sparse file.
+		return nil
+	}
+	return fmt.Errorf("qwp/sf: F_PREALLOCATE %s offset=%d len=%d: %w",
+		f.Name(), currentSize, newBytes, err)
+}
diff --git a/qwp_sf_fallocate_linux.go b/qwp_sf_fallocate_linux.go
new file mode 100644
index 00000000..d0337799
--- /dev/null
+++ b/qwp_sf_fallocate_linux.go
@@ -0,0 +1,73 @@
+/*+*****************************************************************************
+ * ___ _ ____ ____
+ * / _ \ _ _ ___ ___| |_| _ \| __ )
+ * | | | | | | |/ _ \/ __| __| | | | _ \
+ * | |_| | |_| | __/\__ \ |_| |_| | |_) |
+ * \__\_\\__,_|\___||___/\__|____/|____/
+ *
+ * Copyright (c) 2014-2019 Appsicle
+ * Copyright (c) 2019-2026 QuestDB
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ ******************************************************************************/
+
+//go:build linux
+
+package questdb
+
+import (
+	"errors"
+	"fmt"
+	"os"
+
+	"golang.org/x/sys/unix"
+)
+
+// qwpSfReserveNewBlocks reserves real disk blocks for f's range
+// [currentSize, currentSize+newBytes) via the fallocate(2) syscall
+// with mode 0 — the kernel path glibc's posix_fallocate funnels into
+// when the filesystem supports it.
Caller-side contract (never shrinks,
+// short-circuit, post-truncate) is owned by qwpSfAllocate; this helper
+// is single-concern.
+//
+// Anchoring the reservation at currentSize matches macOS's
+// F_PEOFPOSMODE so the two POSIX platforms agree on what gets
+// reserved (the newly-extended range only); existing sparse holes in
+// [0, currentSize) are not touched.
+//
+// The errno tolerance list (EOPNOTSUPP / ENOTSUP, EINVAL) matches the
+// Java reference's posix_fallocate path: those errnos indicate the
+// filesystem cannot reserve, and the spec authorises a sparse
+// fallback. All other errnos (notably ENOSPC, EFBIG, EIO) surface as
+// errors so the caller doesn't end up mmap'ing a sparse file that
+// will SIGBUS on first write past the actually-allocated region.
+//
+// Unlike Java's posix_fallocate (which has glibc's userspace
+// zero-write fallback baked in for kernels missing the fallocate
+// syscall), this is the raw syscall — ENOSYS on a pre-2.6.23 kernel
+// would surface here. Modern targets are unaffected.
+func qwpSfReserveNewBlocks(f *os.File, currentSize, newBytes int64) error {
+	err := unix.Fallocate(int(f.Fd()), 0, currentSize, newBytes)
+	switch {
+	case err == nil:
+		return nil
+	case errors.Is(err, unix.EOPNOTSUPP), errors.Is(err, unix.ENOTSUP), errors.Is(err, unix.EINVAL):
+		// The filesystem cannot reserve blocks: take the sparse
+		// fallback. On Linux EOPNOTSUPP and ENOTSUP carry the same
+		// number, but the unix package exports both names, so both are
+		// checked in case that ever stops being true.
+		return nil
+	default:
+		return fmt.Errorf("qwp/sf: fallocate %s offset=%d len=%d: %w",
+			f.Name(), currentSize, newBytes, err)
+	}
+}
diff --git a/qwp_sf_fallocate_unix_other.go b/qwp_sf_fallocate_unix_other.go
new file mode 100644
index 00000000..c17f4ff6
--- /dev/null
+++ b/qwp_sf_fallocate_unix_other.go
@@ -0,0 +1,44 @@
+/*+*****************************************************************************
+ * ___ _ ____ ____
+ * / _ \ _ _ ___ ___| |_| _ \| __ )
+ * | | | | | | |/ _ \/ __| __| | | | _ \
+ * | |_| | |_| | __/\__ \ |_| |_| | |_) |
+ * \__\_\\__,_|\___||___/\__|____/|____/
+ *
+ * Copyright (c) 2014-2019 Appsicle
+ * Copyright (c) 2019-2026 QuestDB
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ ******************************************************************************/
+
+//go:build unix && !linux && !darwin
+
+package questdb
+
+import "os"
+
+// qwpSfReserveNewBlocks is a no-op on unix variants without a
+// block-reservation syscall wired into golang.org/x/sys/unix here
+// (BSDs, Solaris, AIX, illumos). qwpSfAllocate's ftruncate step still
+// extends the file to the new logical size, so the call returns
+// success as if the spec's sparse-fallback path were taken — blocks
+// remain sparse, SIGBUS risk per sf-client.md §6 applies. Operators
+// on these targets must size sf_max_bytes conservatively against
+// free space.
+//
+// Add a platform-specific implementation here if QuestDB Go ever
+// supports one of these targets in production.
+func qwpSfReserveNewBlocks(f *os.File, currentSize, newBytes int64) error {
+	// Deliberate no-op: all three parameters are ignored and success
+	// is reported unconditionally.
+	return nil
+}
diff --git a/qwp_sf_fallocate_unix_test.go b/qwp_sf_fallocate_unix_test.go
new file mode 100644
index 00000000..add2610a
--- /dev/null
+++ b/qwp_sf_fallocate_unix_test.go
@@ -0,0 +1,73 @@
+/*+*****************************************************************************
+ * ___ _ ____ ____
+ * / _ \ _ _ ___ ___| |_| _ \| __ )
+ * | | | | | | |/ _ \/ __| __| | | | _ \
+ * | |_| | |_| | __/\__ \ |_| |_| | |_) |
+ * \__\_\\__,_|\___||___/\__|____/|____/
+ *
+ * Copyright (c) 2014-2019 Appsicle
+ * Copyright (c) 2019-2026 QuestDB
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ ******************************************************************************/
+
+//go:build linux || darwin
+
+package questdb
+
+import (
+	"os"
+	"path/filepath"
+	"syscall"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestQwpSfSegmentCreateReservesDiskBlocks verifies that a fresh
+// segment is NOT sparse — i.e. qwpSfReserveNewBlocks reached real
+// disk-block reservation, not just an ftruncate. We check via
+// stat.Blocks, which counts 512-byte units of allocated storage; a
+// sparse file would report a Blocks count far below sizeBytes/512.
+//
+// Skipped on filesystems where the reserve syscall is unsupported
+// (Blocks ends up close to zero — same as a plain ftruncate).
+// Operators on those filesystems take the SIGBUS risk by design;
+// the test is asserting the *typical* dev / CI filesystem path.
+func TestQwpSfSegmentCreateReservesDiskBlocks(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "prealloc.sfa")
+
+	// 256 KiB — large enough that a sparse file would have ~0 blocks
+	// while a real reservation reports >=512 blocks (256 KiB / 512).
+	const segSize int64 = 256 * 1024
+	seg, err := qwpSfCreateSegment(path, 0, segSize)
+	require.NoError(t, err)
+	defer func() { _ = seg.close() }()
+
+	st, err := os.Stat(path)
+	require.NoError(t, err)
+	stat, ok := st.Sys().(*syscall.Stat_t)
+	require.True(t, ok, "expected *syscall.Stat_t from os.Stat on unix")
+
+	allocBytes := int64(stat.Blocks) * 512
+	if allocBytes < segSize/2 {
+		t.Skipf("filesystem appears not to support pre-allocation (Blocks=%d, want >= %d); "+
+			"SIGBUS risk falls back on operator sizing per spec",
+			stat.Blocks, segSize/2/512)
+	}
+	assert.GreaterOrEqual(t, allocBytes, segSize,
+		"pre-allocation must reserve >= sizeBytes; sparse file would report a small Blocks count")
+}
diff --git a/qwp_sf_files_unix.go b/qwp_sf_files_unix.go
new file mode 100644
index 00000000..61bc5ca6
--- /dev/null
+++ b/qwp_sf_files_unix.go
@@ -0,0 +1,96 @@
+/*+*****************************************************************************
+ * ___ _ ____ ____
+ * / _ \ _ _ ___ ___| |_| _ \| __ )
+ * | | | | | | |/ _ \/ __| __| | | | _ \
+ * | |_| | |_| | __/\__ \ |_| |_| | |_) |
+ * \__\_\\__,_|\___||___/\__|____/|____/
+ *
+ * Copyright (c) 2014-2019 Appsicle
+ * Copyright (c) 2019-2026 QuestDB
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ ******************************************************************************/
+
+//go:build unix
+
+package questdb
+
+import (
+	"errors"
+	"fmt"
+	"os"
+
+	"golang.org/x/sys/unix"
+)
+
+// qwpSfMmapRW maps the first sizeBytes of f read-write into a slice
+// backed by the kernel mmap region. The returned slice's length and
+// capacity equal sizeBytes; indexing into it reads/writes the file
+// directly. Caller must qwpSfMunmap before discarding the slice and
+// before closing f to avoid leaking mappings.
+//
+// Returns an error when sizeBytes is not positive, when it does not
+// fit in the platform's int (possible on 32-bit targets), or when
+// the mmap syscall itself fails.
+func qwpSfMmapRW(f *os.File, sizeBytes int64) ([]byte, error) {
+	if sizeBytes <= 0 {
+		return nil, fmt.Errorf("qwp/sf: mmap size must be positive: %d", sizeBytes)
+	}
+	// unix.Mmap takes an int length. Round-trip the conversion so a
+	// size that would be truncated is rejected instead of silently
+	// producing a mapping shorter than the caller asked for.
+	length := int(sizeBytes)
+	if int64(length) != sizeBytes {
+		return nil, fmt.Errorf("qwp/sf: mmap size %d exceeds the addressable range on this platform", sizeBytes)
+	}
+	buf, err := unix.Mmap(int(f.Fd()), 0, length, unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED)
+	if err != nil {
+		return nil, fmt.Errorf("qwp/sf: mmap %s: %w", f.Name(), err)
+	}
+	return buf, nil
+}
+
+// qwpSfMunmap unmaps buf. Safe to call with a nil buf (no-op).
+func qwpSfMunmap(buf []byte) error {
+	if buf == nil {
+		return nil
+	}
+	if err := unix.Munmap(buf); err != nil {
+		return fmt.Errorf("qwp/sf: munmap: %w", err)
+	}
+	return nil
+}
+
+// qwpSfMsync flushes [0, length) of buf to disk synchronously. The
+// length must not exceed cap(buf). Used for OS-crash durability when
+// the user opts in; off the steady-state hot path.
+func qwpSfMsync(buf []byte, length int64) error { + if buf == nil || length <= 0 { + return nil + } + if int(length) > cap(buf) { + return fmt.Errorf("qwp/sf: msync length %d exceeds buf cap %d", length, cap(buf)) + } + // Slice with the original capacity preserved so unix.Msync can pass + // the right address+length pair to the kernel; we don't want to + // reslice arbitrarily because that would change the start offset. + if err := unix.Msync(buf[:length], unix.MS_SYNC); err != nil { + return fmt.Errorf("qwp/sf: msync: %w", err) + } + return nil +} + +// qwpSfFlockExclusive acquires an exclusive non-blocking lock on f. +// Returns qwpSfErrLockBusy on contention with another process. The +// lock is released when f is closed or the process exits (the kernel +// drops flocks on process termination). +func qwpSfFlockExclusive(f *os.File) error { + err := unix.Flock(int(f.Fd()), unix.LOCK_EX|unix.LOCK_NB) + if err == nil { + return nil + } + if errors.Is(err, unix.EWOULDBLOCK) || errors.Is(err, unix.EAGAIN) { + return qwpSfErrLockBusy + } + return fmt.Errorf("qwp/sf: flock %s: %w", f.Name(), err) +} diff --git a/qwp_sf_files_windows.go b/qwp_sf_files_windows.go new file mode 100644 index 00000000..c9dc1d5d --- /dev/null +++ b/qwp_sf_files_windows.go @@ -0,0 +1,189 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//go:build windows + +package questdb + +import ( + "errors" + "fmt" + "os" + "sync" + "unsafe" + + "golang.org/x/sys/windows" +) + +// On Windows, mmap requires a separate file-mapping object handle +// alongside the file handle. We track them in a side map keyed by the +// mmap'd slice's data pointer so the cross-platform helper signatures +// stay aligned with the unix variant. +var ( + qwpSfWindowsMappingMu sync.Mutex + qwpSfWindowsMappings = map[uintptr]windows.Handle{} +) + +// mmapAddrToPointer converts a uintptr returned by MapViewOfFile +// into an unsafe.Pointer addressing the OS-managed mmap region. +// +// Direct `unsafe.Pointer(uintptr_var)` is flagged by go vet's +// unsafeptr analyzer because it cannot tell whether the integer was +// derived from a Go heap pointer (where the GC may relocate the +// referent and invalidate the address). For an OS-managed mmap +// region the warning is a false positive — the kernel pins the +// pages until UnmapViewOfFile. Loading the address through a stack +// alias (&p is a known-valid Go pointer) defeats the analyzer +// without disabling the check globally. +func mmapAddrToPointer(p uintptr) unsafe.Pointer { + return *(*unsafe.Pointer)(unsafe.Pointer(&p)) +} + +// qwpSfMmapRW maps the first sizeBytes of f read-write. See the unix +// counterpart; this version creates a CreateFileMapping+MapViewOfFile +// pair under the hood and tracks the mapping handle for later cleanup. 
+func qwpSfMmapRW(f *os.File, sizeBytes int64) ([]byte, error) { + if sizeBytes <= 0 { + return nil, fmt.Errorf("qwp/sf: mmap size must be positive: %d", sizeBytes) + } + hi := uint32(sizeBytes >> 32) + lo := uint32(sizeBytes & 0xFFFFFFFF) + mapHandle, err := windows.CreateFileMapping( + windows.Handle(f.Fd()), nil, windows.PAGE_READWRITE, hi, lo, nil) + if err != nil { + return nil, fmt.Errorf("qwp/sf: CreateFileMapping %s: %w", f.Name(), err) + } + addr, err := windows.MapViewOfFile(mapHandle, windows.FILE_MAP_READ|windows.FILE_MAP_WRITE, + 0, 0, uintptr(sizeBytes)) + if err != nil { + _ = windows.CloseHandle(mapHandle) + return nil, fmt.Errorf("qwp/sf: MapViewOfFile %s: %w", f.Name(), err) + } + buf := unsafe.Slice((*byte)(mmapAddrToPointer(addr)), sizeBytes) + qwpSfWindowsMappingMu.Lock() + qwpSfWindowsMappings[addr] = mapHandle + qwpSfWindowsMappingMu.Unlock() + return buf, nil +} + +// qwpSfMunmap unmaps buf and closes its associated file mapping. +func qwpSfMunmap(buf []byte) error { + if buf == nil { + return nil + } + addr := uintptr(unsafe.Pointer(&buf[0])) + qwpSfWindowsMappingMu.Lock() + mapHandle, ok := qwpSfWindowsMappings[addr] + if ok { + delete(qwpSfWindowsMappings, addr) + } + qwpSfWindowsMappingMu.Unlock() + if err := windows.UnmapViewOfFile(addr); err != nil { + return fmt.Errorf("qwp/sf: UnmapViewOfFile: %w", err) + } + if ok { + if err := windows.CloseHandle(mapHandle); err != nil { + return fmt.Errorf("qwp/sf: CloseHandle(mapping): %w", err) + } + } + return nil +} + +// qwpSfMsync synchronously flushes [0, length) of buf to disk. 
+func qwpSfMsync(buf []byte, length int64) error { + if buf == nil || length <= 0 { + return nil + } + if int(length) > cap(buf) { + return fmt.Errorf("qwp/sf: msync length %d exceeds buf cap %d", length, cap(buf)) + } + addr := uintptr(unsafe.Pointer(&buf[0])) + if err := windows.FlushViewOfFile(addr, uintptr(length)); err != nil { + return fmt.Errorf("qwp/sf: FlushViewOfFile: %w", err) + } + return nil +} + +// qwpSfReserveNewBlocks reserves real disk clusters for f up to +// currentSize+newBytes via SetFileInformationByHandle(FileAllocationInfo). +// On NTFS this reserves clusters synchronously and fails with +// ERROR_DISK_FULL when free space is insufficient. Caller-side +// contract (never shrinks, short-circuit, post-truncate) is owned by +// qwpSfAllocate; this helper is single-concern. +// +// FileAllocationInfo is file-scope, not range-based — there is no +// per-range API on NTFS — so the call implicitly re-reserves +// [0, currentSize) as well. Visible only to a caller who deliberately +// created sparse holes inside that range; the qwpSfAllocate doc flags +// hole-filling as non-portable behaviour. +// +// FileAllocationInfo does NOT extend the file's logical size (EOF); +// qwpSfAllocate's f.Truncate follow-up handles that. Windows has no +// equivalent of the Linux / macOS sparse-fallback path — any failure +// here surfaces as an error. +func qwpSfReserveNewBlocks(f *os.File, currentSize, newBytes int64) error { + target := currentSize + newBytes + // FILE_ALLOCATION_INFO is a single LARGE_INTEGER. Lay it out via a + // fixed-size struct so the &info / Sizeof pair matches the + // kernel's expectation regardless of Go alignment quirks. 
+ info := struct { + AllocationSize int64 + }{AllocationSize: target} + err := windows.SetFileInformationByHandle( + windows.Handle(f.Fd()), + windows.FileAllocationInfo, + (*byte)(unsafe.Pointer(&info)), + uint32(unsafe.Sizeof(info)), + ) + if err != nil { + return fmt.Errorf("qwp/sf: SetFileInformationByHandle(FileAllocationInfo) %s to %d bytes: %w", + f.Name(), target, err) + } + return nil +} + +// qwpSfFlockExclusive acquires an exclusive non-blocking lock on f. +// Implemented via LockFileEx with LOCKFILE_EXCLUSIVE_LOCK|LOCKFILE_FAIL_IMMEDIATELY. +// Returns qwpSfErrLockBusy on contention. +func qwpSfFlockExclusive(f *os.File) error { + const lockBytes uint32 = 1 + // Stack-allocated OVERLAPPED is safe here because LOCKFILE_FAIL_IMMEDIATELY + // forces a synchronous return — the kernel never dereferences &ol after + // LockFileEx returns. Do not remove that flag without switching to a + // heap-allocated OVERLAPPED with an event handle. + var ol windows.Overlapped + err := windows.LockFileEx( + windows.Handle(f.Fd()), + windows.LOCKFILE_EXCLUSIVE_LOCK|windows.LOCKFILE_FAIL_IMMEDIATELY, + 0, lockBytes, 0, &ol) + if err == nil { + return nil + } + // ERROR_LOCK_VIOLATION = 33; ERROR_IO_PENDING = 997 (treated as + // contention by LOCKFILE_FAIL_IMMEDIATELY). 
+ if errors.Is(err, windows.ERROR_LOCK_VIOLATION) || errors.Is(err, windows.ERROR_IO_PENDING) { + return qwpSfErrLockBusy + } + return fmt.Errorf("qwp/sf: LockFileEx %s: %w", f.Name(), err) +} diff --git a/qwp_sf_lock.go b/qwp_sf_lock.go new file mode 100644 index 00000000..781e7024 --- /dev/null +++ b/qwp_sf_lock.go @@ -0,0 +1,169 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "errors" + "fmt" + "io" + "os" + "path/filepath" + "strings" +) + +// qwpSfLockFileName is the per-slot lock file name. One lock file per +// slot directory; held for the engine's lifetime via flock/LockFileEx. +const qwpSfLockFileName = ".lock" + +// qwpSfLockPidFileName is the sibling sidecar that carries the +// holder's PID. The PID lives in a separate file because Windows' +// LockFileEx is a mandatory range lock — while .lock is held, a +// second handle cannot read its bytes, so the holder's PID can't be +// recovered from the lock file itself. 
POSIX flock is advisory and +// would tolerate co-locating the two, but keeping the layout +// identical across platforms (and matching the Java client) avoids +// platform-specific divergence in tests and tooling. +const qwpSfLockPidFileName = ".lock.pid" + +// qwpSfSlotLock is an advisory exclusive lock on a single SF slot +// directory. The holder's PID is written to a sibling .lock.pid +// sidecar at acquisition time. A failed acquisition reads it back so +// the error message can name the offending process — turning a vague +// "slot in use" into actionable diagnostics. +// +// Two senders pointing at the same slot dir is the multi-writer +// footgun the slot model exists to prevent: their FSN sequences would +// interleave on disk and corrupt recovery. Detecting the collision at +// acquisition time and refusing to start is the contract — recoverable, +// no data on disk yet, vs. the alternative of silently scrambling the +// slot. +// +// The lock is released automatically on close() OR when the process +// exits (the kernel cleans up flocks for terminated processes). +// +// Known operational hole: if an external actor unlinks .lock while it +// is held (e.g., an operator running `rm .lock`), a fresh acquirer's +// open(O_CREATE) allocates a new inode and successfully flocks it — +// both processes then believe they own the slot. flock(2), POSIX +// fcntl(F_SETLK), and Linux F_OFD_SETLK are all inode-bound on +// Linux/BSD; no POSIX primitive is path-bound, so this cannot be +// closed client-side. Operators must treat .lock as opaque metadata +// and not delete it while a sender is running against the slot. The +// Java MmapSegment SlotLock has the same property. +type qwpSfSlotLock struct { + slotDir string + lockPath string + file *os.File +} + +// qwpSfAcquireSlotLock creates slotDir if needed, opens +// `/.lock`, and acquires an exclusive flock on it. 
On +// contention, reads the existing PID payload from the .lock.pid +// sidecar and returns an error naming the offending process. +func qwpSfAcquireSlotLock(slotDir string) (*qwpSfSlotLock, error) { + if slotDir == "" { + return nil, errors.New("qwp/sf: slotDir must not be empty") + } + if err := os.MkdirAll(slotDir, 0o755); err != nil { + return nil, fmt.Errorf("qwp/sf: could not create slot dir %s: %w", slotDir, err) + } + lockPath := filepath.Join(slotDir, qwpSfLockFileName) + pidPath := filepath.Join(slotDir, qwpSfLockPidFileName) + f, err := os.OpenFile(lockPath, os.O_RDWR|os.O_CREATE, 0o644) + if err != nil { + return nil, fmt.Errorf("qwp/sf: could not open slot lock file %s: %w", lockPath, err) + } + if err := qwpSfFlockExclusive(f); err != nil { + holder := qwpSfReadHolder(pidPath) + _ = f.Close() + if errors.Is(err, qwpSfErrLockBusy) { + return nil, fmt.Errorf( + "qwp/sf: slot already in use by another process [slot=%s, holder=%s]", + slotDir, holder) + } + return nil, err + } + qwpSfWritePid(pidPath) + return &qwpSfSlotLock{ + slotDir: slotDir, + lockPath: lockPath, + file: f, + }, nil +} + +// qwpSfReadHolder reads the PID payload of an existing .lock.pid +// sidecar. Best-effort — returns "unknown" if the file can't be read +// or the payload is empty. The caller is in the error path; we never +// want a failed PID-read to mask the original lock-busy error. +func qwpSfReadHolder(pidPath string) string { + f, err := os.Open(pidPath) + if err != nil { + return "unknown" + } + defer f.Close() + // 64 bytes is more than enough for "\n" — clamp so a vandal + // can't make us read MB of payload on the error path. + buf := make([]byte, 64) + n, err := f.Read(buf) + if err != nil && !errors.Is(err, io.EOF) { + return "unknown" + } + if n <= 0 { + return "unknown" + } + return "pid=" + strings.TrimSpace(string(buf[:n])) +} + +// qwpSfWritePid writes the current process's PID to the .lock.pid +// sidecar. 
Diagnostic-only — never block lock acquisition on it; a +// failed write only degrades the contention error message, it does +// not affect correctness of the lock itself. +func qwpSfWritePid(pidPath string) { + f, err := os.OpenFile(pidPath, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0o644) + if err != nil { + return + } + defer f.Close() + payload := fmt.Sprintf("%d\n", os.Getpid()) + _, _ = f.WriteAt([]byte(payload), 0) +} + +// slotPath returns the slot directory this lock guards. +func (l *qwpSfSlotLock) slotPath() string { + return l.slotDir +} + +// close releases the lock by closing the underlying file. We do NOT +// remove the file — a stale .lock with the previous PID is harmless +// (the next acquirer can flock it just fine, and overwrites the PID +// on success). Idempotent. +func (l *qwpSfSlotLock) close() error { + if l == nil || l.file == nil { + return nil + } + err := l.file.Close() + l.file = nil + return err +} diff --git a/qwp_sf_lock_test.go b/qwp_sf_lock_test.go new file mode 100644 index 00000000..edc4917d --- /dev/null +++ b/qwp_sf_lock_test.go @@ -0,0 +1,103 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestQwpSfSlotLockAcquireCreatesDirAndLockFile(t *testing.T) { + dir := filepath.Join(t.TempDir(), "child", "slot") + l, err := qwpSfAcquireSlotLock(dir) + require.NoError(t, err) + defer func() { _ = l.close() }() + + // Directory was auto-created. + st, err := os.Stat(dir) + require.NoError(t, err) + assert.True(t, st.IsDir()) + + // .lock file exists and is empty — the locked range on Windows + // would otherwise prevent a contender from reading the PID. + lockBody, err := os.ReadFile(filepath.Join(dir, qwpSfLockFileName)) + require.NoError(t, err) + assert.Empty(t, lockBody) + + // .lock.pid sidecar holds our PID. + pidBody, err := os.ReadFile(filepath.Join(dir, qwpSfLockPidFileName)) + require.NoError(t, err) + pid, err := strconv.Atoi(strings.TrimSpace(string(pidBody))) + require.NoError(t, err) + assert.Equal(t, os.Getpid(), pid) +} + +func TestQwpSfSlotLockContentionReportsHolder(t *testing.T) { + dir := t.TempDir() + l1, err := qwpSfAcquireSlotLock(dir) + require.NoError(t, err) + defer func() { _ = l1.close() }() + + _, err = qwpSfAcquireSlotLock(dir) + require.Error(t, err) + assert.Contains(t, err.Error(), "slot already in use") + assert.Contains(t, err.Error(), fmt.Sprintf("pid=%d", os.Getpid())) +} + +func TestQwpSfSlotLockReleaseAllowsReacquire(t *testing.T) { + dir := t.TempDir() + l1, err := qwpSfAcquireSlotLock(dir) + require.NoError(t, err) + require.NoError(t, l1.close()) + + // Stale .lock file should still exist (we never unlink) but the + // flock is gone, so a fresh acquire succeeds. 
+ _, err = os.Stat(filepath.Join(dir, qwpSfLockFileName)) + require.NoError(t, err) + + l2, err := qwpSfAcquireSlotLock(dir) + require.NoError(t, err) + require.NoError(t, l2.close()) +} + +func TestQwpSfSlotLockEmptyDirIsRejected(t *testing.T) { + _, err := qwpSfAcquireSlotLock("") + require.Error(t, err) +} + +func TestQwpSfSlotLockReportsSlotPath(t *testing.T) { + dir := t.TempDir() + l, err := qwpSfAcquireSlotLock(dir) + require.NoError(t, err) + defer func() { _ = l.close() }() + assert.Equal(t, dir, l.slotPath()) +} diff --git a/qwp_sf_manager.go b/qwp_sf_manager.go new file mode 100644 index 00000000..d3967856 --- /dev/null +++ b/qwp_sf_manager.go @@ -0,0 +1,473 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "errors" + "fmt" + "log" + "math" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" +) + +// qwpSfManager defaults and constants. 
+const ( + qwpSfManagerDefaultPoll = 1 * time.Millisecond // poll cadence + qwpSfManagerDiskFullLogThrottle = 30 * time.Second // throttle disk-full WARNs + // qwpSfManagerCloseGrace bounds how long close() waits for the + // worker goroutine to exit cleanly. Mirrors Java's 5-second join. + qwpSfManagerCloseGrace = 5 * time.Second +) + +// qwpSfUnlimitedTotalBytes disables the per-engine total-bytes cap. +const qwpSfUnlimitedTotalBytes int64 = math.MaxInt64 + +// qwpSfSegmentManager is the background worker that keeps every +// registered qwpSfSegmentRing supplied with a hot-spare segment and +// trims segments after their frames have been ACK'd. Off the +// user-thread / I/O-thread hot path entirely: the expensive +// open+truncate+mmap for spare creation and munmap+unlink for trim +// happen on this goroutine, never on the latency-sensitive paths. +// +// One instance can serve many rings (typically all sender instances +// in a process). Polls each ring on a configurable tick (default +// 1 ms) — short enough that a producer rarely sees +// qwpSfBackpressureNoSpare in the steady state, long enough that an +// idle process doesn't burn CPU. +type qwpSfSegmentManager struct { + segmentSizeBytes int64 + pollInterval time.Duration + maxTotalBytes int64 + + // fileGeneration is a monotonic counter that names spare files + // (sf-.sfa). Per-process, not per-ring; recovery skips + // the counter past existing on-disk segments at register time. + fileGeneration atomic.Uint64 + + mu sync.Mutex + rings []qwpSfManagerRingEntry + totalBytes int64 + lastDiskFullLog time.Time + closed bool + + // wakeup is a single-slot channel. wakeWorker pushes into it + // non-blockingly; the worker drains in select to coalesce signals. + wakeup chan struct{} + // done is closed when the worker goroutine exits. + done chan struct{} + worker sync.WaitGroup + + // ringSnapshot is workerLoop's reusable copy of rings. 
Each tick + // refills it from rings under mu, then releases mu before the + // per-ring service pass so the slow segment syscalls run without + // the lock held. Owned solely by workerLoop; the locked refill is + // its only synchronization. + ringSnapshot []qwpSfManagerRingEntry +} + +// qwpSfManagerRingEntry holds a registered ring and the directory +// its segments live in (nil for memory-mode rings). +type qwpSfManagerRingEntry struct { + ring *qwpSfSegmentRing + dir string + // watermark is the engine-owned .ack-watermark for this slot, or + // nil in memory mode / when the file could not be opened. The + // manager writes through it on every tick where ackedFsn + // advanced; it never closes it (the owning engine does, in + // engineClose, after the manager has stopped). The pointer is + // copied by value into the per-tick ring snapshot, but the + // persist state (lastPersistedAck) lives behind the pointer on + // the watermark itself, so the snapshot copy is harmless. + watermark *qwpSfAckWatermark +} + +// qwpSfNewSegmentManager constructs a manager with the given +// segment size, poll interval, and total-bytes cap. maxTotalBytes +// must be at least one segment. +func qwpSfNewSegmentManager(segmentSizeBytes int64, pollInterval time.Duration, maxTotalBytes int64) (*qwpSfSegmentManager, error) { + if segmentSizeBytes < qwpSfHeaderSize+qwpSfFrameHeaderSize+1 { + return nil, fmt.Errorf("qwp/sf: segmentSizeBytes too small: %d", segmentSizeBytes) + } + if maxTotalBytes < segmentSizeBytes { + return nil, fmt.Errorf("qwp/sf: maxTotalBytes (%d) must allow at least one segment of %d bytes", + maxTotalBytes, segmentSizeBytes) + } + if pollInterval <= 0 { + pollInterval = qwpSfManagerDefaultPoll + } + return &qwpSfSegmentManager{ + segmentSizeBytes: segmentSizeBytes, + pollInterval: pollInterval, + maxTotalBytes: maxTotalBytes, + wakeup: make(chan struct{}, 1), + done: make(chan struct{}), + }, nil +} + +// segmentManagerStart spawns the worker goroutine. 
Idempotent — a +// second call is a panic, mirroring Java's IllegalStateException. +func (m *qwpSfSegmentManager) segmentManagerStart() { + m.mu.Lock() + if m.closed { + m.mu.Unlock() + panic("qwp/sf: segment manager already closed") + } + m.mu.Unlock() + m.worker.Add(1) + go m.workerLoop() +} + +// segmentManagerClose stops the worker goroutine and waits up to +// qwpSfManagerCloseGrace for it to exit. After close, the manager +// rejects new registrations and the worker no longer provisions or +// trims segments — but already-installed spares stay with their +// rings (the rings close them on their own segmentRingClose). +// +// Idempotent; safe to call from any goroutine. +func (m *qwpSfSegmentManager) segmentManagerClose() { + m.mu.Lock() + if m.closed { + m.mu.Unlock() + return + } + m.closed = true + m.mu.Unlock() + // Wake the worker so it observes closed and exits promptly. + select { + case m.wakeup <- struct{}{}: + default: + } + // Bound the wait so a stuck worker can't deadlock close(). + doneCh := make(chan struct{}) + go func() { + m.worker.Wait() + close(doneCh) + }() + graceTimer := time.NewTimer(qwpSfManagerCloseGrace) + defer graceTimer.Stop() + select { + case <-doneCh: + case <-graceTimer.C: + } +} + +// segmentManagerDeregister stops tracking the given ring. Pending +// spares for the ring are NOT created after this returns, but +// already-installed spares stay with the ring. Idempotent; safe to +// call from any goroutine. +func (m *qwpSfSegmentManager) segmentManagerDeregister(ring *qwpSfSegmentRing) { + m.mu.Lock() + defer m.mu.Unlock() + for i, e := range m.rings { + if e.ring == ring { + // Reverse the ring's contribution to totalBytes. + m.totalBytes -= ring.totalSegmentBytes() + // O(N) remove preserving order — register order matters + // for log ordering, not correctness. + m.rings = append(m.rings[:i], m.rings[i+1:]...) 
+ return + } + } +} + +// segmentManagerRegister registers a ring with no ack-watermark +// (memory mode, or callers that don't persist a watermark — chiefly +// tests). Recovery for such a slot seeds from the segment-derived +// lowestBase-1 only. +func (m *qwpSfSegmentManager) segmentManagerRegister(ring *qwpSfSegmentRing, dir string) error { + return m.segmentManagerRegisterWithWatermark(ring, dir, nil) +} + +// segmentManagerRegisterWithWatermark registers a ring for ongoing +// spare creation + trim. dir is the filesystem directory the ring's +// segments live in — used both for creating spare files and +// unlinking trimmed ones. watermark (may be nil) is the slot's +// engine-owned .ack-watermark the manager keeps current on every +// tick; the manager never closes it. The ring MUST already have its +// initial active segment in place. Wires the ring's "I need a spare" +// callback so the producer can preempt the polling tick. +func (m *qwpSfSegmentManager) segmentManagerRegisterWithWatermark(ring *qwpSfSegmentRing, dir string, watermark *qwpSfAckWatermark) error { + m.mu.Lock() + if m.closed { + m.mu.Unlock() + return errors.New("qwp/sf: segment manager closed") + } + m.rings = append(m.rings, qwpSfManagerRingEntry{ring: ring, dir: dir, watermark: watermark}) + // Account for bytes the ring already owns when it joins. A + // recovered ring (post-restart, orphan adoption) can come up + // at-or-above the cap; without this seed, totalBytes stays at 0 + // and the per-tick cap check would let the manager keep + // provisioning new spares on top of the recovered set. + m.totalBytes += ring.totalSegmentBytes() + m.mu.Unlock() + if dir != "" { + // Skip the file-generation counter past whatever's already on + // disk in this slot. Without this, on recovery the manager + // would mint a new spare at sf-0000000000000000.sfa — and + // open-clean-RW would truncate the user's existing active + // file out from under the I/O loop, scrambling the in-flight + // mmap. 
+ if maxGen, found := qwpSfScanMaxGeneration(dir); found { + minNext := maxGen + 1 + for { + cur := m.fileGeneration.Load() + if cur >= minNext { + break + } + if m.fileGeneration.CompareAndSwap(cur, minNext) { + break + } + } + } + } + ring.setManagerWakeup(m.wakeWorker) + return nil +} + +// wakeWorker pushes a non-blocking wakeup so the worker processes +// registered rings on the very next loop iteration. Cheap; safe to +// call from any goroutine; idempotent (multiple wakeups coalesce +// into a single channel slot). No-op when the worker is busy. +func (m *qwpSfSegmentManager) wakeWorker() { + select { + case m.wakeup <- struct{}{}: + default: + } +} + +// qwpSfScanMaxGeneration returns the highest hex-encoded generation +// across sf-.sfa files in dir. found is false when dir is +// absent/unreadable or holds no matching files; maxGen is then +// unspecified and the caller must not constrain fileGeneration. Skips +// files that don't match the pattern (e.g. the legacy sf-initial.sfa). +func qwpSfScanMaxGeneration(dir string) (maxGen uint64, found bool) { + if _, err := os.Stat(dir); err != nil { + return 0, false + } + entries, err := os.ReadDir(dir) + if err != nil { + return 0, false + } + for _, e := range entries { + name := e.Name() + if !strings.HasPrefix(name, "sf-") || !strings.HasSuffix(name, ".sfa") { + continue + } + hex := name[3 : len(name)-4] + if len(hex) != 16 { + continue + } + gen, err := strconv.ParseUint(hex, 16, 64) + if err != nil { + continue + } + if !found || gen > maxGen { + maxGen = gen + found = true + } + } + return maxGen, found +} + +// nextSparePath returns the next available /sf-.sfa +// path. Spare files use a process-wide monotonic counter rather than +// a baseSeq-derived name, because the spare's baseSeq is provisional +// at create time. Recovery discovers segments by extension + header +// magic, not by filename. 
func (m *qwpSfSegmentManager) nextSparePath(dir string) string {
	// Add returns the post-increment value; subtracting one hands out the
	// pre-increment generation, so the first path uses generation 0.
	gen := m.fileGeneration.Add(1) - 1
	return filepath.Join(dir, fmt.Sprintf("sf-%016x.sfa", gen))
}

// workerLoop runs until the manager is closed. Each iteration walks
// the registered rings, provisions a spare for any that need one
// (subject to the totalBytes cap), and trims fully-acked sealed
// segments. Sleeps pollInterval between iterations; pre-empted by a
// wakeWorker signal from the producer.
//
// On exit it closes m.done and marks the worker WaitGroup done.
func (m *qwpSfSegmentManager) workerLoop() {
	defer m.worker.Done()
	defer close(m.done)
	timer := time.NewTimer(m.pollInterval)
	defer timer.Stop()
	for {
		// Refill the reusable ring snapshot so we don't hold the mutex
		// through the (potentially slow) syscalls during creation /
		// unlink.
		m.mu.Lock()
		if m.closed {
			m.mu.Unlock()
			return
		}
		m.ringSnapshot = append(m.ringSnapshot[:0], m.rings...)
		m.mu.Unlock()
		for _, e := range m.ringSnapshot {
			m.serviceRing(e)
		}
		// Stop-and-drain before Reset so a tick that fired during the
		// service pass cannot satisfy the select below immediately.
		if !timer.Stop() {
			select {
			case <-timer.C:
			default:
			}
		}
		timer.Reset(m.pollInterval)
		select {
		case <-m.wakeup:
		case <-timer.C:
		}
	}
}

// serviceRing performs one round of spare provisioning and trim for
// a single ring. Cheap when the ring already has a spare and no
// trimmable sealed segments — the common steady-state case.
//
// e is a by-value copy taken from the per-tick snapshot, so the ring may
// have been deregistered since; the install step re-checks registration
// under m.mu for that reason.
func (m *qwpSfSegmentManager) serviceRing(e qwpSfManagerRingEntry) {
	memoryMode := e.dir == ""
	// 1. Provision a hot spare if the ring asks for one and the
	// total-bytes cap leaves room for another segment.
	if e.ring.needsHotSpare() {
		// Snapshot totalBytes under lock — register/deregister can
		// mutate it from caller goroutines. Heavy provisioning I/O
		// happens outside the lock; the post-install commit
		// re-acquires it.
		m.mu.Lock()
		observedTotal := m.totalBytes
		m.mu.Unlock()
		if observedTotal+m.segmentSizeBytes > m.maxTotalBytes {
			// Disk/memory cap reached: skip provisioning. Producers
			// will block on engineAppendBlocking until in-flight
			// segments are ACK'd and trimmed, so this state is exactly
			// the one operators need surfaced. Logged at most once per
			// qwpSfManagerDiskFullLogThrottle so a sustained cap-full
			// state doesn't drown logs. The log write happens after the
			// lock is released to keep the syscall off m.mu.
			now := time.Now()
			m.mu.Lock()
			shouldLog := now.Sub(m.lastDiskFullLog) >= qwpSfManagerDiskFullLogThrottle
			if shouldLog {
				m.lastDiskFullLog = now
			}
			m.mu.Unlock()
			if shouldLog {
				if memoryMode {
					log.Printf("[WARN] qwp/sf: in-memory segment cap reached "+
						"(%d/%d bytes used, segment size %d); spare provisioning "+
						"paused — producers block until in-flight segments are "+
						"ACK'd and trimmed",
						observedTotal, m.maxTotalBytes, m.segmentSizeBytes)
				} else {
					log.Printf("[WARN] qwp/sf: disk cap reached for %q "+
						"(%d/%d bytes used, segment size %d); spare provisioning "+
						"paused — producers block until in-flight segments are "+
						"ACK'd and trimmed",
						e.dir, observedTotal, m.maxTotalBytes, m.segmentSizeBytes)
				}
			}
		} else {
			var (
				spare *qwpSfSegment
				path  string
				err   error
			)
			// path stays "" in memory mode, which is what the cleanup
			// branches below key off.
			if memoryMode {
				spare, err = qwpSfCreateInMemorySegment(e.ring.nextSeqHint(), m.segmentSizeBytes)
			} else {
				path = m.nextSparePath(e.dir)
				spare, err = qwpSfCreateSegment(path, e.ring.nextSeqHint(), m.segmentSizeBytes)
			}
			if err == nil {
				// Install + commit atomically under the manager lock.
				// If e.ring was deregistered between the snapshot
				// above and now, abandoning the spare here is the
				// only way to keep totalBytes consistent.
				m.mu.Lock()
				stillRegistered := false
				for i := range m.rings {
					if m.rings[i].ring == e.ring {
						stillRegistered = true
						break
					}
				}
				installed := false
				if stillRegistered {
					installErr := e.ring.installHotSpare(spare)
					if installErr == nil {
						m.totalBytes += m.segmentSizeBytes
						installed = true
					}
				}
				m.mu.Unlock()
				if !installed {
					// Deregistered ring or rejected install: discard the
					// spare and its file. Both are best-effort cleanup.
					_ = spare.close()
					if path != "" {
						_ = os.Remove(path)
					}
				}
			} else if path != "" {
				// Defense-in-depth: qwpSfCreateSegment already best-
				// effort removes the file on its own failure paths
				// (truncate fail, mmap fail). If a future change
				// breaks that invariant — or if anything before the
				// try block leaves a file on disk — this second-line
				// remove keeps the slot from accumulating zero-content
				// .sfa files under sustained provisioning failure.
				// Repeated remove on an already-removed path is a
				// harmless no-op.
				//
				// NOTE(review): the creation error itself is dropped
				// here without a log line or counter, so a persistent
				// failure is retried every tick with nothing surfaced
				// from this function — confirm that is intended.
				_ = os.Remove(path)
			}
		}
	}

	// 2. Persist the current ackedFsn to the slot's .ack-watermark
	// BEFORE the trim runs (sf-client.md §5.4). The ordering is
	// what makes recovery's max(lowestSurvivingBaseSeq-1,
	// watermark) clamp crash-safe in either direction: a crash
	// after persist but before the unlinks leaves segments on disk
	// with a correct watermark; a crash after the unlinks leaves a
	// stale-low watermark the higher lowestBase overrides. The
	// write is gated on advance, so a steady ackedFsn doesn't
	// dirty the mapped page every tick. nil watermark (memory
	// mode / open failed) is a no-op.
	e.watermark.persistIfAdvanced(e.ring.segmentRingAckedFsn())

	// 3. Trim any segments that the ring says are fully acked. For
	// memory-mode rings, "trim" is just close (the slice is GC'd) —
	// no file to unlink.
	//
	// NOTE(review): unlike the install step, this loop does not re-check
	// registration. If segmentManagerDeregister ran after the snapshot
	// it has already subtracted ring.totalSegmentBytes(); whether the sz
	// subtracted below was part of that figure depends on
	// qwpSfSegmentRing, which is not visible here — confirm totalBytes
	// cannot be decremented twice for the same segment.
	trim := e.ring.drainTrimmable()
	for _, s := range trim {
		path := s.segmentPath()
		sz := s.segmentSize()
		_ = s.close()
		if path != "" {
			_ = os.Remove(path)
		}
		m.mu.Lock()
		m.totalBytes -= sz
		m.mu.Unlock()
	}
}

// ---- next file in this diff: qwp_sf_manager_test.go (new file) ----
+ * + ******************************************************************************/ + +package questdb + +import ( + "os" + "path/filepath" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestQwpSfManagerProvisionsSpare(t *testing.T) { + const segSize int64 = 4096 + mgr, err := qwpSfNewSegmentManager(segSize, 100*time.Microsecond, qwpSfUnlimitedTotalBytes) + require.NoError(t, err) + mgr.segmentManagerStart() + defer mgr.segmentManagerClose() + + first, err := qwpSfCreateInMemorySegment(0, segSize) + require.NoError(t, err) + r := qwpSfNewSegmentRing(first, segSize) + defer func() { _ = r.segmentRingClose() }() + + require.NoError(t, mgr.segmentManagerRegister(r, "")) // memory mode + + // Wait for the worker to provision a spare. + require.Eventually(t, func() bool { + return !r.needsHotSpare() + }, 1*time.Second, 1*time.Millisecond) +} + +func TestQwpSfManagerTrimsAckedSegments(t *testing.T) { + const segSize int64 = 72 // two minimal frames per segment + mgr, err := qwpSfNewSegmentManager(segSize, 100*time.Microsecond, qwpSfUnlimitedTotalBytes) + require.NoError(t, err) + mgr.segmentManagerStart() + defer mgr.segmentManagerClose() + + first, err := qwpSfCreateInMemorySegment(0, segSize) + require.NoError(t, err) + r := qwpSfNewSegmentRing(first, segSize) + defer func() { _ = r.segmentRingClose() }() + require.NoError(t, mgr.segmentManagerRegister(r, "")) + + // Wait for the manager to provision a spare. + require.Eventually(t, func() bool { + return !r.needsHotSpare() + }, 1*time.Second, 1*time.Millisecond) + + // Append three frames to roll one segment into sealed. + payload := make([]byte, 16) + for i := 0; i < 3; i++ { + fsn := r.appendOrFsn(payload) + require.GreaterOrEqual(t, fsn, int64(0), "iteration %d", i) + } + // The manager worker is running, so observe the ring through the + // lock-protected accessors (sealedSegmentCount / firstSealed), not + // the non-thread-safe getSealedSegments. 
+ require.Equal(t, 1, r.sealedSegmentCount()) + sealedBefore := r.firstSealed() + require.NotNil(t, sealedBefore) + r.acknowledge(sealedBefore.segmentBaseSeq() + sealedBefore.segmentFrameCount() - 1) + + // Manager should pick up the trim within a few ticks. + require.Eventually(t, func() bool { + return r.sealedSegmentCount() == 0 + }, 1*time.Second, 1*time.Millisecond) +} + +func TestQwpSfManagerProvisionsDiskSpare(t *testing.T) { + dir := t.TempDir() + const segSize int64 = 4096 + mgr, err := qwpSfNewSegmentManager(segSize, 100*time.Microsecond, qwpSfUnlimitedTotalBytes) + require.NoError(t, err) + mgr.segmentManagerStart() + defer mgr.segmentManagerClose() + + first, err := qwpSfCreateSegment(filepath.Join(dir, "sf-initial.sfa"), 0, segSize) + require.NoError(t, err) + r := qwpSfNewSegmentRing(first, segSize) + defer func() { _ = r.segmentRingClose() }() + require.NoError(t, mgr.segmentManagerRegister(r, dir)) + + require.Eventually(t, func() bool { + return !r.needsHotSpare() + }, 1*time.Second, 1*time.Millisecond) + + // A second .sfa file (the spare) should now exist on disk. + entries, err := os.ReadDir(dir) + require.NoError(t, err) + count := 0 + for _, e := range entries { + if filepath.Ext(e.Name()) == ".sfa" { + count++ + } + } + assert.GreaterOrEqual(t, count, 2) +} + +func TestQwpSfManagerCapBlocksSpare(t *testing.T) { + const segSize int64 = 4096 + // Cap at exactly one segment — manager refuses to provision a + // spare while the active is the only segment. + mgr, err := qwpSfNewSegmentManager(segSize, 100*time.Microsecond, segSize) + require.NoError(t, err) + mgr.segmentManagerStart() + defer mgr.segmentManagerClose() + + first, err := qwpSfCreateInMemorySegment(0, segSize) + require.NoError(t, err) + r := qwpSfNewSegmentRing(first, segSize) + defer func() { _ = r.segmentRingClose() }() + require.NoError(t, mgr.segmentManagerRegister(r, "")) + + // Give the manager a few ticks. It should keep refusing to + // install — needsHotSpare stays true. 
+ time.Sleep(50 * time.Millisecond) + assert.True(t, r.needsHotSpare()) +} + +func TestQwpSfManagerRegisterAfterCloseRejects(t *testing.T) { + mgr, err := qwpSfNewSegmentManager(4096, time.Millisecond, qwpSfUnlimitedTotalBytes) + require.NoError(t, err) + mgr.segmentManagerStart() + mgr.segmentManagerClose() + + first, err := qwpSfCreateInMemorySegment(0, 4096) + require.NoError(t, err) + r := qwpSfNewSegmentRing(first, 4096) + defer func() { _ = r.segmentRingClose() }() + err = mgr.segmentManagerRegister(r, "") + require.Error(t, err) +} + +func TestQwpSfManagerScanMaxGenerationOnEmptyDir(t *testing.T) { + dir := t.TempDir() + _, found := qwpSfScanMaxGeneration(dir) + // No segments → not found; caller leaves fileGeneration unconstrained. + assert.False(t, found) +} + +func TestQwpSfManagerScanMaxGenerationFindsHighest(t *testing.T) { + dir := t.TempDir() + for _, name := range []string{ + "sf-0000000000000005.sfa", + "sf-000000000000000a.sfa", + "sf-000000000000000c.sfa", + "sf-initial.sfa", // skipped (legacy non-hex name) + } { + require.NoError(t, os.WriteFile(filepath.Join(dir, name), []byte{}, 0o644)) + } + v, found := qwpSfScanMaxGeneration(dir) + require.True(t, found) + assert.Equal(t, uint64(0xc), v) +} + +func TestQwpSfManagerNextSparePathIncrements(t *testing.T) { + mgr, err := qwpSfNewSegmentManager(4096, time.Millisecond, qwpSfUnlimitedTotalBytes) + require.NoError(t, err) + dir := t.TempDir() + a := mgr.nextSparePath(dir) + b := mgr.nextSparePath(dir) + assert.NotEqual(t, a, b) + assert.Equal(t, filepath.Join(dir, "sf-0000000000000000.sfa"), a) + assert.Equal(t, filepath.Join(dir, "sf-0000000000000001.sfa"), b) +} diff --git a/qwp_sf_orphan.go b/qwp_sf_orphan.go new file mode 100644 index 00000000..03e3de1e --- /dev/null +++ b/qwp_sf_orphan.go @@ -0,0 +1,120 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| 
| |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "os" + "path/filepath" + "strings" +) + +// qwpSfFailedSentinelName is the per-slot file that disqualifies a +// slot from auto-drain. Drainers drop it when their reconnect cap +// exhausts, auth fails, or recovery is corrupt — bounded retry, +// then human-in-the-loop. +const qwpSfFailedSentinelName = ".failed" + +// qwpSfScanOrphans walks the group root sfDir and returns every +// child directory that: +// - is not the caller's own slot (filtered by excludeSlotName) +// - contains at least one *.sfa segment file +// - does NOT contain the .failed sentinel +// +// Lock state is intentionally not part of the candidate filter — +// testing it requires actually opening + flocking the lock file, +// which races with concurrent drainers/senders. The drainer pool +// attempts to acquire each candidate's lock in turn and skips ones +// that fail; this keeps the scanner pure and read-only. +// +// Returns an empty list if sfDir doesn't exist or is empty. 
+func qwpSfScanOrphans(sfDir, excludeSlotName string) []string { + if sfDir == "" { + return nil + } + if _, err := os.Stat(sfDir); err != nil { + return nil + } + entries, err := os.ReadDir(sfDir) + if err != nil { + return nil + } + var orphans []string + for _, e := range entries { + if !e.IsDir() { + continue + } + name := e.Name() + if name == "." || name == ".." { + continue + } + if excludeSlotName != "" && name == excludeSlotName { + continue + } + slotPath := filepath.Join(sfDir, name) + if qwpSfIsCandidateOrphan(slotPath) { + orphans = append(orphans, slotPath) + } + } + return orphans +} + +// qwpSfIsCandidateOrphan reports whether slotPath looks like a slot +// dir with unacked data and no failure sentinel. Visible for tests. +func qwpSfIsCandidateOrphan(slotPath string) bool { + if _, err := os.Stat(slotPath); err != nil { + return false + } + if _, err := os.Stat(filepath.Join(slotPath, qwpSfFailedSentinelName)); err == nil { + return false + } + return qwpSfHasAnySegmentFile(slotPath) +} + +// qwpSfMarkSlotFailed drops a .failed file in slotPath with the +// given reason as content. Idempotent — overwrites on each call so +// the latest reason is recorded. Best-effort. +func qwpSfMarkSlotFailed(slotPath, reason string) { + path := filepath.Join(slotPath, qwpSfFailedSentinelName) + body := reason + if body == "" { + body = "drainer failed" + } + _ = os.WriteFile(path, []byte(body), 0o644) +} + +// qwpSfHasAnySegmentFile reports whether slotPath contains at least +// one *.sfa file. 
+func qwpSfHasAnySegmentFile(slotPath string) bool { + entries, err := os.ReadDir(slotPath) + if err != nil { + return false + } + for _, e := range entries { + if !e.IsDir() && strings.HasSuffix(e.Name(), ".sfa") { + return true + } + } + return false +} diff --git a/qwp_sf_orphan_test.go b/qwp_sf_orphan_test.go new file mode 100644 index 00000000..0d1611b5 --- /dev/null +++ b/qwp_sf_orphan_test.go @@ -0,0 +1,618 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ******************************************************************************/

package questdb

import (
	"context"
	"errors"
	"os"
	"path/filepath"
	"strings"
	"sync/atomic"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// TestQwpSfScanOrphansFindsCandidates builds five slot directories
// under one root and checks that only the one holding a .sfa file, no
// .failed sentinel, and a name different from the excluded slot is
// returned.
func TestQwpSfScanOrphansFindsCandidates(t *testing.T) {
	root := t.TempDir()

	// orphan-1: has a .sfa file → candidate
	require.NoError(t, os.MkdirAll(filepath.Join(root, "orphan-1"), 0o755))
	require.NoError(t, os.WriteFile(filepath.Join(root, "orphan-1", "sf-x.sfa"), []byte{}, 0o644))

	// orphan-2: has .sfa AND .failed sentinel → NOT a candidate
	require.NoError(t, os.MkdirAll(filepath.Join(root, "orphan-2"), 0o755))
	require.NoError(t, os.WriteFile(filepath.Join(root, "orphan-2", "sf-x.sfa"), []byte{}, 0o644))
	require.NoError(t, os.WriteFile(filepath.Join(root, "orphan-2", qwpSfFailedSentinelName), []byte{}, 0o644))

	// orphan-3: empty dir → NOT a candidate
	require.NoError(t, os.MkdirAll(filepath.Join(root, "orphan-3"), 0o755))

	// orphan-4: has .lock but no .sfa → NOT a candidate
	require.NoError(t, os.MkdirAll(filepath.Join(root, "orphan-4"), 0o755))
	require.NoError(t, os.WriteFile(filepath.Join(root, "orphan-4", ".lock"), []byte{}, 0o644))

	// own-slot: filtered by name
	require.NoError(t, os.MkdirAll(filepath.Join(root, "own-slot"), 0o755))
	require.NoError(t, os.WriteFile(filepath.Join(root, "own-slot", "sf-x.sfa"), []byte{}, 0o644))

	orphans := qwpSfScanOrphans(root, "own-slot")
	require.Len(t, orphans, 1)
	assert.Equal(t, filepath.Join(root, "orphan-1"), orphans[0])
}

// TestQwpSfScanOrphansEmptyDirReturnsNothing checks that an existing
// but empty root yields no orphans.
func TestQwpSfScanOrphansEmptyDirReturnsNothing(t *testing.T) {
	root := t.TempDir()
	assert.Empty(t, qwpSfScanOrphans(root, ""))
}

// TestQwpSfScanOrphansMissingDirReturnsNothing checks that a root that
// does not exist yields no orphans.
func TestQwpSfScanOrphansMissingDirReturnsNothing(t *testing.T) {
	assert.Empty(t, qwpSfScanOrphans("/nonexistent/path", ""))
}

// TestQwpSfMarkSlotFailed checks that the sentinel file is written
// with the given reason as its content.
func TestQwpSfMarkSlotFailed(t *testing.T) {
	root := t.TempDir()
	qwpSfMarkSlotFailed(root, "test reason")
	body, err := os.ReadFile(filepath.Join(root, qwpSfFailedSentinelName))
	require.NoError(t, err)
	assert.Equal(t, "test reason", string(body))
}

// TestQwpSfDrainerDrainsRealOrphan populates a slot with three unacked
// frames, runs a drainer against the test server, and expects a
// Success outcome with the acked FSN equal to the target FSN.
func TestQwpSfDrainerDrainsRealOrphan(t *testing.T) {
	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{})
	defer srv.Close()

	dir := t.TempDir()

	// Stand up a "previous session" that wrote frames + closed.
	// Since the engine clears residual files on full drain, we need
	// to leave the slot un-drained. Easiest: use a separate engine
	// with no I/O loop to populate the slot, then close without
	// ACKing.
	const segSize int64 = 4096
	{
		engine, err := qwpSfNewCursorEngine(dir, segSize, qwpSfUnlimitedTotalBytes, time.Second)
		require.NoError(t, err)
		for i := 0; i < 3; i++ {
			_, err := engine.engineAppendBlocking(context.Background(), []byte{byte(i)})
			require.NoError(t, err)
		}
		// Don't acknowledge → engineClose leaves residual .sfa files.
		require.NoError(t, engine.engineClose())
	}
	// Confirm there's a .sfa file to drain.
	entries, err := os.ReadDir(dir)
	require.NoError(t, err)
	hasFile := false
	for _, e := range entries {
		if filepath.Ext(e.Name()) == ".sfa" {
			hasFile = true
		}
	}
	require.True(t, hasFile, "expected leftover .sfa for drainer to pick up")

	// Run a drainer.
	drainer := qwpSfNewOrphanDrainer(
		dir, segSize, qwpSfUnlimitedTotalBytes,
		qwpSfDialFor(srv),
		nil,
		1*time.Second, 10*time.Millisecond, 100*time.Millisecond,
	)
	drainer.drainerRun(context.Background())

	assert.Equal(t, qwpSfDrainOutcomeSuccess, drainer.drainerOutcome())
	assert.Equal(t, drainer.drainerTargetFsn(), drainer.drainerAckedFsn())
	assert.GreaterOrEqual(t, srv.totalFramesReceived.Load(), int64(1))
}

// TestQwpSfDrainerSkipsLockedSlot holds the slot lock while a drainer
// runs and expects the LockedByOther outcome without a .failed
// sentinel being created.
func TestQwpSfDrainerSkipsLockedSlot(t *testing.T) {
	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{})
	defer srv.Close()

	dir := t.TempDir()
	// Hold the slot lock for the duration of the drainer's run.
	lock, err := qwpSfAcquireSlotLock(dir)
	require.NoError(t, err)
	defer func() { _ = lock.close() }()

	drainer := qwpSfNewOrphanDrainer(
		dir, 4096, qwpSfUnlimitedTotalBytes,
		qwpSfDialFor(srv),
		nil,
		1*time.Second, 10*time.Millisecond, 100*time.Millisecond,
	)
	drainer.drainerRun(context.Background())

	assert.Equal(t, qwpSfDrainOutcomeLockedByOther, drainer.drainerOutcome())
	// Locked slots must NOT be marked .failed (contention is normal).
	_, err = os.Stat(filepath.Join(dir, qwpSfFailedSentinelName))
	assert.True(t, os.IsNotExist(err), "drainer wrongly created .failed on lock contention")
}

// TestQwpSfDrainerMarksFailedOnAuthRejection runs a drainer against a
// test server configured with upgradeStatus 401 and expects the Failed
// outcome plus a .failed sentinel whose body mentions "connect".
func TestQwpSfDrainerMarksFailedOnAuthRejection(t *testing.T) {
	authSrv := newQwpSfTestServer(t, qwpSfTestServerOpts{upgradeStatus: 401})
	defer authSrv.Close()

	dir := t.TempDir()
	// Populate the slot with unacked data.
	const segSize int64 = 4096
	{
		engine, err := qwpSfNewCursorEngine(dir, segSize, qwpSfUnlimitedTotalBytes, time.Second)
		require.NoError(t, err)
		_, err = engine.engineAppendBlocking(context.Background(), []byte("data"))
		require.NoError(t, err)
		require.NoError(t, engine.engineClose())
	}

	drainer := qwpSfNewOrphanDrainer(
		dir, segSize, qwpSfUnlimitedTotalBytes,
		qwpSfDialFor(authSrv),
		nil,
		200*time.Millisecond, 10*time.Millisecond, 50*time.Millisecond,
	)
	drainer.drainerRun(context.Background())

	assert.Equal(t, qwpSfDrainOutcomeFailed, drainer.drainerOutcome())
	body, err := os.ReadFile(filepath.Join(dir, qwpSfFailedSentinelName))
	require.NoError(t, err)
	assert.Contains(t, string(body), "connect")
}

// TestQwpSfDrainerSucceedsOnAlreadyDrainedSlot runs a drainer on a
// freshly created, empty slot directory and expects Success.
func TestQwpSfDrainerSucceedsOnAlreadyDrainedSlot(t *testing.T) {
	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{})
	defer srv.Close()
	dir := t.TempDir()

	drainer := qwpSfNewOrphanDrainer(
		dir, 4096, qwpSfUnlimitedTotalBytes,
		qwpSfDialFor(srv),
		nil,
		1*time.Second, 10*time.Millisecond, 100*time.Millisecond,
	)
	drainer.drainerRun(context.Background())

	assert.Equal(t, qwpSfDrainOutcomeSuccess, drainer.drainerOutcome())
}

// TestQwpSfDrainerPoolSubmitAndClose submits three drainers to a pool
// created with an argument of 2, closes the pool, and expects every
// drainer to have left the Pending state and the pool snapshot to be
// empty.
func TestQwpSfDrainerPoolSubmitAndClose(t *testing.T) {
	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{})
	defer srv.Close()

	pool := qwpSfNewDrainerPool(2)
	// NOTE(review): drainerPoolClose is also called explicitly further
	// down, so this defer makes it run a second time — presumably
	// idempotent; confirm against the pool implementation.
	defer pool.drainerPoolClose()

	const segSize int64 = 4096
	dirs := make([]string, 3)
	for i := range dirs {
		dirs[i] = t.TempDir()
		engine, err := qwpSfNewCursorEngine(dirs[i], segSize, qwpSfUnlimitedTotalBytes, time.Second)
		require.NoError(t, err)
		_, err = engine.engineAppendBlocking(context.Background(), []byte{byte(i)})
		require.NoError(t, err)
		require.NoError(t, engine.engineClose())
	}

	drainers := make([]*qwpSfOrphanDrainer, 0, len(dirs))
	for _, dir := range dirs {
		drainer := qwpSfNewOrphanDrainer(
			dir, segSize, qwpSfUnlimitedTotalBytes,
			qwpSfDialFor(srv),
			nil,
			1*time.Second, 10*time.Millisecond, 100*time.Millisecond,
		)
		drainers = append(drainers, drainer)
		require.NoError(t, pool.drainerPoolSubmit(context.Background(), drainer))
	}
	pool.drainerPoolClose()
	// Every submitted drainer must reach a terminal state — we
	// don't strictly require Success since close grace might cut
	// some off, but the outcome must not be PENDING.
	for _, d := range drainers {
		assert.NotEqual(t, qwpSfDrainOutcomePending, d.drainerOutcome())
	}
	// Snapshot must be empty after close: completed drainers are
	// pruned from the active list as their goroutines exit.
	assert.Empty(t, pool.drainerPoolSnapshot())
}

// TestQwpSfDrainerPoolEnforcesConcurrencyCapAtRuntime proves the
// max_background_drainers cap is a *runtime* bound, not just a parsed
// config value: submitting more drainers than the cap must never run
// more than `cap` drainerRun bodies at once. The clientFactory is the
// observation point — it is invoked from inside drainerRun only after
// the goroutine has taken its semaphore slot, so the number of
// concurrent factory entries equals the number of concurrently
// running drainers.
// A factory that parks until the pool's master ctx
// is cancelled holds every slot occupied, so a cap-violating drainer
// (if the semaphore were missing) would show up as a (cap+1)th entry.
func TestQwpSfDrainerPoolEnforcesConcurrencyCapAtRuntime(t *testing.T) {
	// Shorten the package-level close grace for this test and restore
	// it on exit.
	prevGrace := qwpSfDrainerPoolCloseGrace
	qwpSfDrainerPoolCloseGrace = 50 * time.Millisecond
	defer func() { qwpSfDrainerPoolCloseGrace = prevGrace }()

	const (
		maxConcurrent = 2
		total         = 5
	)

	// running counts factories currently parked; peak records the
	// highest value running ever reached.
	var running atomic.Int32
	var peak atomic.Int32
	// Buffered to total so a factory entry never blocks on the signal.
	entered := make(chan struct{}, total)

	// Parks until the pool's master ctx is cancelled (drainerPoolClose).
	blockingFactory := func(ctx context.Context, _ int) (*qwpTransport, error) {
		cur := running.Add(1)
		defer running.Add(-1)
		// Raise peak to cur unless another goroutine already recorded
		// an equal or higher value; retry when the CAS loses a race.
		for {
			p := peak.Load()
			if cur <= p || peak.CompareAndSwap(p, cur) {
				break
			}
		}
		entered <- struct{}{}
		<-ctx.Done()
		return nil, ctx.Err()
	}

	pool := qwpSfNewDrainerPool(maxConcurrent)

	const segSize int64 = 4096
	drainers := make([]*qwpSfOrphanDrainer, total)
	for i := range drainers {
		// Give each drainer its own slot holding one unacked frame.
		dir := t.TempDir()
		engine, err := qwpSfNewCursorEngine(dir, segSize, qwpSfUnlimitedTotalBytes, time.Second)
		require.NoError(t, err)
		_, err = engine.engineAppendBlocking(context.Background(), []byte{byte(i)})
		require.NoError(t, err)
		require.NoError(t, engine.engineClose())

		d := qwpSfNewOrphanDrainer(
			dir, segSize, qwpSfUnlimitedTotalBytes,
			blockingFactory,
			nil,
			time.Second, 10*time.Millisecond, 100*time.Millisecond,
		)
		drainers[i] = d
		require.NoError(t, pool.drainerPoolSubmit(context.Background(), d))
	}

	// Exactly `maxConcurrent` drainers must reach the factory.
	for i := 0; i < maxConcurrent; i++ {
		select {
		case <-entered:
		case <-time.After(2 * time.Second):
			t.Fatalf("only %d drainers entered the factory, want %d", i, maxConcurrent)
		}
	}
	// No further drainer may enter while the first `maxConcurrent`
	// hold their slots — the rest are parked on the semaphore.
	select {
	case <-entered:
		t.Fatalf("a %dth drainer entered the factory: runtime cap not enforced", maxConcurrent+1)
	case <-time.After(250 * time.Millisecond):
	}
	assert.LessOrEqual(t, peak.Load(), int32(maxConcurrent),
		"at most %d drainers may run concurrently, observed peak %d", maxConcurrent, peak.Load())

	// Close cancels the master ctx; parked factories unwind, the
	// queued drainers never enter. The cap must still hold.
	pool.drainerPoolClose()
	assert.LessOrEqual(t, peak.Load(), int32(maxConcurrent),
		"concurrency cap must hold across the full run, observed peak %d", peak.Load())
	for i, d := range drainers {
		assert.NotEqual(t, qwpSfDrainOutcomePending, d.drainerOutcome(),
			"drainer %d still pending after close", i)
	}
	assert.Empty(t, pool.drainerPoolSnapshot())
}

// Regression: a drainer parked inside clientFactory(ctx) — e.g. a
// long-running TCP dial / WS upgrade against a black-holed peer —
// must not survive past drainerPoolClose. The pool cancels its
// master ctx after the polite-stop grace; the dial unwinds; the
// drainer goroutine exits.
func TestQwpSfDrainerPoolCancelsBlockingDialOnClose(t *testing.T) {
	// Shorten the package-level close grace for this test and restore
	// it on exit.
	prevGrace := qwpSfDrainerPoolCloseGrace
	qwpSfDrainerPoolCloseGrace = 50 * time.Millisecond
	defer func() { qwpSfDrainerPoolCloseGrace = prevGrace }()

	// Populate the slot with one unacked frame.
	dir := t.TempDir()
	engine, err := qwpSfNewCursorEngine(dir, 4096, qwpSfUnlimitedTotalBytes, time.Second)
	require.NoError(t, err)
	_, err = engine.engineAppendBlocking(context.Background(), []byte("data"))
	require.NoError(t, err)
	require.NoError(t, engine.engineClose())

	dialEntered := make(chan struct{}, 1)
	// Signals entry (non-blocking send) and then parks until ctx is
	// cancelled.
	blockingFactory := func(ctx context.Context, _ int) (*qwpTransport, error) {
		select {
		case dialEntered <- struct{}{}:
		default:
		}
		<-ctx.Done()
		return nil, ctx.Err()
	}

	pool := qwpSfNewDrainerPool(1)
	drainer := qwpSfNewOrphanDrainer(
		dir, 4096, qwpSfUnlimitedTotalBytes,
		blockingFactory,
		nil,
		1*time.Second, 10*time.Millisecond, 100*time.Millisecond,
	)
	require.NoError(t, pool.drainerPoolSubmit(context.Background(), drainer))

	// Make sure the drainer is actually parked in the dial before
	// we close — otherwise we'd be testing the polite-stop path.
	select {
	case <-dialEntered:
	case <-time.After(2 * time.Second):
		t.Fatal("drainer never entered clientFactory")
	}

	// Run close on its own goroutine so a hang shows up as a timeout
	// instead of wedging the test.
	closeDone := make(chan struct{})
	go func() {
		pool.drainerPoolClose()
		close(closeDone)
	}()
	select {
	case <-closeDone:
	case <-time.After(2 * time.Second):
		t.Fatal("drainerPoolClose did not return after grace + ctx cancel")
	}

	// Drainer must have exited cleanly as Stopped (not Failed) —
	// a ctx-cancel during dial should NOT leave a .failed sentinel
	// in the slot, since the slot is still recoverable.
	assert.Equal(t, qwpSfDrainOutcomeStopped, drainer.drainerOutcome())
	_, statErr := os.Stat(filepath.Join(dir, qwpSfFailedSentinelName))
	assert.True(t, os.IsNotExist(statErr), "must not leave .failed sentinel on close-during-dial")

	// Active list must be pruned: drainer goroutine has exited.
	assert.Empty(t, pool.drainerPoolSnapshot())
}

// TestQwpSfDrainerPoolBoundedOnUncancellableDrainer is a regression
// test for M15: a drainer wedged in I/O the master-ctx cancel cannot
// reach — modelled here by a clientFactory that ignores its ctx, the
// way drainerRun's engine-open flock / mmap / CRC scan does — must
// not make drainerPoolClose hang forever. After the polite grace and
// the post-cancel hard grace both elapse, close abandons the
// straggler and returns; the slot stays adoptable.
func TestQwpSfDrainerPoolBoundedOnUncancellableDrainer(t *testing.T) {
	// Shorten both package-level grace periods for this test and
	// restore them on exit.
	prevGrace := qwpSfDrainerPoolCloseGrace
	prevHard := qwpSfDrainerPoolHardCloseGrace
	qwpSfDrainerPoolCloseGrace = 50 * time.Millisecond
	qwpSfDrainerPoolHardCloseGrace = 50 * time.Millisecond
	defer func() {
		qwpSfDrainerPoolCloseGrace = prevGrace
		qwpSfDrainerPoolHardCloseGrace = prevHard
	}()

	// Populate the slot with one unacked frame.
	dir := t.TempDir()
	engine, err := qwpSfNewCursorEngine(dir, 4096, qwpSfUnlimitedTotalBytes, time.Second)
	require.NoError(t, err)
	_, err = engine.engineAppendBlocking(context.Background(), []byte("data"))
	require.NoError(t, err)
	require.NoError(t, engine.engineClose())

	// A factory that ignores its ctx stands in for a drainer wedged in
	// I/O the master-ctx cancel cannot interrupt.
	block := make(chan struct{})
	defer close(block) // release at test end so the goroutine unwinds
	entered := make(chan struct{}, 1)
	wedgeFactory := func(_ context.Context, _ int) (*qwpTransport, error) {
		select {
		case entered <- struct{}{}:
		default:
		}
		<-block // ignores ctx
		return nil, errors.New("released")
	}

	pool := qwpSfNewDrainerPool(1)
	drainer := qwpSfNewOrphanDrainer(
		dir, 4096, qwpSfUnlimitedTotalBytes,
		wedgeFactory,
		nil,
		time.Second, 10*time.Millisecond, 100*time.Millisecond,
	)
	require.NoError(t, pool.drainerPoolSubmit(context.Background(), drainer))

	select {
	case <-entered:
	case <-time.After(2 * time.Second):
		t.Fatal("drainer never entered the factory")
	}

	// Run close on its own goroutine so a hang shows up as a timeout.
	closeDone := make(chan struct{})
	go func() {
		pool.drainerPoolClose()
		close(closeDone)
	}()
	select {
	case <-closeDone:
	case <-time.After(2 * time.Second):
		t.Fatal("drainerPoolClose hung on an un-cancellable drainer")
	}

	// Abandoned, not joined: the goroutine is still parked in the
	// factory, so it is still tracked and still Pending. Its slot is
	// left intact (no .failed sentinel) for a future sender to adopt.
	assert.NotEmpty(t, pool.drainerPoolSnapshot(),
		"wedged drainer must still be tracked (abandoned, not joined)")
	assert.Equal(t, qwpSfDrainOutcomePending, drainer.drainerOutcome())
	_, statErr := os.Stat(filepath.Join(dir, qwpSfFailedSentinelName))
	assert.True(t, os.IsNotExist(statErr), "must not quarantine an abandoned slot")
}

// TestQwpSfDrainerPoolRejectsAfterClose checks that submitting to a
// closed pool returns an error whose message contains "closed".
func TestQwpSfDrainerPoolRejectsAfterClose(t *testing.T) {
	pool := qwpSfNewDrainerPool(1)
	pool.drainerPoolClose()
	d := qwpSfNewOrphanDrainer(t.TempDir(), 4096, qwpSfUnlimitedTotalBytes,
		nil, nil, time.Second, 10*time.Millisecond, 100*time.Millisecond)
	err := pool.drainerPoolSubmit(context.Background(), d)
	require.Error(t, err)
	assert.Contains(t, err.Error(), "closed")
}

// TestQwpSfDrainerUsesSharedTracker verifies the Phase 5 wiring:
// a drainer constructed with a shared tracker records its initial
// dial outcome onto that tracker (idx=0 becomes Healthy), so
// foreground PickNext observations are kept consistent across
// every caller drawing from the same connect-string addr= list.
func TestQwpSfDrainerUsesSharedTracker(t *testing.T) {
	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{})
	defer srv.Close()

	dir := t.TempDir()
	const segSize int64 = 4096
	{
		engine, err := qwpSfNewCursorEngine(dir, segSize, qwpSfUnlimitedTotalBytes, time.Second)
		require.NoError(t, err)
		_, err = engine.engineAppendBlocking(context.Background(), []byte("drainme"))
		require.NoError(t, err)
		require.NoError(t, engine.engineClose())
	}

	tracker := newQwpHostTracker(1, "", qwpTargetAny)
	drainer := qwpSfNewOrphanDrainer(
		dir, segSize, qwpSfUnlimitedTotalBytes,
		qwpSfDialFor(srv),
		tracker,
		1*time.Second, 10*time.Millisecond, 100*time.Millisecond,
	)
	drainer.drainerRun(context.Background())
	require.Equal(t, qwpSfDrainOutcomeSuccess, drainer.drainerOutcome())

	// The shared tracker must now show host 0 as Healthy — the
	// drainer's bind landed there and reported success.
	snap := tracker.snapshot()
	assert.Equal(t, qwpHostHealthy, snap[0].state,
		"shared tracker must reflect drainer's successful bind")
}

// TestSfConfDrainOrphansEndToEnd pre-populates an orphan slot under the
// sf_dir root, opens a sender from a config string with
// drain_orphans=on, and waits for the orphan's .sfa files to disappear.
func TestSfConfDrainOrphansEndToEnd(t *testing.T) {
	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{})
	defer srv.Close()

	root := t.TempDir()
	// Pre-populate an orphan slot with un-drained data.
	orphanDir := filepath.Join(root, "old-sender")
	require.NoError(t, os.MkdirAll(orphanDir, 0o755))
	{
		engine, err := qwpSfNewCursorEngine(orphanDir, 4096, qwpSfUnlimitedTotalBytes, time.Second)
		require.NoError(t, err)
		_, err = engine.engineAppendBlocking(context.Background(), []byte("orphaned-frame"))
		require.NoError(t, err)
		require.NoError(t, engine.engineClose())
	}

	addr := strings.TrimPrefix(srv.URL, "http://")
	confStr := strings.Join([]string{
		"ws::addr=" + addr,
		"sf_dir=" + root,
		"sender_id=foreground",
		"drain_orphans=on",
		"max_background_drainers=2",
		"close_flush_timeout_millis=2000;",
	}, ";")
	ls, err := LineSenderFromConf(context.Background(), confStr)
	require.NoError(t, err)

	// NOTE(review): ls is closed only on the success path below. If the
	// Eventually check fails the test aborts and the sender is never
	// closed — consider registering the close with t.Cleanup.

	// Wait briefly for the drainer to consume the orphan frame.
	require.Eventually(t, func() bool {
		entries, _ := os.ReadDir(orphanDir)
		for _, e := range entries {
			if filepath.Ext(e.Name()) == ".sfa" {
				return false
			}
		}
		return true
	}, 5*time.Second, 50*time.Millisecond)

	require.NoError(t, ls.Close(context.Background()))
	// At least the orphan frame must have reached the server.
	assert.GreaterOrEqual(t, srv.totalFramesReceived.Load(), int64(1))
}

// Regression: a server that completes the WS upgrade and accepts our
// frames but never ACKs and never drops the connection must not wedge
// the drainer forever. Without a no-progress watchdog the drain loop
// spins on the poll interval indefinitely; on Close it would exit
// Stopped (no .failed sentinel), so every future process start would
// re-adopt the same slot in full — an unbounded re-adoption livelock.
+// The watchdog must quarantine the slot with a .failed sentinel after +// reconnectMaxDuration of zero ACK progress on a live connection. +func TestQwpSfDrainerMarksFailedWhenConnectedButNeverAcked(t *testing.T) { + // silentAcks: read frames forever, never ACK, keep the + // connection open — exactly the wedged-but-connected scenario. + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{silentAcks: true}) + defer srv.Close() + + dir := t.TempDir() + const segSize int64 = 4096 + { + engine, err := qwpSfNewCursorEngine(dir, segSize, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + _, err = engine.engineAppendBlocking(context.Background(), []byte("data")) + require.NoError(t, err) + require.NoError(t, engine.engineClose()) + } + + // reconnectMaxDuration doubles as the no-progress budget. Keep it + // short so the watchdog fires quickly; the connection stays up + // the whole time, so the (separately bounded) reconnect path is + // never entered and cannot mask the watchdog. + drainer := qwpSfNewOrphanDrainer( + dir, segSize, qwpSfUnlimitedTotalBytes, + qwpSfDialFor(srv), + nil, + 300*time.Millisecond, 10*time.Millisecond, 50*time.Millisecond, + ) + + done := make(chan struct{}) + go func() { + drainer.drainerRun(context.Background()) + close(done) + }() + select { + case <-done: + case <-time.After(15 * time.Second): + t.Fatal("drainer never terminated — no-progress watchdog missing (livelock)") + } + + assert.Equal(t, qwpSfDrainOutcomeFailed, drainer.drainerOutcome()) + body, err := os.ReadFile(filepath.Join(dir, qwpSfFailedSentinelName)) + require.NoError(t, err) + assert.Contains(t, string(body), "no drain progress") + // The slot now carries .sfa + .failed, so it is no longer a + // re-adoption candidate: a future process start won't re-adopt it. 
+ assert.False(t, qwpSfIsCandidateOrphan(dir), + "slot must be quarantined (not a re-adoption candidate) after the watchdog fires") +} diff --git a/qwp_sf_ring.go b/qwp_sf_ring.go new file mode 100644 index 00000000..829cae94 --- /dev/null +++ b/qwp_sf_ring.go @@ -0,0 +1,670 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "sort" + "strings" + "sync" + "sync/atomic" +) + +// qwpSfRing append/seal sentinels. +const ( + // qwpSfBackpressureNoSpare: append failed because no hot spare was + // available to rotate into. The caller spins / parks; the segment + // manager polls and provisions a spare. + qwpSfBackpressureNoSpare int64 = -1 + // qwpSfPayloadTooLarge: append failed because the payload doesn't + // fit in a fresh segment. Terminal for that frame. + qwpSfPayloadTooLarge int64 = -2 +) + +// qwpSfErrPayloadTooLarge surfaces qwpSfPayloadTooLarge to the caller +// as an error value, avoiding magic-number comparisons in user code. 
+// +//lint:ignore ST1012 prefix kept for grouping with other qwpSf* errors +var qwpSfErrPayloadTooLarge = errors.New("qwp/sf: payload too large for segment") + +// qwpSfErrRingClosed is returned from installHotSpare when the ring +// has been closed since the manager started provisioning the spare. +// +//lint:ignore ST1012 prefix kept for grouping with other qwpSf* errors +var qwpSfErrRingClosed = errors.New("qwp/sf: ring closed") + +// qwpSfSegmentRing is a chain of qwpSfSegments presented to the user +// thread as one logical append-only log keyed by frame sequence +// number (FSN). Owns segment lifecycle: rotation when the active +// segment fills, ACK-driven trim of the oldest sealed segments. +// +// Built for the cursor engine's split-brain threading: +// - Producer goroutine (single user goroutine): appendOrFsn, +// installHotSpare consumer side, publishedFsn. +// - I/O goroutine: publishedFsn (read-only), acknowledge (single +// writer), nextSealedAfter, firstSealed, findSegmentContaining. +// - Segment-manager goroutine: needsHotSpare, installHotSpare, +// drainTrimmable on its own cadence. +// +// Backpressure model: appendOrFsn returns qwpSfBackpressureNoSpare +// when the active is full and no spare is available. The caller (the +// engine) is expected to spin-park until the segment manager catches +// up, OR until acknowledge advances ackedFsn far enough that the +// manager can recycle a sealed segment. +type qwpSfSegmentRing struct { + maxBytesPerSegment int64 + signalAtBytes int64 + + // active and hotSpare are accessed cross-thread. Producer writes; + // I/O thread and manager read. atomic.Pointer mirrors the Java + // volatile reference contract. + active atomic.Pointer[qwpSfSegment] + hotSpare atomic.Pointer[qwpSfSegment] + + // ackedFsn and publishedFsn are atomic int64s shared with readers. + // Both start at -1 (no ACK / no publish yet). 
+ ackedFsn atomic.Int64 + publishedFsn atomic.Int64 + + // ackNotify is a broadcast channel that acknowledge closes and + // replaces each time it advances ackedFsn, so a blocked waiter + // (AwaitAckedFsn) wakes immediately instead of polling. Lazily + // created by the first subscriber and nil whenever nobody is + // waiting, so an ACK with no waiter costs only the mutex. Guarded + // by ackNotifyMu; lives off the producer hot path (acknowledge runs + // on the I/O goroutine). + ackNotifyMu sync.Mutex + ackNotify chan struct{} + + // nextSeq is the FSN that appendOrFsn will assign next. + // Producer-only mutator (single-threaded), but the segment + // manager goroutine reads it via nextSeqHint to seed a fresh + // spare's baseSeq, so the field has to be atomic to avoid a + // torn-read race under -race. + nextSeq atomic.Int64 + + // mu protects sealedSegments and serialises against close. It also + // covers the producer's mutation when adding a sealed segment to + // the list. + mu sync.Mutex + sealedSegments []*qwpSfSegment + closed bool + + // managerWakeup is invoked by the producer on rotation or + // high-water-mark crossings to ask the manager to provision a + // fresh spare immediately. Producer-thread-only field; set once + // before producing starts. + managerWakeup func() + // sendLoopWakeup is invoked by the producer after every publish + // so an idle send loop reacts immediately instead of polling. + // Producer-thread-only field; set once before producing starts. + // nil in unit tests that drive the ring without a send loop. + sendLoopWakeup func() + // wakeupRequestedForActive coalesces multiple high-water-mark + // crossings into a single backup manager unpark per active segment. + // Set when that backup wakeup fires; reset on rotation so each + // freshly promoted active segment gets its own one-shot backup. 
+ wakeupRequestedForActive bool +} + +// qwpSfNewSegmentRing creates a ring with the given segment cap and an +// already-prepared initial active segment. The initial segment must +// be empty (just headers, frameCount == 0); typically supplied by the +// engine at startup. +func qwpSfNewSegmentRing(initialActive *qwpSfSegment, maxBytesPerSegment int64) *qwpSfSegmentRing { + if initialActive == nil { + panic("qwp/sf: initialActive must not be nil") + } + r := &qwpSfSegmentRing{ + maxBytesPerSegment: maxBytesPerSegment, + signalAtBytes: (maxBytesPerSegment >> 2) * 3, + } + r.active.Store(initialActive) + // Initialize counters from the segment's recovery state. For a + // fresh segment, frameCount == 0, so nextSeq == baseSeq and + // publishedFsn == nextSeq - 1 == -1 (or baseSeq-1 for a + // rebased-recovered segment). + frameCount := initialActive.segmentFrameCount() + r.nextSeq.Store(initialActive.segmentBaseSeq() + frameCount) + if frameCount > 0 { + r.publishedFsn.Store(r.nextSeq.Load() - 1) + } else { + r.publishedFsn.Store(-1) + } + r.ackedFsn.Store(-1) + return r +} + +// qwpSfOpenRing recovers a ring from segments already on disk in +// sfDir. Used at sender startup when the user's previous session +// left durable but not-yet-acked frames behind. Walks every *.sfa +// file in the directory, opens each via qwpSfOpenSegment, and +// arranges them by baseSeq: +// - Highest-baseSeq segment becomes the active. +// - All others become sealed segments awaiting ACK and trim. +// +// Returns a nil ring (and nil error) if the directory is missing or +// holds no .sfa segment with recoverable frames. Any file that +// qwpSfOpenSegment rejects is skipped, whatever the error, so one stray +// file cannot take the whole sender down; an FSN gap between the +// surviving segments is fatal and fails the recovery with an error. 
+func qwpSfOpenRing(sfDir string, maxBytesPerSegment int64) (*qwpSfSegmentRing, error) { + if _, err := os.Stat(sfDir); err != nil { + if os.IsNotExist(err) { + return nil, nil + } + return nil, fmt.Errorf("qwp/sf: stat %s: %w", sfDir, err) + } + entries, err := os.ReadDir(sfDir) + if err != nil { + return nil, fmt.Errorf("qwp/sf: read %s: %w", sfDir, err) + } + var opened []*qwpSfSegment + // Defense-in-depth: anything escaping the recovery body — a panic + // from native munmap, an OOM from a future concurrent allocator, + // the FSN-gap error below — must close every recovered fd+mmap + // before propagating. After the success path opened is reassigned + // to drop the active segment (transferred to the ring) and the + // sealed segments (transferred to ring.sealedSegments), so this + // cleanup is a no-op once we reach the bottom. + defer func() { + for _, s := range opened { + _ = s.close() + } + }() + for _, e := range entries { + name := e.Name() + if e.IsDir() || !strings.HasSuffix(name, ".sfa") { + continue + } + path := filepath.Join(sfDir, name) + seg, err := qwpSfOpenSegment(path) + if err != nil { + // Stray file with the .sfa extension but bad header / + // unreadable: skip rather than fail the recovery. The + // engine will log when it surfaces this case via the + // returned ring. + continue + } + // Filter out empty leftovers — typically hot-spare segments + // the manager pre-allocated for a prior session that never + // got rotated into active. They carry the provisional + // baseSeq=0 and frameCount=0, which would otherwise collide + // with the real baseSeq=0 segment and trip the contiguity + // check below. No data to recover; close and unlink. + // + // CAUTION: only unlink when the file is genuinely empty past + // the header. If frame[0] failed CRC (bit-rot, partial-page- + // write at crash, etc.) but valid frames followed, scanFrames + // returns lastGood=HEADER_SIZE and frameCount=0 — yet + // tornTailBytes is non-zero. 
Treating that as "empty hot + // spare" would silently destroy every surviving frame. + // Quarantine to .corrupt instead so a postmortem can + // recover what's left. + if seg.segmentFrameCount() == 0 { + torn := seg.segmentTornTailBytes() + _ = seg.close() + if torn > 0 { + _ = os.Rename(path, path+".corrupt") + } else { + _ = os.Remove(path) + } + continue + } + opened = append(opened, seg) + } + if len(opened) == 0 { + return nil, nil + } + sort.Slice(opened, func(i, j int) bool { + // Unsigned comparison to match Java's Long.compareUnsigned — + // future-proofs against baseSeq wrapping into negatives. + return uint64(opened[i].segmentBaseSeq()) < uint64(opened[j].segmentBaseSeq()) + }) + // Sanity: the recovered segments must form a contiguous FSN + // range. Detect gaps so a partial-write/manual-deletion mishap + // doesn't silently produce duplicate or missing FSNs. The deferred + // cleanup above handles closing on the error path. + for i := 1; i < len(opened); i++ { + prev := opened[i-1] + curr := opened[i] + expected := prev.segmentBaseSeq() + prev.segmentFrameCount() + if curr.segmentBaseSeq() != expected { + return nil, fmt.Errorf( + "qwp/sf: FSN gap in recovered segments: prev baseSeq=%d frameCount=%d expected next baseSeq=%d but got %d", + prev.segmentBaseSeq(), prev.segmentFrameCount(), expected, curr.segmentBaseSeq()) + } + } + // The newest segment becomes the active. Even if it's full, that's + // OK: the next appendOrFsn returns BACKPRESSURE_NO_SPARE, the + // manager installs a hot spare, the producer rotates. + last := len(opened) - 1 + active := opened[last] + sealed := opened[:last] + r := qwpSfNewSegmentRing(active, maxBytesPerSegment) + r.sealedSegments = sealed + // Ownership transferred to the ring — clear opened so the deferred + // cleanup leaves the recovered segments alone. + opened = nil + return r, nil +} + +// segmentRingAckedFsn returns the highest FSN that the server has +// ACK'd. 
Read by the segment manager to decide which sealed segments +// are safe to munmap + unlink. +func (r *qwpSfSegmentRing) segmentRingAckedFsn() int64 { + return r.ackedFsn.Load() +} + +// acknowledge advances the ACK cursor. seq is cumulative — the +// server has confirmed every FSN up to and including this value. +// Idempotent: a second call with the same or smaller value is a +// no-op. +// +// Defense-in-depth: clamp at publishedFsn so a malformed/poisoned +// server response with a bogus wireSeq cannot move ackedFsn past +// what the producer has actually written. Without the clamp, the +// segment manager could trim segments the I/O thread is still +// iterating and SEGV the process on the next mmap read. +func (r *qwpSfSegmentRing) acknowledge(seq int64) { + pub := r.publishedFsn.Load() + if seq > pub { + seq = pub + } + for { + cur := r.ackedFsn.Load() + if seq <= cur { + return + } + if r.ackedFsn.CompareAndSwap(cur, seq) { + // ackedFsn moved — wake any AwaitAckedFsn waiters. Done after + // the store so a woken waiter that re-reads ackedFsn observes + // the new value (close happens-before the receive that wakes + // it). + r.notifyAckAdvance() + return + } + } +} + +// segmentRingAckNotify returns a channel that is closed the next time +// acknowledge advances ackedFsn. The contract for a no-lost-wakeup +// wait is: subscribe (call this) first, then read segmentRingAckedFsn, +// then block on the returned channel — acknowledge's atomic store of +// the new FSN precedes its close of this channel, so any advance that +// races the FSN read still wakes the waiter via the closed channel. +func (r *qwpSfSegmentRing) segmentRingAckNotify() <-chan struct{} { + r.ackNotifyMu.Lock() + defer r.ackNotifyMu.Unlock() + if r.ackNotify == nil { + r.ackNotify = make(chan struct{}) + } + return r.ackNotify +} + +// notifyAckAdvance wakes every current ack-notify subscriber and clears +// the channel so the next subscriber lazily installs a fresh one. 
A +// no-op (just the mutex) when nobody is waiting, which is the common +// case — only AwaitAckedFsn subscribes. +func (r *qwpSfSegmentRing) notifyAckAdvance() { + r.ackNotifyMu.Lock() + ch := r.ackNotify + r.ackNotify = nil + r.ackNotifyMu.Unlock() + if ch != nil { + close(ch) + } +} + +// appendOrFsn is the single-producer append path. Reserves an FSN, +// writes the frame into the active segment, advances publishedFsn. +// Returns the assigned FSN on success, or one of the +// qwpSfBackpressureNoSpare / qwpSfPayloadTooLarge sentinels on +// failure. +// +// Rotation is automatic: when the active is full, the hot spare (if +// installed) is promoted, the previous active joins the sealed list, +// and the segment manager is signaled (implicitly by polling, plus +// explicitly via managerWakeup) to prepare the next spare. +func (r *qwpSfSegmentRing) appendOrFsn(payload []byte) int64 { + active := r.active.Load() + off, err := active.tryAppend(payload) + if err != nil { + if !errors.Is(err, qwpSfErrSegmentFull) { + // Unexpected error from tryAppend (negative len, etc.). + // Surface as PAYLOAD_TOO_LARGE — the only programmatic + // failure mode the producer can act on. + return qwpSfPayloadTooLarge + } + // Active is full. Try to rotate. + spare := r.hotSpare.Load() + if spare == nil { + return qwpSfBackpressureNoSpare + } + // Pin the spare's baseSeq to whatever the active's nextSeq + // actually is right now. This is the right moment because + // (a) the active is full so its frameCount is stable, and + // (b) the spare hasn't been appended to yet (rebaseSeq + // enforces that). The segment manager's earlier guess at + // baseSeq is irrelevant. + actualBase := active.segmentBaseSeq() + active.segmentFrameCount() + if rebaseErr := spare.rebaseSeq(actualBase); rebaseErr != nil { + // Spare already has appended frames — programming error. 
+ // Surface as PAYLOAD_TOO_LARGE (the most actionable + // failure code) so the user sees a clear error rather + // than silent corruption. + return qwpSfPayloadTooLarge + } + // Mutate sealedSegments under the same mutex used by the + // snapshot accessors — the I/O thread reads through that + // path and must not see a half-resized slice. + r.mu.Lock() + r.sealedSegments = append(r.sealedSegments, active) + r.mu.Unlock() + r.active.Store(spare) + r.hotSpare.Store(nil) + // The freshly promoted active has no spare behind it yet, so + // re-arm its one-shot backup wakeup: a later high-water-mark + // crossing on this new segment must be able to nudge the manager + // again if the next spare is slow to arrive. The unconditional + // wakeup just below is the separate "make the next spare" signal. + r.wakeupRequestedForActive = false + // Fresh active just consumed the spare → ask the manager to + // start making the next one immediately. + if w := r.managerWakeup; w != nil { + w() + } + off, err = spare.tryAppend(payload) + if err != nil { + // Doesn't fit even in a fresh segment — payload is + // genuinely too big. + return qwpSfPayloadTooLarge + } + } else if !r.wakeupRequestedForActive && + r.hotSpare.Load() == nil && + r.managerWakeup != nil && + active.publishedOffset() >= r.signalAtBytes { + // Backup signal: we're past the high-water mark and still + // don't have a spare. Fire once per active segment. + r.wakeupRequestedForActive = true + r.managerWakeup() + } + _ = off // offset is not used by callers; kept for parity with the Java return. + fsn := r.nextSeq.Load() + r.nextSeq.Store(fsn + 1) + r.publishedFsn.Store(fsn) + // Ring the send loop's doorbell after publishedFsn is visible so + // a woken loop is guaranteed to observe this frame (the atomic + // store happens-before the channel send). Non-blocking and + // alloc-free; nil in send-loop-less unit tests. 
+ if w := r.sendLoopWakeup; w != nil { + w() + } + return fsn +} + +// segmentRingClose releases all segments and marks the ring closed. +// Subsequent installHotSpare calls return qwpSfErrRingClosed. Segments +// are closed in the order active, hot spare, then sealed (oldest first); +// the first close error is returned and the remaining closes still run. +func (r *qwpSfSegmentRing) segmentRingClose() error { + r.mu.Lock() + r.closed = true + sealed := r.sealedSegments + r.sealedSegments = nil + r.mu.Unlock() + + var firstErr error + if a := r.active.Swap(nil); a != nil { + if err := a.close(); err != nil && firstErr == nil { + firstErr = err + } + } + if hs := r.hotSpare.Swap(nil); hs != nil { + if err := hs.close(); err != nil && firstErr == nil { + firstErr = err + } + } + for _, s := range sealed { + if s == nil { + continue + } + if err := s.close(); err != nil && firstErr == nil { + firstErr = err + } + } + return firstErr +} + +// drainTrimmable removes and returns sealed segments whose every +// frame has been ACK'd (i.e. baseSeq + frameCount - 1 <= ackedFsn). +// Caller takes ownership and is responsible for close() + unlinking +// the file. Called by the segment manager off the hot path. Returns +// nil when nothing is eligible (avoids slice allocation in the +// steady state where most polls are no-ops). +func (r *qwpSfSegmentRing) drainTrimmable() []*qwpSfSegment { + r.mu.Lock() + defer r.mu.Unlock() + acked := r.ackedFsn.Load() + var out []*qwpSfSegment + // Sealed segments are in baseSeq order, oldest first; once we hit + // one that isn't fully acked, none of the later ones can be either. + for len(r.sealedSegments) > 0 { + s := r.sealedSegments[0] + lastSeq := s.segmentBaseSeq() + s.segmentFrameCount() - 1 + if lastSeq > acked { + break + } + out = append(out, s) + r.sealedSegments = r.sealedSegments[1:] + } + return out +} + +// getActiveSegment returns the active segment — exposed for the I/O +// thread's "send next batch" path. Returns nil after the ring has +// been closed. 
+func (r *qwpSfSegmentRing) getActiveSegment() *qwpSfSegment { + return r.active.Load() +} + +// getSealedSegments returns a direct view of sealed segments +// (oldest first). NOT thread-safe — use only from the producer +// goroutine, or alongside a lock that excludes concurrent rotation. +// Cross-thread readers (typically the I/O loop) should use +// snapshotSealedSegments instead. +func (r *qwpSfSegmentRing) getSealedSegments() []*qwpSfSegment { + return r.sealedSegments +} + +// snapshotSealedSegments copies references into the caller-supplied +// target slice (oldest first, packed left). Returns the number of +// references copied. If target is too small, copies the first +// len(target) references and returns -1 as a signal that the caller +// needs to grow the buffer and retry. +// +// Mutex-protected against rotation. Cost is one Lock/Unlock per +// call, paid by the I/O loop at most once per tick. +func (r *qwpSfSegmentRing) snapshotSealedSegments(target []*qwpSfSegment) int { + r.mu.Lock() + defer r.mu.Unlock() + n := len(r.sealedSegments) + if n > len(target) { + copy(target, r.sealedSegments[:len(target)]) + return -1 + } + copy(target, r.sealedSegments) + return n +} + +// nextSealedAfter returns the sealed segment whose baseSeq +// immediately follows current.baseSeq, or nil if no such segment +// exists. Used by the I/O loop to walk forward through the sealed +// list one segment at a time without snapshotting the whole list — +// important when the producer outpaces the I/O thread. +// +// Identity match is intentionally avoided: we compare baseSeq so the +// loop is robust against current having been trimmed out from under +// us — we still return the next segment in baseSeq order rather than +// failing. 
+func (r *qwpSfSegmentRing) nextSealedAfter(current *qwpSfSegment) *qwpSfSegment { + r.mu.Lock() + defer r.mu.Unlock() + currentBase := current.segmentBaseSeq() + for _, s := range r.sealedSegments { + if s.segmentBaseSeq() > currentBase { + return s + } + } + return nil +} + +// firstSealed returns the oldest sealed segment, or nil if the +// sealed list is empty. +func (r *qwpSfSegmentRing) firstSealed() *qwpSfSegment { + r.mu.Lock() + defer r.mu.Unlock() + if len(r.sealedSegments) > 0 { + return r.sealedSegments[0] + } + return nil +} + +// sealedSegmentCount returns the number of sealed segments under the +// ring mutex. Thread-safe sibling of getSealedSegments for callers +// (e.g. tests) that observe the ring while the segment manager +// concurrently trims via drainTrimmable. +func (r *qwpSfSegmentRing) sealedSegmentCount() int { + r.mu.Lock() + defer r.mu.Unlock() + return len(r.sealedSegments) +} + +// findSegmentContaining returns the segment whose published frame +// range covers fsn, or nil if no segment currently holds it. +// Walks sealed first (oldest → newest) then the active. +func (r *qwpSfSegmentRing) findSegmentContaining(fsn int64) *qwpSfSegment { + r.mu.Lock() + defer r.mu.Unlock() + for _, s := range r.sealedSegments { + base := s.segmentBaseSeq() + if fsn >= base && fsn < base+s.segmentFrameCount() { + return s + } + } + a := r.active.Load() + if a != nil { + base := a.segmentBaseSeq() + if fsn >= base && fsn < base+a.segmentFrameCount() { + return a + } + } + return nil +} + +// installHotSpare parks a freshly-created spare. The producer +// consumes it on its next rotation. Returns an error if a spare is +// already installed (the manager should have polled needsHotSpare +// first; double-install is a programming error), or if the ring has +// been closed since the manager started provisioning the spare. The +// latter is a benign race — the manager's catch block closes the +// unused spare and unlinks its file. 
+func (r *qwpSfSegmentRing) installHotSpare(spare *qwpSfSegment) error { + if spare == nil { + return errors.New("qwp/sf: spare must not be nil") + } + r.mu.Lock() + defer r.mu.Unlock() + if r.closed { + return qwpSfErrRingClosed + } + if r.hotSpare.Load() != nil { + return errors.New("qwp/sf: hot spare already installed") + } + r.hotSpare.Store(spare) + return nil +} + +// totalSegmentBytes returns the sum of all segment sizes the ring +// currently owns: active + hot spare (if installed) + every sealed +// segment. Used by qwpSfSegmentManager to seed its totalBytes +// accounting at register time and reverse it at deregister time. +func (r *qwpSfSegmentRing) totalSegmentBytes() int64 { + r.mu.Lock() + defer r.mu.Unlock() + var total int64 + if a := r.active.Load(); a != nil { + total += a.segmentSize() + } + if hs := r.hotSpare.Load(); hs != nil { + total += hs.segmentSize() + } + for _, s := range r.sealedSegments { + if s != nil { + total += s.segmentSize() + } + } + return total +} + +// setManagerWakeup registers a callback the producer goroutine will +// invoke when a hot spare is needed — either right after a rotation +// has consumed the previous spare, or when the active segment +// crosses the 75% high-water mark while no spare is installed. Set +// once before producing starts; idempotent re-set is allowed but not +// thread-safe. +func (r *qwpSfSegmentRing) setManagerWakeup(wakeup func()) { + r.managerWakeup = wakeup +} + +// setSendLoopWakeup installs the callback appendOrFsn rings after +// every publish so the send loop drains promptly without polling. +// Set once before producing starts; not thread-safe. +func (r *qwpSfSegmentRing) setSendLoopWakeup(wakeup func()) { + r.sendLoopWakeup = wakeup +} + +// needsHotSpare reports whether the segment manager should provision +// a fresh spare for this ring. 
+func (r *qwpSfSegmentRing) needsHotSpare() bool { + return r.hotSpare.Load() == nil +} + +// nextSeqHint returns the next FSN appendOrFsn will assign — useful +// for the segment manager to know what baseSeq to stamp the next +// spare with (provisional; rebased at rotation). +func (r *qwpSfSegmentRing) nextSeqHint() int64 { + return r.nextSeq.Load() +} + +// segmentRingPublishedFsn returns the highest FSN whose frame is +// fully written and visible to consumers. Returns -1 when nothing +// has been appended yet. +func (r *qwpSfSegmentRing) segmentRingPublishedFsn() int64 { + return r.publishedFsn.Load() +} diff --git a/qwp_sf_ring_test.go b/qwp_sf_ring_test.go new file mode 100644 index 00000000..df7a1136 --- /dev/null +++ b/qwp_sf_ring_test.go @@ -0,0 +1,430 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ******************************************************************************/ + +package questdb + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestQwpSfRingFreshHasNoPublishedFsn(t *testing.T) { + seg, err := qwpSfCreateInMemorySegment(0, 4096) + require.NoError(t, err) + r := qwpSfNewSegmentRing(seg, 4096) + defer func() { _ = r.segmentRingClose() }() + + assert.Equal(t, int64(-1), r.segmentRingPublishedFsn()) + assert.Equal(t, int64(-1), r.segmentRingAckedFsn()) + assert.Equal(t, int64(0), r.nextSeqHint()) + assert.True(t, r.needsHotSpare()) +} + +func TestQwpSfRingAppendAdvancesPublishedFsn(t *testing.T) { + seg, err := qwpSfCreateInMemorySegment(0, 4096) + require.NoError(t, err) + r := qwpSfNewSegmentRing(seg, 4096) + defer func() { _ = r.segmentRingClose() }() + + for i := int64(0); i < 5; i++ { + fsn := r.appendOrFsn([]byte("frame")) + assert.Equal(t, i, fsn, "iteration %d", i) + } + assert.Equal(t, int64(4), r.segmentRingPublishedFsn()) + assert.Equal(t, int64(5), r.nextSeqHint()) +} + +func TestQwpSfRingBackpressureWhenNoSpare(t *testing.T) { + const segSize int64 = 64 + seg, err := qwpSfCreateInMemorySegment(0, segSize) + require.NoError(t, err) + r := qwpSfNewSegmentRing(seg, segSize) + defer func() { _ = r.segmentRingClose() }() + + payload := []byte("12345678") // 8 bytes payload, 16 byte total framing + // Fill the active until tryAppend refuses. + for { + fsn := r.appendOrFsn(payload) + if fsn == qwpSfBackpressureNoSpare { + return + } + require.GreaterOrEqual(t, fsn, int64(0)) + } +} + +func TestQwpSfRingRotatesIntoHotSpare(t *testing.T) { + const segSize int64 = 64 + first, err := qwpSfCreateInMemorySegment(0, segSize) + require.NoError(t, err) + r := qwpSfNewSegmentRing(first, segSize) + defer func() { _ = r.segmentRingClose() }() + + // Pre-install a spare. 
+ spare, err := qwpSfCreateInMemorySegment(0, segSize) + require.NoError(t, err) + require.NoError(t, r.installHotSpare(spare)) + assert.False(t, r.needsHotSpare()) + + // Fill the first segment until the next append rotates. + payload := make([]byte, 16) // 24 bytes total framing + rotated := false + expectedNextFsn := int64(0) + for !rotated { + fsn := r.appendOrFsn(payload) + require.NotEqual(t, qwpSfBackpressureNoSpare, fsn, "needed multiple rotations") + require.NotEqual(t, qwpSfPayloadTooLarge, fsn) + assert.Equal(t, expectedNextFsn, fsn) + expectedNextFsn++ + // Check whether rotation has happened: getActiveSegment now + // returns the spare and sealed list contains the original. + if r.getActiveSegment() == spare { + rotated = true + } + } + // First segment should be in sealed list. + sealed := r.getSealedSegments() + require.Len(t, sealed, 1) + assert.Equal(t, first, sealed[0]) + // Hot spare should be cleared. + assert.True(t, r.needsHotSpare()) +} + +// TestQwpSfRingBackupWakeupRearmsPerActiveSegment pins the contract +// that the high-water-mark backup wakeup nudges the segment manager +// once per active segment: every freshly promoted active must re-arm +// it so a stalled spare provision on the new segment can still be +// rescued. A latch that survived rotation would fire the backup only +// once over the ring's whole lifetime. +func TestQwpSfRingBackupWakeupRearmsPerActiveSegment(t *testing.T) { + // 512-byte segments put the 75% mark at 384, leaving several + // 32-byte frames of room before the segment fills, so the active + // crosses its HWM well before it rotates. 
+ const segSize int64 = 512 + first, err := qwpSfCreateInMemorySegment(0, segSize) + require.NoError(t, err) + r := qwpSfNewSegmentRing(first, segSize) + defer func() { _ = r.segmentRingClose() }() + + var wakeups int + r.managerWakeup = func() { wakeups++ } + + payload := make([]byte, 24) // 32 bytes on the wire with the frame header + + // drivePastHwm appends until the active is past its high-water mark + // (plus one more to prove repeated crossings coalesce), asserting no + // rotation or backpressure happens along the way. + drivePastHwm := func() { + for r.getActiveSegment().publishedOffset() < r.signalAtBytes { + fsn := r.appendOrFsn(payload) + require.GreaterOrEqual(t, fsn, int64(0), "unexpected backpressure/oversize before HWM") + } + require.GreaterOrEqual(t, r.appendOrFsn(payload), int64(0)) + } + + // First active segment, no spare staged: the backup fires exactly + // once however many frames land past the mark. + drivePastHwm() + require.Equal(t, 1, wakeups, "first active should fire one backup wakeup") + require.True(t, r.wakeupRequestedForActive) + + // Stage a spare and fill the rest so the next full-segment append + // rotates into it. Rotation must re-arm the per-segment backup. + spare, err := qwpSfCreateInMemorySegment(0, segSize) + require.NoError(t, err) + require.NoError(t, r.installHotSpare(spare)) + for r.getActiveSegment() != spare { + fsn := r.appendOrFsn(payload) + require.NotEqual(t, qwpSfBackpressureNoSpare, fsn) + require.NotEqual(t, qwpSfPayloadTooLarge, fsn) + } + require.False(t, r.wakeupRequestedForActive, "rotation must re-arm the backup wakeup") + + // Isolate the second active segment, then drive it past its own HWM + // (still no spare). The backup must fire again — the latched-flag bug + // suppressed this entirely. 
+ wakeups = 0 + drivePastHwm() + require.Equal(t, 1, wakeups, "freshly promoted active must re-fire its backup wakeup") +} + +func TestQwpSfRingTrimsAckedSegments(t *testing.T) { + // Each segment fits exactly two minimal frames (16-byte payloads, + // 8-byte envelopes). 24 (header) + 2*(8+16) = 72. + const segSize int64 = 72 + first, err := qwpSfCreateInMemorySegment(0, segSize) + require.NoError(t, err) + r := qwpSfNewSegmentRing(first, segSize) + defer func() { _ = r.segmentRingClose() }() + + spare, err := qwpSfCreateInMemorySegment(0, segSize) + require.NoError(t, err) + require.NoError(t, r.installHotSpare(spare)) + + payload := make([]byte, 16) + // Three appends: two land in the first active, the third forces + // rotation into the spare. + for i := 0; i < 3; i++ { + fsn := r.appendOrFsn(payload) + require.GreaterOrEqual(t, fsn, int64(0), "iteration %d", i) + } + sealed := r.getSealedSegments() + require.Len(t, sealed, 1) + lastSeqInFirst := sealed[0].segmentBaseSeq() + sealed[0].segmentFrameCount() - 1 + r.acknowledge(lastSeqInFirst) + + trim := r.drainTrimmable() + require.Len(t, trim, 1) + assert.Equal(t, sealed[0], trim[0]) + assert.Len(t, r.getSealedSegments(), 0) + for _, s := range trim { + _ = s.close() + } +} + +func TestQwpSfRingSnapshotSealedSegments(t *testing.T) { + const segSize int64 = 72 + first, err := qwpSfCreateInMemorySegment(0, segSize) + require.NoError(t, err) + r := qwpSfNewSegmentRing(first, segSize) + defer func() { _ = r.segmentRingClose() }() + + spare, err := qwpSfCreateInMemorySegment(0, segSize) + require.NoError(t, err) + require.NoError(t, r.installHotSpare(spare)) + + // Three appends → one segment sealed, one active. + for i := 0; i < 3; i++ { + _ = r.appendOrFsn(make([]byte, 16)) + } + target := make([]*qwpSfSegment, 4) + n := r.snapshotSealedSegments(target) + assert.Equal(t, 1, n) + assert.NotNil(t, target[0]) + + // Too-small target returns -1 to signal "buffer too small". 
+ tiny := make([]*qwpSfSegment, 0) + assert.Equal(t, -1, r.snapshotSealedSegments(tiny)) +} + +func TestQwpSfRingFindSegmentContaining(t *testing.T) { + const segSize int64 = 72 // exactly two minimal frames + first, err := qwpSfCreateInMemorySegment(0, segSize) + require.NoError(t, err) + r := qwpSfNewSegmentRing(first, segSize) + defer func() { _ = r.segmentRingClose() }() + + spare, err := qwpSfCreateInMemorySegment(0, segSize) + require.NoError(t, err) + require.NoError(t, r.installHotSpare(spare)) + + payload := make([]byte, 16) + var fsns []int64 + for i := 0; i < 3; i++ { + fsns = append(fsns, r.appendOrFsn(payload)) + } + seg := r.findSegmentContaining(fsns[0]) + require.NotNil(t, seg) + assert.Equal(t, first, seg) + seg = r.findSegmentContaining(fsns[len(fsns)-1]) + require.NotNil(t, seg) + assert.Equal(t, spare, seg) + assert.Nil(t, r.findSegmentContaining(999)) +} + +func TestQwpSfRingTotalSegmentBytes(t *testing.T) { + const segSize int64 = 64 + first, err := qwpSfCreateInMemorySegment(0, segSize) + require.NoError(t, err) + r := qwpSfNewSegmentRing(first, segSize) + defer func() { _ = r.segmentRingClose() }() + + assert.Equal(t, segSize, r.totalSegmentBytes()) + spare, err := qwpSfCreateInMemorySegment(0, segSize) + require.NoError(t, err) + require.NoError(t, r.installHotSpare(spare)) + assert.Equal(t, segSize*2, r.totalSegmentBytes()) +} + +func TestQwpSfRingInstallHotSpareRejectsDouble(t *testing.T) { + first, err := qwpSfCreateInMemorySegment(0, 4096) + require.NoError(t, err) + r := qwpSfNewSegmentRing(first, 4096) + defer func() { _ = r.segmentRingClose() }() + + spare1, err := qwpSfCreateInMemorySegment(0, 4096) + require.NoError(t, err) + require.NoError(t, r.installHotSpare(spare1)) + + spare2, err := qwpSfCreateInMemorySegment(0, 4096) + require.NoError(t, err) + err = r.installHotSpare(spare2) + require.Error(t, err) + _ = spare2.close() +} + +func TestQwpSfRingInstallHotSpareRejectsAfterClose(t *testing.T) { + first, err := 
qwpSfCreateInMemorySegment(0, 4096) + require.NoError(t, err) + r := qwpSfNewSegmentRing(first, 4096) + require.NoError(t, r.segmentRingClose()) + + spare, err := qwpSfCreateInMemorySegment(0, 4096) + require.NoError(t, err) + err = r.installHotSpare(spare) + assert.ErrorIs(t, err, qwpSfErrRingClosed) + _ = spare.close() +} + +func TestQwpSfRingOpenExistingNilOnEmpty(t *testing.T) { + dir := t.TempDir() + r, err := qwpSfOpenRing(dir, 4096) + require.NoError(t, err) + assert.Nil(t, r) +} + +func TestQwpSfRingOpenExistingRecoversInOrder(t *testing.T) { + dir := t.TempDir() + + // Create three segments with frames. + for _, base := range []int64{0, 5, 10} { + path := filepath.Join(dir, "sf-"+formatHex16(uint64(base))+".sfa") + seg, err := qwpSfCreateSegment(path, base, 4096) + require.NoError(t, err) + for i := 0; i < 5; i++ { + _, err := seg.tryAppend([]byte{byte(base), byte(i)}) + require.NoError(t, err) + } + require.NoError(t, seg.close()) + } + + r, err := qwpSfOpenRing(dir, 4096) + require.NoError(t, err) + require.NotNil(t, r) + defer func() { _ = r.segmentRingClose() }() + + // Highest baseSeq becomes active; other two go into sealed. + active := r.getActiveSegment() + require.NotNil(t, active) + assert.Equal(t, int64(10), active.segmentBaseSeq()) + sealed := r.getSealedSegments() + require.Len(t, sealed, 2) + assert.Equal(t, int64(0), sealed[0].segmentBaseSeq()) + assert.Equal(t, int64(5), sealed[1].segmentBaseSeq()) + // Counters should reflect 3 segments × 5 frames = 15 frames total + // = next FSN 15. + assert.Equal(t, int64(15), r.nextSeqHint()) + assert.Equal(t, int64(14), r.segmentRingPublishedFsn()) +} + +func TestQwpSfRingOpenExistingRejectsFsnGap(t *testing.T) { + dir := t.TempDir() + // Create two segments with non-contiguous FSN ranges. 
+ for _, c := range []struct { + base int64 + frames int + }{ + {base: 0, frames: 5}, + {base: 100, frames: 5}, + } { + path := filepath.Join(dir, "sf-"+formatHex16(uint64(c.base))+".sfa") + seg, err := qwpSfCreateSegment(path, c.base, 4096) + require.NoError(t, err) + for i := 0; i < c.frames; i++ { + _, err := seg.tryAppend([]byte{byte(i)}) + require.NoError(t, err) + } + require.NoError(t, seg.close()) + } + r, err := qwpSfOpenRing(dir, 4096) + require.Error(t, err) + assert.Contains(t, err.Error(), "FSN gap") + assert.Nil(t, r) +} + +func TestQwpSfRingOpenExistingQuarantinesCorruptFirstFrame(t *testing.T) { + // A bit-flip in the first frame's CRC makes scanFrames bail out at + // HEADER_SIZE with frameCount=0 — but valid frames may follow. The + // pre-fix recovery path would silently unlink the file as an "empty + // hot spare", destroying every surviving frame. The fix quarantines + // torn-tail-bearing files to .corrupt instead so a postmortem + // can recover what's left. + dir := t.TempDir() + path := filepath.Join(dir, "sf-corrupt.sfa") + { + seg, err := qwpSfCreateSegment(path, 0, 4096) + require.NoError(t, err) + _, err = seg.tryAppend([]byte("frame-zero")) + require.NoError(t, err) + _, err = seg.tryAppend([]byte("frame-one")) + require.NoError(t, err) + // Flip a byte in frame[0]'s CRC. The frame is at HEADER_SIZE; + // CRC is the first 4 bytes of the frame. + buf := seg.address() + buf[qwpSfHeaderSize] ^= 0xFF + require.NoError(t, seg.close()) + } + + r, err := qwpSfOpenRing(dir, 4096) + require.NoError(t, err) + assert.Nil(t, r) + + // Original file is gone; quarantine sentinel is in its place. 
+ _, statErr := os.Stat(path) + assert.True(t, os.IsNotExist(statErr), "original .sfa should have been renamed") + _, statErr = os.Stat(path + ".corrupt") + assert.NoError(t, statErr, ".corrupt should exist after quarantine") +} + +func TestQwpSfRingAcknowledgeClampsAtPublishedFsn(t *testing.T) { + // Defense-in-depth: a malformed/poisoned ACK with a wireSeq beyond + // publishedFsn must NOT advance ackedFsn past what the producer has + // actually written, otherwise the segment manager could trim + // segments the I/O thread is still iterating. + seg, err := qwpSfCreateInMemorySegment(0, 4096) + require.NoError(t, err) + r := qwpSfNewSegmentRing(seg, 4096) + defer func() { _ = r.segmentRingClose() }() + + r.appendOrFsn([]byte("a")) + r.appendOrFsn([]byte("b")) + require.Equal(t, int64(1), r.segmentRingPublishedFsn()) + + r.acknowledge(1 << 30) + assert.Equal(t, int64(1), r.segmentRingAckedFsn()) +} + +// formatHex16 mirrors the segment-manager filename format. +func formatHex16(v uint64) string { + const hex = "0123456789abcdef" + out := make([]byte, 16) + for i := 15; i >= 0; i-- { + out[i] = hex[v&0xF] + v >>= 4 + } + return string(out) +} diff --git a/qwp_sf_round_walk.go b/qwp_sf_round_walk.go new file mode 100644 index 00000000..b4c8de5a --- /dev/null +++ b/qwp_sf_round_walk.go @@ -0,0 +1,539 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "context" + "errors" + "fmt" + "math/rand" + "strings" + "time" +) + +// qwpSfRoundWalkResult is returned by qwpSfRunRoundWalk on exit and +// captures everything the caller needs to wrap into the appropriate +// SenderError surface (success, terminal, or budget-exhausted). +type qwpSfRoundWalkResult struct { + // Transport is non-nil on success; the caller takes ownership and + // must close it on shutdown. + Transport *qwpTransport + // Idx is the host index Transport was bound to, or -1 on + // failure. Callers should record this back into their per-caller + // previousIdx slot so the next round-walk (after a mid-stream + // failure) can demote correctly. + Idx int + // Attempts counts dial attempts during this walk (success + // returns the number of attempts including the successful one). + Attempts int + // Terminal is a non-nil typed reject when an Auth-error (401/403) + // halts the walk per failover.md §6. Callers convert it to a + // CategorySecurityError SenderError. + Terminal *QwpUpgradeRejectError + // Exhausted is non-nil when the wall-clock budget ran out. Wraps + // the last underlying dial error plus a per-host snapshot for + // diagnostics. + Exhausted *qwpSfRoundWalkExhaustedError + // Cancelled is non-nil when ctx or cancelCh fired during the + // walk. Holds ctx.Err() so the caller can decide whether to + // shut down silently or surface the cancellation. 
+ Cancelled error +} + +// qwpSfRoundWalkExhaustedError surfaces a per-outage summary when +// the round-walk runs out of wall-clock budget without binding. The +// per-host outcomes lift the spec §13.4 diagnostics intent into the +// error payload so the user-visible SenderError can name which hosts +// role-rejected vs transport-errored. +type qwpSfRoundWalkExhaustedError struct { + // Elapsed is the wall-clock time the outage consumed (from the + // first failed dial to budget exhaustion). + Elapsed time.Duration + // Attempts is the total dial attempts during the outage. + Attempts int + // LastError is the most recent dial failure, exposed via Unwrap. + LastError error + // HostOutcomes is a snapshot of the tracker's per-host entries + // at exhaustion. The slice index matches the connect-string + // addr= ordering. + HostOutcomes []qwpHostEntry + // Endpoints, when non-nil, is the parallel list of addresses + // the walk attempted, in addr= order. Lets the error message + // surface "h1:9000 role-rejected, h2:9000 transport-error". + // Optional — single-host callers may leave it nil. + Endpoints []qwpEndpoint +} + +// Error implements the error interface. The format is intentionally +// machine-friendly so the SenderError.ServerMessage can carry it +// verbatim and downstream log parsers can pick out the structured +// pieces. 
+func (e *qwpSfRoundWalkExhaustedError) Error() string { + var b strings.Builder + fmt.Fprintf(&b, "reconnect budget exhausted after %s / %d attempts", + e.Elapsed.Round(time.Millisecond), e.Attempts) + if len(e.HostOutcomes) > 0 { + b.WriteString(" (host outcomes:") + for i, h := range e.HostOutcomes { + addr := "" + if i < len(e.Endpoints) { + addr = " " + e.Endpoints[i].String() + } + fmt.Fprintf(&b, " [%d%s state=%s zone=%s]", i, addr, h.state, h.zoneTier) + } + b.WriteString(")") + } + if e.LastError != nil { + fmt.Fprintf(&b, ": %v", e.LastError) + } + return b.String() +} + +// Unwrap exposes the last underlying error so errors.Is / errors.As +// can match on the dial failure beneath the exhaustion wrapper. +func (e *qwpSfRoundWalkExhaustedError) Unwrap() error { + return e.LastError +} + +// qwpSfRoundWalkParams bundles the immutable inputs of the walk so +// the call site stays readable. Built once per logical caller and +// reused across reconnect cycles. +type qwpSfRoundWalkParams struct { + // Factory dials the host at the given index. Implementations + // own the idx → URL/auth/TLS mapping (see + // qwpSfBuildEndpointFactory). May ignore idx for single-host + // callers that ship a 1-host tracker. + Factory qwpSfReconnectFactory + // Tracker is the failover.md §2 host-health tracker. MUST have + // Len() >= 1; the round-walk does not synthesize an implicit + // one. + Tracker *qwpHostTracker + // Endpoints, when non-nil, is the parallel list of addresses + // for budget-exhausted error formatting only. The factory owns + // dial; endpoints[i] is purely diagnostic. + Endpoints []qwpEndpoint + // MaxDuration is the wall-clock outage budget + // (reconnect_max_duration_millis per failover.md §7). + MaxDuration time.Duration + // InitialBackoff is the smallest pre-jitter sleep at round + // exhaustion (reconnect_initial_backoff_millis). + InitialBackoff time.Duration + // MaxBackoff caps the pre-jitter sleep (reconnect_max_backoff_millis). 
+ // Post-jitter sleep may exceed it (equal-jitter shape). + MaxBackoff time.Duration + // OnAttempt, when non-nil, fires before each dial so callers + // can bump observability counters (totalReconnectAttempts, + // per-attempt status, etc.). + OnAttempt func() +} + +// qwpSfSingleRoundResult is the inner-loop return shape for one walk +// through every unattempted host in the tracker. qwpSfRunRoundWalk +// wraps this in a multi-round backoff loop; the InitialConnectOff +// branch in newQwpCursorLineSenderFromConf calls qwpSfRunSingleRound +// directly so a multi-host config still gets a full sweep on initial +// connect (failover.md §1.2 / §4.2; Java parity with +// QwpWebSocketSender.buildAndConnect). +// +// Exactly one of Transport / Terminal / Cancelled is non-nil on +// non-exhaustion exits. When all three are nil, the round was +// exhausted (every host attempted, no bind) and LastError / +// LastWasRoleReject describe the last dial. +type qwpSfSingleRoundResult struct { + // Transport is non-nil on success; caller takes ownership. + Transport *qwpTransport + // Idx is the bound endpoint index, or -1 on any non-success exit. + Idx int + // Attempts is the dial count consumed during this round + // (success inclusive). + Attempts int + // Terminal is set when the walk hits a 401/403 upgrade reject — + // per failover.md §6, auth errors short-circuit failover. + Terminal *QwpUpgradeRejectError + // Cancelled is ctx.Err() (or context.Canceled when cancelCh + // fired) when the walk was interrupted. Also non-nil for + // misconfigurations (nil tracker / factory) so callers route + // both via the same exit branch. + Cancelled error + // LastError is the most recent dial failure when the round + // exhausted. Nil on success / terminal / cancelled exits. + LastError error + // LastWasRoleReject indicates the most recent failure was a + // role-reject (421 + role header, or a SERVER_INFO role mismatch + // on the egress connect-walk). 
Drives the outer loop's round- + // boundary backoff selection per §3.2. + LastWasRoleReject bool +} + +// qwpSfRunSingleRound walks every unattempted host in the tracker +// once, dialing each via params.Factory and classifying the outcome. +// Returns on the first of: +// +// - successful bind (Transport set); +// - terminal AuthError 401/403 (Terminal set) — failover.md §6; +// - ctx or cancelCh cancellation (Cancelled set); +// - round exhaustion (PickNext returns -1, no remaining +// unattempted hosts). +// +// On exhaustion this function does NOT sleep and does NOT call +// BeginRound — those belong to the multi-round outer loop. Callers +// running a single-round walk (the InitialConnectOff branch) treat +// exhaustion as the terminal "all endpoints unreachable" condition. +// +// previousIdx >= 0 triggers RecordMidStreamFailure before the first +// PickNext (failover.md §2.3 ordering invariant). Pass -1 when no +// prior bind exists (initial connect). +func qwpSfRunSingleRound( + ctx context.Context, + cancelCh <-chan struct{}, + params qwpSfRoundWalkParams, + previousIdx int, +) qwpSfSingleRoundResult { + if params.Tracker == nil || params.Tracker.Len() == 0 { + return qwpSfSingleRoundResult{ + Idx: -1, + Cancelled: fmt.Errorf("qwp/sf: round-walk requires a non-empty tracker"), + } + } + if params.Factory == nil { + return qwpSfSingleRoundResult{ + Idx: -1, + Cancelled: fmt.Errorf("qwp/sf: round-walk requires a factory"), + } + } + + var ( + attempts int + lastErr error + lastWasRoleReject bool + ) + + // Apply pending mid-stream demote before the first PickNext. + // failover.md §2.3 normative ordering: reverse this and + // sticky-Healthy preserves the just-failed host, putting it back + // at the top of priority. 
+ if previousIdx >= 0 { + params.Tracker.RecordMidStreamFailure(previousIdx) + } + + for { + if err := ctx.Err(); err != nil { + return qwpSfSingleRoundResult{Idx: -1, Cancelled: err, Attempts: attempts} + } + if cancelCh != nil { + select { + case <-cancelCh: + return qwpSfSingleRoundResult{ + Idx: -1, + Cancelled: context.Canceled, + Attempts: attempts, + } + default: + } + } + + idx := params.Tracker.PickNext() + if idx < 0 { + return qwpSfSingleRoundResult{ + Idx: -1, + Attempts: attempts, + LastError: lastErr, + LastWasRoleReject: lastWasRoleReject, + } + } + + // Dial host[idx]. + if params.OnAttempt != nil { + params.OnAttempt() + } + attempts++ + t, err := params.Factory(ctx, idx) + if err == nil && t != nil { + // A successful upgrade binds unconditionally. The ingress + // endpoint sends no SERVER_INFO frame and the client never + // expects one (per the wire spec, ingress is role- and + // zone-blind), so this path has no server role to filter on + // — and needs none: the server itself 421-rejects an ingress + // upgrade to a REPLICA or PRIMARY_CATCHUP node (with + // X-QuestDB-Role), so any node that completes the upgrade is + // write-eligible. Those 421s are classified as role rejects + // below; a clean upgrade means bind. target= (like zone=) is + // thus accepted at config time but inert here — re-rejecting + // a node the server already accepted would only + // connect/close-storm until the reconnect budget expired. + // Ingress trackers are built with qwpTargetAny regardless, + // so this path never observes a non-Any filter. + params.Tracker.RecordSuccess(idx) + return qwpSfSingleRoundResult{ + Transport: t, + Idx: idx, + Attempts: attempts, + } + } + lastErr = err + + // Cancellation race: ctx (or cancelCh) may have fired while + // the dial was in flight, in which case err is just a + // wrapped context.Canceled / context.DeadlineExceeded — not + // a host failure. 
Recording it as a transport error would + // falsely demote a healthy host the caller simply stopped + // waiting for. Bail out before classification. + if cerr := ctx.Err(); cerr != nil { + return qwpSfSingleRoundResult{ + Idx: -1, + Cancelled: cerr, + Attempts: attempts, + } + } + if cancelCh != nil { + select { + case <-cancelCh: + return qwpSfSingleRoundResult{ + Idx: -1, + Cancelled: context.Canceled, + Attempts: attempts, + } + default: + } + } + + // Classify the failure. Typed *QwpUpgradeRejectError carries + // the precise spec-relevant fields; everything else is a + // generic transport error. + var rej *QwpUpgradeRejectError + if errors.As(err, &rej) { + // AuthError (401 / 403): terminal per §6. Bypass failover. + if rej.StatusCode == 401 || rej.StatusCode == 403 { + return qwpSfSingleRoundResult{ + Idx: -1, + Attempts: attempts, + Terminal: rej, + } + } + // X-QuestDB-Zone on a 421 reject is intentionally ignored + // on the SF-ingest path: the ingress walk does not route by + // zone, and the tracker is constructed with clientZone="" so + // every host stays Same anyway. The egress connect-walk + // consumes the same header in qwp_query_failover.go. + // 421 + non-empty role: role-reject (transient or topology). + // 421 without role, 404, 426, 503, etc.: generic transient. + if rej.IsRoleReject() { + params.Tracker.RecordRoleReject(idx, rej.IsCatchupRole()) + lastWasRoleReject = true + continue + } + params.Tracker.RecordTransportError(idx) + lastWasRoleReject = false + continue + } + + // Non-upgrade-reject failure: TCP/TLS dial error, + // response-header timeout, etc. — all transient. 
+ params.Tracker.RecordTransportError(idx) + lastWasRoleReject = false + } +} + +// qwpSfRunRoundWalk drives the failover.md §13.6 multi-round walk: +// each round calls qwpSfRunSingleRound; on exhaustion it pays one +// round-boundary sleep (equal-jitter exponential for transport +// rounds, flat InitialBackoff for role-reject rounds per §3.2), +// clamped to the remaining budget, then BeginRound(true) and +// retries. Returns on success, terminal AuthError, budget +// exhaustion, or cancellation. +// +// The result enum tells the caller which exit path was taken; only +// one of Transport / Terminal / Exhausted / Cancelled is non-nil. +// ctx is the master context; cancelCh, when non-nil, provides a +// secondary cancellation channel (used to distinguish "user close" +// from "ctx cancelled"). +func qwpSfRunRoundWalk( + ctx context.Context, + cancelCh <-chan struct{}, + params qwpSfRoundWalkParams, + previousIdx int, +) qwpSfRoundWalkResult { + outageStart := time.Now() + backoffAttempt := 0 + totalAttempts := 0 + enteringPreviousIdx := previousIdx + + for { + rr := qwpSfRunSingleRound(ctx, cancelCh, params, enteringPreviousIdx) + // previousIdx only demotes on the first inner call. Subsequent + // rounds enter with -1 so a stale slot doesn't double-demote. + enteringPreviousIdx = -1 + totalAttempts += rr.Attempts + + if rr.Transport != nil { + return qwpSfRoundWalkResult{ + Transport: rr.Transport, + Idx: rr.Idx, + Attempts: totalAttempts, + } + } + if rr.Terminal != nil { + return qwpSfRoundWalkResult{ + Idx: -1, + Attempts: totalAttempts, + Terminal: rr.Terminal, + } + } + if rr.Cancelled != nil { + return qwpSfRoundWalkResult{ + Idx: -1, + Cancelled: rr.Cancelled, + Attempts: totalAttempts, + } + } + + // Round exhausted. Pay one round-boundary sleep or terminate + // if the budget is gone. 
+ elapsed := time.Since(outageStart) + if elapsed >= params.MaxDuration { + return qwpSfRoundWalkResult{ + Idx: -1, + Attempts: totalAttempts, + Exhausted: buildExhaustedError( + params.Tracker, params.Endpoints, elapsed, totalAttempts, rr.LastError), + } + } + var sleep time.Duration + if rr.LastWasRoleReject { + // Role-reject: no exponential doubling. ComputeBackoff(0) + // surfaces as EqualJitter(InitialBackoff). Reset the + // counter so a subsequent transport-only round doesn't + // inherit a stale attempt count. + sleep = qwpSfComputeBackoff(0, params.InitialBackoff, params.MaxBackoff) + backoffAttempt = 0 + } else { + sleep = qwpSfComputeBackoff(backoffAttempt, params.InitialBackoff, params.MaxBackoff) + backoffAttempt++ + } + remaining := params.MaxDuration - elapsed + if remaining <= 0 { + return qwpSfRoundWalkResult{ + Idx: -1, + Attempts: totalAttempts, + Exhausted: buildExhaustedError( + params.Tracker, params.Endpoints, elapsed, totalAttempts, rr.LastError), + } + } + if sleep > remaining { + sleep = remaining + } + // Sleep interruptible by ctx + cancelCh. + if !qwpSfSleepInterruptible(ctx, cancelCh, sleep) { + return qwpSfRoundWalkResult{ + Idx: -1, + Cancelled: context.Canceled, + Attempts: totalAttempts, + } + } + params.Tracker.BeginRound(true) + } +} + +// buildExhaustedError snapshots the tracker and packages the +// per-host outcomes into a typed *qwpSfRoundWalkExhaustedError. +// Pure formatter; no I/O. 
+func buildExhaustedError( + tracker *qwpHostTracker, + endpoints []qwpEndpoint, + elapsed time.Duration, + attempts int, + lastErr error, +) *qwpSfRoundWalkExhaustedError { + if lastErr == nil { + lastErr = errors.New("no dial attempts succeeded") + } + return &qwpSfRoundWalkExhaustedError{ + Elapsed: elapsed, + Attempts: attempts, + LastError: lastErr, + HostOutcomes: tracker.snapshot(), + Endpoints: endpoints, + } +} + +// qwpSfComputeBackoff implements the failover.md §3 backoff +// function: doubling InitialBackoff up to MaxBackoff with +// saturate-before-overflow, then equal-jitter `[base, 2·base)`. +// The post-jitter sleep is NOT clamped to MaxBackoff — once base +// saturates the cap, the actual sleep lands in [max, 2·max), per +// the SF spec's intent that the post-jitter window stays positive. +// +// attempt is 0-based; ComputeBackoff(0) returns +// EqualJitter(InitialBackoff). The function is pure; callers +// supply the deadline check separately. +func qwpSfComputeBackoff(attempt int, initial, max time.Duration) time.Duration { + if initial <= 0 { + return 0 + } + base := initial + for i := 0; i < attempt && base < max; i++ { + if base > max/2 { + base = max + break + } + base *= 2 + } + if base > max { + base = max + } + if base <= 0 { + return 0 + } + // Equal-jitter: [base, 2*base). rand.Int63n requires a positive + // argument; the base > 0 guard above keeps that contract. + return base + time.Duration(rand.Int63n(int64(base))) +} + +// qwpSfSleepInterruptible blocks for d, returning early when ctx +// expires or cancelCh fires. Returns true if the full sleep +// completed, false if interrupted. Zero d returns immediately. 
+func qwpSfSleepInterruptible(ctx context.Context, cancelCh <-chan struct{}, d time.Duration) bool { + if d <= 0 { + return true + } + t := time.NewTimer(d) + defer t.Stop() + if cancelCh == nil { + select { + case <-t.C: + return true + case <-ctx.Done(): + return false + } + } + select { + case <-t.C: + return true + case <-ctx.Done(): + return false + case <-cancelCh: + return false + } +} diff --git a/qwp_sf_round_walk_test.go b/qwp_sf_round_walk_test.go new file mode 100644 index 00000000..f61d9904 --- /dev/null +++ b/qwp_sf_round_walk_test.go @@ -0,0 +1,1045 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "context" + "errors" + "fmt" + "net" + "net/http" + "net/http/httptest" + "strings" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/coder/websocket" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// newRoundWalkRejectServer returns an httptest server that responds +// to every upgrade with the given status + headers. 
Used to drive +// 421 / 401 / 404 / etc. classification in the round-walk. +func newRoundWalkRejectServer(t *testing.T, status int, headers http.Header) *httptest.Server { + t.Helper() + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + for k, vs := range headers { + for _, v := range vs { + w.Header().Add(k, v) + } + } + w.WriteHeader(status) + })) +} + +// newRoundWalkHealthyServer returns a server that accepts the WS +// upgrade. The QWP X-QWP-Version header is set to "1" so the +// transport's negotiation passes. +func newRoundWalkHealthyServer(t *testing.T) *httptest.Server { + t.Helper() + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set(qwpHeaderVersion, "1") + conn, err := websocket.Accept(w, r, nil) + if err != nil { + return + } + defer conn.CloseNow() + // Block until the client closes. + for { + if _, _, err := conn.Read(context.Background()); err != nil { + return + } + } + })) +} + +// hostPortOf extracts host:port from an httptest URL. +func hostPortOf(srv *httptest.Server) string { + return strings.TrimPrefix(srv.URL, "http://") +} + +// endpointForServer parses an httptest URL into a qwpEndpoint. +func endpointForServer(t *testing.T, srv *httptest.Server) qwpEndpoint { + t.Helper() + eps, err := parseEndpointList(hostPortOf(srv), qwpDefaultPort) + require.NoError(t, err) + require.Len(t, eps, 1) + return eps[0] +} + +// runWalkAgainst dials the configured tracker+endpoints and returns +// the result. Tests assert on the result struct fields. 
+func runWalkAgainst( + t *testing.T, + endpoints []qwpEndpoint, + tracker *qwpHostTracker, + previousIdx int, + maxDuration, initialBackoff, maxBackoff time.Duration, +) qwpSfRoundWalkResult { + t.Helper() + factory := qwpSfBuildEndpointFactory(endpoints, "ws", qwpTransportOpts{ + endpointPath: qwpWritePath, + }, nil) + params := qwpSfRoundWalkParams{ + Factory: factory, + Tracker: tracker, + Endpoints: endpoints, + MaxDuration: maxDuration, + InitialBackoff: initialBackoff, + MaxBackoff: maxBackoff, + } + return qwpSfRunRoundWalk(context.Background(), nil, params, previousIdx) +} + +// TestRoundWalkBindsHealthyPeerWhenFirstRoleRejects verifies that +// when host 0 returns 421+PRIMARY_CATCHUP and host 1 accepts, the +// walk lands on host 1 within a single round (no inter-host sleep). +func TestRoundWalkBindsHealthyPeerWhenFirstRoleRejects(t *testing.T) { + rejectSrv := newRoundWalkRejectServer(t, 421, http.Header{ + "X-QuestDB-Role": []string{"PRIMARY_CATCHUP"}, + }) + defer rejectSrv.Close() + healthySrv := newRoundWalkHealthyServer(t) + defer healthySrv.Close() + + endpoints := []qwpEndpoint{ + endpointForServer(t, rejectSrv), + endpointForServer(t, healthySrv), + } + tracker := newQwpHostTracker(2, "", qwpTargetAny) + + start := time.Now() + result := runWalkAgainst(t, endpoints, tracker, -1, + 5*time.Second, 100*time.Millisecond, 1*time.Second) + elapsed := time.Since(start) + + require.NotNil(t, result.Transport, "expected successful bind") + defer result.Transport.close() + assert.Equal(t, 1, result.Idx, "should bind to healthy peer at idx=1") + assert.Less(t, elapsed, 500*time.Millisecond, + "single-round walk must NOT pay round-boundary sleep (skip-backoff-within-round)") + + // Tracker should record host 0 as TransientReject, host 1 as Healthy. 
+ snap := tracker.snapshot() + assert.Equal(t, qwpHostTransientReject, snap[0].state) + assert.Equal(t, qwpHostHealthy, snap[1].state) +} + +// TestRoundWalkBindsHealthyPeerWhenFirstTransportErrors verifies the +// transport-error fallthrough: host 0 refuses TCP (unreachable port), +// host 1 accepts, walk lands on host 1. +func TestRoundWalkBindsHealthyPeerWhenFirstTransportErrors(t *testing.T) { + healthySrv := newRoundWalkHealthyServer(t) + defer healthySrv.Close() + + // Use a port that's almost certainly closed. + endpoints := []qwpEndpoint{ + {host: "127.0.0.1", port: 1}, // port 1 = no service + endpointForServer(t, healthySrv), + } + tracker := newQwpHostTracker(2, "", qwpTargetAny) + + result := runWalkAgainst(t, endpoints, tracker, -1, + 2*time.Second, 50*time.Millisecond, 500*time.Millisecond) + + require.NotNil(t, result.Transport) + defer result.Transport.close() + assert.Equal(t, 1, result.Idx, "must bind to healthy peer despite host 0 dial failure") + snap := tracker.snapshot() + assert.Equal(t, qwpHostTransportError, snap[0].state) + assert.Equal(t, qwpHostHealthy, snap[1].state) +} + +// TestRoundWalk404IsTransient is the 2026-05-08 reclassification: +// a 404 on one peer must NOT terminate the walk; the round-walk +// continues to a healthy sibling. 
+func TestRoundWalk404IsTransient(t *testing.T) { + notFoundSrv := newRoundWalkRejectServer(t, 404, http.Header{}) + defer notFoundSrv.Close() + healthySrv := newRoundWalkHealthyServer(t) + defer healthySrv.Close() + + endpoints := []qwpEndpoint{ + endpointForServer(t, notFoundSrv), + endpointForServer(t, healthySrv), + } + tracker := newQwpHostTracker(2, "", qwpTargetAny) + result := runWalkAgainst(t, endpoints, tracker, -1, + 2*time.Second, 50*time.Millisecond, 500*time.Millisecond) + + require.NotNil(t, result.Transport, "404 must walk through to healthy peer, not terminate") + defer result.Transport.close() + assert.Equal(t, 1, result.Idx) +} + +// TestRoundWalk426IsTransient: same reasoning as 404 — protocol +// version mismatch on one peer (rolling upgrade artifact) must not +// lock the client out of compatible siblings. +func TestRoundWalk426IsTransient(t *testing.T) { + upgradeSrv := newRoundWalkRejectServer(t, 426, http.Header{}) + defer upgradeSrv.Close() + healthySrv := newRoundWalkHealthyServer(t) + defer healthySrv.Close() + + endpoints := []qwpEndpoint{ + endpointForServer(t, upgradeSrv), + endpointForServer(t, healthySrv), + } + tracker := newQwpHostTracker(2, "", qwpTargetAny) + result := runWalkAgainst(t, endpoints, tracker, -1, + 2*time.Second, 50*time.Millisecond, 500*time.Millisecond) + + require.NotNil(t, result.Transport) + defer result.Transport.close() + assert.Equal(t, 1, result.Idx) +} + +// TestRoundWalkAuthErrorIsTerminal verifies that 401/403 short- +// circuits the walk — even if other peers might be reachable, the +// failover-loop spec treats AuthError as cluster-wide. 
+func TestRoundWalkAuthErrorIsTerminal(t *testing.T) { + authSrv := newRoundWalkRejectServer(t, 401, http.Header{}) + defer authSrv.Close() + healthySrv := newRoundWalkHealthyServer(t) + defer healthySrv.Close() + + endpoints := []qwpEndpoint{ + endpointForServer(t, authSrv), + endpointForServer(t, healthySrv), + } + tracker := newQwpHostTracker(2, "", qwpTargetAny) + result := runWalkAgainst(t, endpoints, tracker, -1, + 2*time.Second, 50*time.Millisecond, 500*time.Millisecond) + + assert.Nil(t, result.Transport) + require.NotNil(t, result.Terminal, "401 must surface as Terminal QwpUpgradeRejectError") + assert.Equal(t, 401, result.Terminal.StatusCode) + // Tracker should NOT have host 1 as Healthy — the walk bailed + // before reaching it. + snap := tracker.snapshot() + assert.NotEqual(t, qwpHostHealthy, snap[1].state) +} + +// TestRoundWalkBudgetExhaustsOnAllRoleReject: every peer responds +// 421+CATCHUP for the full outage window. The walk must pay a +// round-boundary sleep at each round exhaustion (InitialBackoff +// equal-jitter, no doubling) and terminate when the budget runs out. +func TestRoundWalkBudgetExhaustsOnAllRoleReject(t *testing.T) { + srv := newRoundWalkRejectServer(t, 421, http.Header{ + "X-QuestDB-Role": []string{"PRIMARY_CATCHUP"}, + }) + defer srv.Close() + + endpoints := []qwpEndpoint{endpointForServer(t, srv)} + tracker := newQwpHostTracker(1, "", qwpTargetAny) + + // Tight budget; each round-boundary sleep is ~10-20ms. + start := time.Now() + result := runWalkAgainst(t, endpoints, tracker, -1, + 200*time.Millisecond, 10*time.Millisecond, 30*time.Millisecond) + elapsed := time.Since(start) + + assert.Nil(t, result.Transport) + require.NotNil(t, result.Exhausted, "budget must exhaust, not terminate") + assert.Greater(t, result.Attempts, 1, "must have made several role-reject attempts") + assert.GreaterOrEqual(t, elapsed, 200*time.Millisecond, + "must consume the full budget before exhaustion") + // Per-host outcome surfaces in Error(). 
+ msg := result.Exhausted.Error() + assert.Contains(t, msg, "TransientReject", + "exhausted error must surface the per-host classification: %s", msg) +} + +// TestRoundWalkBudgetExhaustsOnAllTransport: every peer dial-fails +// (closed port). Backoff doubling between rounds; eventual +// exhaustion with TransportError as the per-host outcome. +func TestRoundWalkBudgetExhaustsOnAllTransport(t *testing.T) { + endpoints := []qwpEndpoint{{host: "127.0.0.1", port: 1}} + tracker := newQwpHostTracker(1, "", qwpTargetAny) + + result := runWalkAgainst(t, endpoints, tracker, -1, + 150*time.Millisecond, 10*time.Millisecond, 50*time.Millisecond) + assert.Nil(t, result.Transport) + require.NotNil(t, result.Exhausted) + msg := result.Exhausted.Error() + assert.Contains(t, msg, "TransportError", "exhausted msg: %s", msg) +} + +// TestRoundWalkMidStreamDemoteBeforePickNext verifies the §2.3 +// ordering invariant: a non-negative previousIdx must demote +// before the first PickNext. We bind host 0 as Healthy, then +// simulate a mid-stream failure (previousIdx=0), then re-walk — +// PickNext must NOT return 0 first. +func TestRoundWalkMidStreamDemoteBeforePickNext(t *testing.T) { + healthy1 := newRoundWalkHealthyServer(t) + defer healthy1.Close() + healthy2 := newRoundWalkHealthyServer(t) + defer healthy2.Close() + + endpoints := []qwpEndpoint{ + endpointForServer(t, healthy1), + endpointForServer(t, healthy2), + } + tracker := newQwpHostTracker(2, "", qwpTargetAny) + + // First walk binds host 0. + r1 := runWalkAgainst(t, endpoints, tracker, -1, + 2*time.Second, 50*time.Millisecond, 500*time.Millisecond) + require.NotNil(t, r1.Transport) + require.Equal(t, 0, r1.Idx) + _ = r1.Transport.close() + + // Simulate mid-stream failure on host 0: re-walk with previousIdx=0. 
+ r2 := runWalkAgainst(t, endpoints, tracker, 0, + 2*time.Second, 50*time.Millisecond, 500*time.Millisecond) + require.NotNil(t, r2.Transport) + defer r2.Transport.close() + assert.Equal(t, 1, r2.Idx, + "mid-stream demote must run before PickNext; host 0 should be TransportError-priority now") +} + +// TestRoundWalkExhaustedErrorIncludesPerHostOutcomes verifies that +// the SenderError's ServerMessage (built from result.Exhausted) lists +// each configured endpoint with its final state. +func TestRoundWalkExhaustedErrorIncludesPerHostOutcomes(t *testing.T) { + catchupSrv := newRoundWalkRejectServer(t, 421, http.Header{ + "X-QuestDB-Role": []string{"PRIMARY_CATCHUP"}, + }) + defer catchupSrv.Close() + + endpoints := []qwpEndpoint{ + endpointForServer(t, catchupSrv), + {host: "127.0.0.1", port: 1}, // closed port → TransportError + } + tracker := newQwpHostTracker(2, "", qwpTargetAny) + result := runWalkAgainst(t, endpoints, tracker, -1, + 150*time.Millisecond, 5*time.Millisecond, 30*time.Millisecond) + + require.NotNil(t, result.Exhausted) + msg := result.Exhausted.Error() + assert.Contains(t, msg, "TransientReject", "msg: %s", msg) + assert.Contains(t, msg, "TransportError", "msg: %s", msg) + assert.Contains(t, msg, endpoints[0].String(), "msg: %s", msg) + assert.Contains(t, msg, endpoints[1].String(), "msg: %s", msg) +} + +// TestRoundWalkCancellation: ctx cancellation mid-walk surfaces as +// the Cancelled exit path, not Exhausted. 
+func TestRoundWalkCancellation(t *testing.T) { + srv := newRoundWalkRejectServer(t, 421, http.Header{ + "X-QuestDB-Role": []string{"PRIMARY_CATCHUP"}, + }) + defer srv.Close() + + endpoints := []qwpEndpoint{endpointForServer(t, srv)} + tracker := newQwpHostTracker(1, "", qwpTargetAny) + + factory := qwpSfBuildEndpointFactory(endpoints, "ws", qwpTransportOpts{ + endpointPath: qwpWritePath, + }, nil) + params := qwpSfRoundWalkParams{ + Factory: factory, + Tracker: tracker, + Endpoints: endpoints, + MaxDuration: 10 * time.Second, + InitialBackoff: 50 * time.Millisecond, + MaxBackoff: 200 * time.Millisecond, + } + ctx, cancel := context.WithCancel(context.Background()) + // Cancel after a brief delay so at least one round happens first. + go func() { + time.Sleep(80 * time.Millisecond) + cancel() + }() + result := qwpSfRunRoundWalk(ctx, nil, params, -1) + assert.Nil(t, result.Transport) + assert.Nil(t, result.Exhausted) + require.NotNil(t, result.Cancelled) + assert.True(t, errors.Is(result.Cancelled, context.Canceled)) +} + +// TestComputeBackoffSaturatesBeforeOverflow exercises the spec's +// "saturate before doubling" guarantee at the integer boundary. +// The function must NOT overflow time.Duration even for very large +// attempt counts. +func TestComputeBackoffSaturatesBeforeOverflow(t *testing.T) { + initial := 100 * time.Millisecond + max := 5 * time.Second + for _, attempt := range []int{0, 1, 5, 10, 30, 60, 100} { + got := qwpSfComputeBackoff(attempt, initial, max) + // Equal-jitter: [base, 2*base). For high attempts, base + // saturates at max, so result is [max, 2*max). + assert.GreaterOrEqual(t, got, initial, + "attempt=%d: backoff must be at least InitialBackoff", attempt) + assert.Less(t, got, 2*max, + "attempt=%d: backoff must not exceed 2*max", attempt) + } +} + +// TestComputeBackoffEqualJitterShape probabilistically verifies the +// equal-jitter window for attempt=0. 
Across many samples, every +// observation must fall in [InitialBackoff, 2*InitialBackoff). +func TestComputeBackoffEqualJitterShape(t *testing.T) { + initial := 100 * time.Millisecond + max := 1 * time.Second + for i := 0; i < 200; i++ { + got := qwpSfComputeBackoff(0, initial, max) + assert.GreaterOrEqual(t, got, initial, + "sample %d: %v < %v", i, got, initial) + assert.Less(t, got, 2*initial, + "sample %d: %v >= %v", i, got, 2*initial) + } +} + +// Full-stack reconnect-and-rebind integration is covered by the +// existing TestQwpSfSendLoop* suite (which now goes through the +// implicit 1-host tracker code path). The tests above pin the +// round-walk semantics in isolation; the send-loop integration +// tests prove the wiring works end-to-end. + +// --- ingress is role-blind: target= is accepted but inert --- + +// TestRoundWalkIngressIgnoresTargetFilter pins the accepted-but-inert +// contract for target= on the SF ingress path. The ingress connect +// path does not route by server role — role-based endpoint selection +// is an egress-only feature — so the round-walk binds the first +// healthy peer regardless of the tracker's target filter and records +// it Healthy. It does not demote peers to TopologyReject, which would +// connect/close-storm (no host can satisfy a filter the ingress walk +// never evaluates). The filter is honoured on the egress connect-walk +// (see qwp_failover_test.go). +// +// Production always builds the ingress tracker with qwpTargetAny (see +// qwp_sender_cursor.go), so a non-Any filter never even reaches this +// code; the test feeds one directly to prove the round-walk itself is +// target-agnostic. 
+func TestRoundWalkIngressIgnoresTargetFilter(t *testing.T) { + for _, target := range []QwpTargetFilter{qwpTargetAny, qwpTargetPrimary, qwpTargetReplica} { + t.Run(target.String(), func(t *testing.T) { + srv := newRoundWalkHealthyServer(t) + defer srv.Close() + endpoints := []qwpEndpoint{endpointForServer(t, srv)} + tracker := newQwpHostTracker(1, "", target) + result := runWalkAgainst(t, endpoints, tracker, -1, + 2*time.Second, 50*time.Millisecond, 500*time.Millisecond) + require.NotNil(t, result.Transport, + "ingress must bind a healthy peer regardless of target=%s", target) + defer result.Transport.close() + snap := tracker.snapshot() + assert.Equal(t, qwpHostHealthy, snap[0].state, + "bound host must be recorded Healthy, not TopologyReject") + }) + } +} + +// TestRoundWalkPerCallerPreviousIdxIsolation pins down the +// failover.md §2.3 invariant: two callers (foreground SF loop + +// orphan drainer) sharing one tracker MUST use private previousIdx +// slots. A mid-stream demote from caller A on idx=0 must not +// disturb caller B's idx=1 bind. +// +// Setup mirrors what Phase 5 wires up in production: +// - 1 shared tracker, 2 hosts (both healthy). +// - Caller A binds idx=0; caller B binds idx=1. +// - Caller A "loses" its connection mid-stream and re-enters the +// round-walk with previousIdx=0. Caller B is unaffected — its +// local previousIdx slot stays at 1. +func TestRoundWalkPerCallerPreviousIdxIsolation(t *testing.T) { + healthy0 := newRoundWalkHealthyServer(t) + defer healthy0.Close() + healthy1 := newRoundWalkHealthyServer(t) + defer healthy1.Close() + + endpoints := []qwpEndpoint{ + endpointForServer(t, healthy0), + endpointForServer(t, healthy1), + } + tracker := newQwpHostTracker(2, "", qwpTargetAny) + + // Caller A: binds idx=0. 
+ rA := runWalkAgainst(t, endpoints, tracker, -1, + 2*time.Second, 50*time.Millisecond, 500*time.Millisecond) + require.NotNil(t, rA.Transport) + defer rA.Transport.close() + require.Equal(t, 0, rA.Idx) + + // Caller B: binds idx=1 because idx=0 is Healthy-attempted + // (sticky-Healthy preserves it, but `attempted` is set since + // caller A consumed its round slot). After BeginRound(false) + // caller B starts fresh — let's simulate that explicitly so + // the test setup reflects "two independent callers, each + // running its own round". + tracker.BeginRound(false) + // Even with attempted cleared, the lower-index Healthy host + // wins PickNext (priority (Healthy, Same)). To force caller B + // onto idx=1 we treat caller A's bound idx as "attempted" for + // caller B's round — exactly the mid-stream demote signal the + // real send loop applies to its OWN bound host on pump exit. + // Here, we mimic the production wiring: caller B's local + // previousIdx is -1 (it has no prior bind), and caller A's + // previousIdx=0 is what caller A would consume. + rB := runWalkAgainst(t, endpoints, tracker, -1, + 2*time.Second, 50*time.Millisecond, 500*time.Millisecond) + require.NotNil(t, rB.Transport) + defer rB.Transport.close() + // Either bind is structurally correct (both Healthy, same + // zone tier) — what we're really pinning is the per-caller + // slot semantics next. + + // Now: caller A loses its connection mid-stream. Caller A + // re-walks with previousIdx=0 (its own bound idx); caller B + // is untouched. After caller A's walk, caller B's bind must + // still be valid (no one called RecordMidStreamFailure on + // caller B's idx). + rA2 := runWalkAgainst(t, endpoints, tracker, rA.Idx, + 2*time.Second, 50*time.Millisecond, 500*time.Millisecond) + require.NotNil(t, rA2.Transport, "caller A must reconnect successfully") + defer rA2.Transport.close() + // After the demote, host rA.Idx is now TransportError; caller + // A must end up on the other host. 
+ assert.NotEqual(t, rA.Idx, rA2.Idx, + "after mid-stream demote, caller A must walk to the other host") + + // Caller B's `previousIdx` is the test's local variable (rB.Idx). + // Caller A's mid-stream walk did NOT touch it. Sanity-check by + // snapshotting the tracker: rB.Idx must still be Healthy + // (the sticky-Healthy preservation across BeginRound(true) + // keeps it so), proving the demote was scoped to caller A's + // host only. + snap := tracker.snapshot() + assert.NotEqual(t, qwpHostHealthy, snap[rA.Idx].state, + "caller A's bound host should be demoted post mid-stream") +} + +// --- qwpSfRunSingleRound: the per-round primitive --- + +// runSingleRoundAgainst dials the configured endpoints once via +// qwpSfRunSingleRound and returns the result. Tests assert on the +// inner-loop result shape (single-round, no inter-round sleep). +func runSingleRoundAgainst( + t *testing.T, + endpoints []qwpEndpoint, + tracker *qwpHostTracker, + previousIdx int, +) qwpSfSingleRoundResult { + t.Helper() + factory := qwpSfBuildEndpointFactory(endpoints, "ws", qwpTransportOpts{ + endpointPath: qwpWritePath, + }, nil) + params := qwpSfRoundWalkParams{ + Factory: factory, + Tracker: tracker, + Endpoints: endpoints, + } + return qwpSfRunSingleRound(context.Background(), nil, params, previousIdx) +} + +// TestRunSingleRoundBindsHealthyPeerWhenFirstRoleRejects is the +// per-round counterpart to TestRoundWalkBindsHealthyPeerWhenFirstRoleRejects: +// the inner walks every unattempted host once and binds the healthy +// peer without paying a round-boundary sleep. 
+func TestRunSingleRoundBindsHealthyPeerWhenFirstRoleRejects(t *testing.T) { + rejectSrv := newRoundWalkRejectServer(t, 421, http.Header{ + "X-QuestDB-Role": []string{"REPLICA"}, + }) + defer rejectSrv.Close() + healthySrv := newRoundWalkHealthyServer(t) + defer healthySrv.Close() + + endpoints := []qwpEndpoint{ + endpointForServer(t, rejectSrv), + endpointForServer(t, healthySrv), + } + tracker := newQwpHostTracker(2, "", qwpTargetAny) + + start := time.Now() + rr := runSingleRoundAgainst(t, endpoints, tracker, -1) + elapsed := time.Since(start) + + require.NotNil(t, rr.Transport, "expected successful bind on healthy peer") + defer rr.Transport.close() + assert.Equal(t, 1, rr.Idx, "must bind to healthy peer at idx=1") + assert.Less(t, elapsed, 500*time.Millisecond, + "single-round walk must NOT pay any inter-host sleep") + + snap := tracker.snapshot() + assert.Equal(t, qwpHostTopologyReject, snap[0].state, + "REPLICA reject without CATCHUP must classify as TopologyReject") + assert.Equal(t, qwpHostHealthy, snap[1].state) +} + +// TestRunSingleRoundExhaustsWithoutSleep verifies the exhaustion +// path: when every host is unreachable, the inner returns +// immediately with LastError set, without paying any round-boundary +// sleep. The outer multi-round wrapper is the one that pays sleeps; +// the inner is a pure walk. 
+func TestRunSingleRoundExhaustsWithoutSleep(t *testing.T) { + endpoints := []qwpEndpoint{ + {host: "127.0.0.1", port: 1}, + {host: "127.0.0.1", port: 2}, + } + tracker := newQwpHostTracker(2, "", qwpTargetAny) + + start := time.Now() + rr := runSingleRoundAgainst(t, endpoints, tracker, -1) + elapsed := time.Since(start) + + assert.Nil(t, rr.Transport) + assert.Nil(t, rr.Terminal) + assert.Nil(t, rr.Cancelled) + require.Error(t, rr.LastError, + "exhaustion must surface the most recent dial failure") + assert.Equal(t, 2, rr.Attempts, "every host must be attempted before exit") + assert.Less(t, elapsed, 2*time.Second, + "single-round exhaustion must not sleep; dial timeouts dominate") + + // Both hosts left as TransportError; attempted bits set. + snap := tracker.snapshot() + for i, h := range snap { + assert.Equal(t, qwpHostTransportError, h.state, "host %d state", i) + assert.True(t, h.attempted, "host %d attempted", i) + } +} + +// TestRunSingleRoundAuthErrorShortCircuits verifies that a 401 on +// host 0 causes the inner to return Terminal immediately, without +// dialing host 1 — auth is uniform across the cluster, walking on +// would just produce identical rejections (failover.md §6). 
+func TestRunSingleRoundAuthErrorShortCircuits(t *testing.T) { + authSrv := newRoundWalkRejectServer(t, 401, http.Header{}) + defer authSrv.Close() + healthySrv := newRoundWalkHealthyServer(t) + defer healthySrv.Close() + + endpoints := []qwpEndpoint{ + endpointForServer(t, authSrv), + endpointForServer(t, healthySrv), + } + tracker := newQwpHostTracker(2, "", qwpTargetAny) + rr := runSingleRoundAgainst(t, endpoints, tracker, -1) + + assert.Nil(t, rr.Transport) + require.NotNil(t, rr.Terminal, "401 must short-circuit as Terminal") + assert.Equal(t, 401, rr.Terminal.StatusCode) + assert.Equal(t, 1, rr.Attempts, "walk must stop after the auth-failing host") + assert.NotEqual(t, qwpHostHealthy, tracker.snapshot()[1].state, + "walk must not have reached the healthy peer") +} + +// TestRunSingleRoundCtxCancelDuringDialDoesNotDemote verifies the +// cancellation race fix: when ctx fires while params.Factory is +// in-flight, the returned dial error is a wrapped context.Canceled +// — not a host failure. The walk must surface it as Cancelled and +// leave the tracker's host state untouched, otherwise a healthy +// host gets spuriously demoted to TransportError just because the +// caller stopped waiting (e.g. drainer shutdown, sender Close +// during reconnect, a watchdog tripping mid-dial). 
+func TestRunSingleRoundCtxCancelDuringDialDoesNotDemote(t *testing.T) { + dialStarted := make(chan struct{}) + factory := func(ctx context.Context, _ int) (*qwpTransport, error) { + close(dialStarted) + <-ctx.Done() + return nil, fmt.Errorf("qwp: websocket dial: %w", ctx.Err()) + } + tracker := newQwpHostTracker(1, "", qwpTargetAny) + params := qwpSfRoundWalkParams{ + Factory: factory, + Tracker: tracker, + } + + ctx, cancel := context.WithCancel(context.Background()) + go func() { + <-dialStarted + cancel() + }() + rr := qwpSfRunSingleRound(ctx, nil, params, -1) + + require.NotNil(t, rr.Cancelled, "ctx cancel during dial must surface as Cancelled") + assert.True(t, errors.Is(rr.Cancelled, context.Canceled)) + assert.Equal(t, -1, rr.Idx) + assert.Equal(t, 1, rr.Attempts, "the in-flight dial counts as one attempt") + + snap := tracker.snapshot() + assert.Equal(t, qwpHostUnknown, snap[0].state, + "cancelled dial must not demote the host to TransportError") +} + +// TestRunSingleRoundCancelChDuringDialDoesNotDemote is the +// cancelCh-channel counterpart. cancelCh exists so the send loop +// can distinguish "user close" from "ctx cancelled" — the fix must +// honour both signals symmetrically. 
+func TestRunSingleRoundCancelChDuringDialDoesNotDemote(t *testing.T) { + dialStarted := make(chan struct{}) + cancelCh := make(chan struct{}) + factory := func(_ context.Context, _ int) (*qwpTransport, error) { + close(dialStarted) + <-cancelCh + return nil, errors.New("qwp: websocket dial: connection refused") + } + tracker := newQwpHostTracker(1, "", qwpTargetAny) + params := qwpSfRoundWalkParams{ + Factory: factory, + Tracker: tracker, + } + + go func() { + <-dialStarted + close(cancelCh) + }() + rr := qwpSfRunSingleRound(context.Background(), cancelCh, params, -1) + + require.NotNil(t, rr.Cancelled, "cancelCh during dial must surface as Cancelled") + assert.True(t, errors.Is(rr.Cancelled, context.Canceled)) + assert.Equal(t, -1, rr.Idx) + assert.Equal(t, 1, rr.Attempts) + + snap := tracker.snapshot() + assert.Equal(t, qwpHostUnknown, snap[0].state, + "cancelCh-aborted dial must not demote the host to TransportError") +} + +// TestInitialConnectOffWalksMultiHostToHealthy is the spec-parity +// test: with `initial_connect_retry` left at its default (off), a +// connect string with multiple `addr=` entries must walk every host +// once and land on the healthy peer rather than failing on the +// first reject. Mirrors Java +// WriteFailoverTest.testOffModeSinglePassWalkFindsPrimary. +func TestInitialConnectOffWalksMultiHostToHealthy(t *testing.T) { + // Host 0: rejects with 421 + REPLICA (TopologyReject). + rejectSrv := newRoundWalkRejectServer(t, 421, http.Header{ + "X-QuestDB-Role": []string{"REPLICA"}, + }) + defer rejectSrv.Close() + // Host 1: SF-compatible test server that ACKs frames. 
+ healthySrv := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer healthySrv.Close() + + sfDir := t.TempDir() + addr0 := strings.TrimPrefix(rejectSrv.URL, "http://") + addr1 := strings.TrimPrefix(healthySrv.URL, "http://") + conf := fmt.Sprintf( + "ws::addr=%s,%s;sf_dir=%s;sender_id=t;close_flush_timeout_millis=2000;", + addr0, addr1, sfDir, + ) + + sender, err := LineSenderFromConf(context.Background(), conf) + require.NoError(t, err, + "initial connect (default off) must walk past REPLICA and bind on healthy peer") + defer func() { _ = sender.Close(context.Background()) }() + + // Send a row and confirm it reached the healthy server — proves + // the bind landed on host 1, not on host 0 (which would have + // rejected the upgrade outright). Flush no longer blocks on the + // server ACK (the cursor architecture made local persistence, not + // the ACK, the durability guarantee — see CLAUDE.md), so the send + // loop delivers in the background; poll for receipt rather than + // reading the counter synchronously right after Flush. + require.NoError(t, sender.Table("t").Int64Column("v", 1).AtNow(context.Background())) + require.NoError(t, sender.Flush(context.Background())) + require.Eventually(t, func() bool { + return healthySrv.totalFramesReceived.Load() >= int64(1) + }, 2*time.Second, 1*time.Millisecond, + "the healthy peer must have received the test frame") +} + +// TestInitialConnectOffFailsWhenAllRejected: when every endpoint +// rejects on the initial single-round walk, the constructor must +// return a clear error rather than hanging or burning the reconnect +// budget. The error must name the walk and the attempt count. 
+func TestInitialConnectOffFailsWhenAllRejected(t *testing.T) { + r1 := newRoundWalkRejectServer(t, 421, http.Header{ + "X-QuestDB-Role": []string{"REPLICA"}, + }) + defer r1.Close() + r2 := newRoundWalkRejectServer(t, 421, http.Header{ + "X-QuestDB-Role": []string{"PRIMARY_CATCHUP"}, + }) + defer r2.Close() + + sfDir := t.TempDir() + addr0 := strings.TrimPrefix(r1.URL, "http://") + addr1 := strings.TrimPrefix(r2.URL, "http://") + conf := fmt.Sprintf( + "ws::addr=%s,%s;sf_dir=%s;sender_id=t;", + addr0, addr1, sfDir, + ) + + start := time.Now() + sender, err := LineSenderFromConf(context.Background(), conf) + elapsed := time.Since(start) + if sender != nil { + _ = sender.Close(context.Background()) + } + require.Error(t, err, "initial connect must fail when every endpoint rejects") + assert.Contains(t, err.Error(), "initial connect", + "error must identify the single-round walk: %v", err) + assert.Less(t, elapsed, 3*time.Second, + "failure must surface promptly; OFF mode must not retry across rounds") +} + +// TestQwpMemoryModeMultiHostFailsOverToHealthy is the regression test +// for review C2: memory mode (no sf_dir) must honour the multi-host +// addr= list exactly as SF mode does. The README's headline failover +// example (ws::addr=node-a,node-b,node-c;) is memory mode, so a dead +// first endpoint must not hard-fail the constructor — the sender has +// to walk past it and bind on the first healthy peer, just like the SF +// analog TestInitialConnectOffWalksMultiHostToHealthy above. +// +// Before the fix the memory path dialed only endpoints[0] (the +// sanitizer rewrote addr to endpoints[0]; the constructor did one +// synchronous dial through a single-host factory and installed no host +// tracker), so this connect returned the dead host's upgrade error. +func TestQwpMemoryModeMultiHostFailsOverToHealthy(t *testing.T) { + // Host 0: dead — rejects every upgrade with 503 (a "generic + // transient" the round walk steps past, per qwp_sf_round_walk.go). 
+ dead := newRoundWalkRejectServer(t, http.StatusServiceUnavailable, nil) + defer dead.Close() + // Host 1: healthy SF-compatible server that ACKs frames. + healthy := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer healthy.Close() + + deadAddr := strings.TrimPrefix(dead.URL, "http://") + healthyAddr := strings.TrimPrefix(healthy.URL, "http://") + // NO sf_dir → memory mode. Identical addr shape to the SF analog. + conf := fmt.Sprintf("ws::addr=%s,%s;close_flush_timeout_millis=2000;", + deadAddr, healthyAddr) + + sender, err := LineSenderFromConf(context.Background(), conf) + require.NoError(t, err, + "memory-mode multi-host connect must walk past the dead first endpoint and bind on the healthy peer") + defer func() { _ = sender.Close(context.Background()) }() + + require.NoError(t, sender.Table("t").Int64Column("v", 1).AtNow(context.Background())) + require.NoError(t, sender.Flush(context.Background())) + require.Eventually(t, func() bool { + return healthy.totalFramesReceived.Load() >= int64(1) + }, 2*time.Second, 1*time.Millisecond, + "the healthy peer must have received the row — proving the bind landed on host 1") +} + +// TestQwpMemoryModeThreadsFailoverConfig pins the rest of review C2: +// memory mode must thread the multi-host failover tracker AND the +// user's reconnect budget into the send loop, not discard them. The +// pre-fix memory path installed no tracker (so reconnect could never +// fail over off the first node) and hard-coded +// qwpSfDefaultReconnectMaxDuration (so user reconnect budgets were +// silently dropped). +func TestQwpMemoryModeThreadsFailoverConfig(t *testing.T) { + a := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer a.Close() + b := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer b.Close() + addrA := strings.TrimPrefix(a.URL, "http://") + addrB := strings.TrimPrefix(b.URL, "http://") + + // NO sf_dir → memory mode, with a non-default reconnect budget. 
+ conf := fmt.Sprintf("ws::addr=%s,%s;reconnect_max_duration_millis=1234;", + addrA, addrB) + sender, err := LineSenderFromConf(context.Background(), conf) + require.NoError(t, err) + defer func() { _ = sender.Close(context.Background()) }() + + s, ok := sender.(*qwpLineSender) + require.True(t, ok, "want *qwpLineSender, got %T", sender) + require.NotNil(t, s.cursorSendLoop.tracker, + "memory mode must install the multi-host failover tracker") + require.Equal(t, 2, s.cursorSendLoop.tracker.Len(), + "the tracker must cover both configured endpoints") + require.Equal(t, 1234*time.Millisecond, s.cursorSendLoop.reconnectMaxDuration, + "memory mode must thread the user's reconnect_max_duration_millis, not the 5-minute default") +} + +// newHangListener accepts TCP connections and parks them — never +// writes any HTTP response, so a client awaiting the WebSocket 101 +// upgrade response hangs until its auth_timeout_ms fires. Used by +// TestInitialConnectAuthTimeoutBoundsHungUpgrade to simulate a node +// that takes the connection but never completes the upgrade. +func newHangListener(t *testing.T) (addr string, teardown func()) { + t.Helper() + ln, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err, "hang listener") + var ( + mu sync.Mutex + closed bool + conns []net.Conn + ) + done := make(chan struct{}) + go func() { + defer close(done) + for { + c, err := ln.Accept() + if err != nil { + return // listener closed + } + mu.Lock() + if closed { + mu.Unlock() + _ = c.Close() + return + } + conns = append(conns, c) + mu.Unlock() + // Park the connection. Set a long deadline so a buggy + // server-side read can't burn the test budget; we close + // from teardown. + _ = c.SetDeadline(time.Now().Add(time.Minute)) + } + }() + teardown = func() { + mu.Lock() + closed = true + toClose := append([]net.Conn(nil), conns...) 
+ mu.Unlock() + _ = ln.Close() + for _, c := range toClose { + _ = c.Close() + } + <-done + } + return ln.Addr().String(), teardown +} + +// TestInitialConnectAuthTimeoutBoundsHungUpgrade is the spec-parity +// test for `auth_timeout_ms`: when host 0 accepts the TCP socket but +// never writes the WS 101 response, the sender's upgrade read must +// time out at auth_timeout_ms (per-host) and walk to host 1, which +// completes the upgrade and accepts frames. Without the per-host +// bound the connect would burn the entire reconnect budget (or the +// underlying HTTP transport default) on the stuck host. Mirrors Java +// WriteFailoverTest.testAuthTimeoutBoundsHungUpgrade. +func TestInitialConnectAuthTimeoutBoundsHungUpgrade(t *testing.T) { + hangAddr, closeHang := newHangListener(t) + defer closeHang() + + healthySrv := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer healthySrv.Close() + healthyAddr := strings.TrimPrefix(healthySrv.URL, "http://") + + sfDir := t.TempDir() + const authTimeoutMs = 500 + conf := fmt.Sprintf( + "ws::addr=%s,%s;sf_dir=%s;sender_id=t;auth_timeout_ms=%d;close_flush_timeout_millis=2000;", + hangAddr, healthyAddr, sfDir, authTimeoutMs, + ) + + t0 := time.Now() + sender, err := LineSenderFromConf(context.Background(), conf) + require.NoError(t, err, + "sender must walk past the hung upgrade and bind on the healthy peer") + defer func() { _ = sender.Close(context.Background()) }() + + require.NoError(t, sender.Table("t").Int64Column("v", 1).AtNow(context.Background())) + require.NoError(t, sender.Flush(context.Background())) + require.Eventually(t, func() bool { + return healthySrv.totalFramesReceived.Load() >= int64(1) + }, 2*time.Second, 1*time.Millisecond, + "the healthy peer must have received the test frame") + + elapsed := time.Since(t0) + // Two-sided bound: host[0] MUST burn ~auth_timeout_ms (500 ms) + // before the walk moves on (lower bound catches a regression that + // short-circuits host[0]); host[1] connects quickly 
afterwards + // (upper bound catches a regression that lets the per-host timeout + // drift well past the configured value). + assert.GreaterOrEqual(t, elapsed, 400*time.Millisecond, + "host[0] must actually exercise auth_timeout_ms (~500 ms) before the walk moves on; elapsed=%v", elapsed) + assert.Less(t, elapsed, 2*time.Second, + "auth_timeout_ms must bound the hung upgrade close to the configured 500 ms; elapsed=%v", elapsed) +} + +// TestInitialConnectStaysOnPrimaryAfterTopologyChange — Go-side +// counterpart of Java WriteFailoverTest.testFailoverPromotedReplicaJoinsRotation. +// After the SF round-walk binds to the healthy primary, subsequent +// batches MUST keep landing on the bound peer even if a previously- +// rejecting host becomes topologically eligible (the "promoted +// replica" case). The Go cursor send loop does not observe topology +// changes on idle peers, so the bound endpoint stays sticky — this +// test pins that stickiness so a future scheduler hook can't quietly +// regress it into proactive rotation. +// +// We don't actually mutate the rejecting server mid-test (Go's +// httptest doesn't expose a clean "swap behaviour" toggle and the +// promotion is conceptually a no-op on the sender side anyway). +// What we assert is what matters: two successive batches on the +// same Sender both reach the originally-bound healthy peer, and +// the rejecting host receives exactly one upgrade attempt — the +// initial round-walk. A regressed sender that re-walks the ring +// on every flush would push that count past 1, which is the +// regression this test exists to catch. +func TestInitialConnectStaysOnPrimaryAfterTopologyChange(t *testing.T) { + // Host 0: rejects with 421 + REPLICA — the SF round-walk walks past. + // Inlined (not via newRoundWalkRejectServer) so we can count upgrade + // hits and pin the stickiness invariant below. 
+ var rejectHits atomic.Int64 + rejectSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + rejectHits.Add(1) + w.Header().Add("X-QuestDB-Role", "REPLICA") + w.WriteHeader(421) + })) + defer rejectSrv.Close() + // Host 1: SF-compatible test server that ACKs frames. + primarySrv := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer primarySrv.Close() + + sfDir := t.TempDir() + addr0 := strings.TrimPrefix(rejectSrv.URL, "http://") + addr1 := strings.TrimPrefix(primarySrv.URL, "http://") + conf := fmt.Sprintf( + "ws::addr=%s,%s;sf_dir=%s;sender_id=t;close_flush_timeout_millis=2000;", + addr0, addr1, sfDir, + ) + + sender, err := LineSenderFromConf(context.Background(), conf) + require.NoError(t, err) + defer func() { _ = sender.Close(context.Background()) }() + + // Batch 1 — establishes the bind on host 1. + require.NoError(t, sender.Table("t").Int64Column("v", 1).AtNow(context.Background())) + require.NoError(t, sender.Flush(context.Background())) + require.Eventually(t, func() bool { + return primarySrv.totalFramesReceived.Load() >= int64(1) + }, 2*time.Second, 1*time.Millisecond, + "batch 1 must reach the primary peer") + framesAfter1 := primarySrv.totalFramesReceived.Load() + + // Batch 2 — must also land on host 1 (no proactive rotation). + require.NoError(t, sender.Table("t").Int64Column("v", 2).AtNow(context.Background())) + require.NoError(t, sender.Flush(context.Background())) + require.Eventually(t, func() bool { + return primarySrv.totalFramesReceived.Load() > framesAfter1 + }, 2*time.Second, 1*time.Millisecond, + "batch 2 must also reach the same primary peer (stickiness)") + + // Stickiness invariant: the rejecter was touched exactly once — + // by the initial SF round-walk. A regressed sender that re-walks + // the full ring on every flush would have hit it again before + // batch 2 (or before each frame), so > 1 would mean the + // stickiness property has regressed. 
+ assert.Equal(t, int64(1), rejectHits.Load(), + "rejecting host must be touched only by the initial round-walk") +} diff --git a/qwp_sf_segment.go b/qwp_sf_segment.go new file mode 100644 index 00000000..dbd17554 --- /dev/null +++ b/qwp_sf_segment.go @@ -0,0 +1,544 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "encoding/binary" + "errors" + "fmt" + "hash/crc32" + "math" + "os" + "sync/atomic" + "time" +) + +// qwpSf* constants describe the on-disk store-and-forward segment +// format. The layout matches the Java client (`MmapSegment.java`) +// exactly so segments are interchangeable with the Java client when +// sharing an SF group root. +// +// On-disk layout — header and frame format: +// +// [u32 magic 'SF01'] [u8 ver=1] [u8 flags=0] [u16 reserved=0] +// [u64 baseSeq] [u64 createdMicros] 24-byte header +// frame, frame, ... each frame: +// [u32 crc32c] +// [u32 payloadLen] +// [payloadLen bytes] +// crc32c covers (payloadLen, payload). 
+const ( + qwpSfFileMagic uint32 = 0x31304653 // 'SF01' little-endian on disk + qwpSfFrameHeaderSize int64 = 8 // u32 crc + u32 payloadLen + qwpSfHeaderSize int64 = 24 // total file header + qwpSfSegmentVersion byte = 1 +) + +// qwpSfCrcTable is the CRC32C (Castagnoli) polynomial table shared +// across SF segment writers and readers. Allocated once. +var qwpSfCrcTable = crc32.MakeTable(crc32.Castagnoli) + +// qwpSfErrLockBusy is returned by qwpSfFlockExclusive when another +// process already holds the lock. Matches Java's "sf slot already in +// use" error path; callers map it to a more informative message after +// reading the holder PID payload. +// +//lint:ignore ST1012 prefix kept for grouping with other qwpSf* errors +var qwpSfErrLockBusy = errors.New("qwp/sf: lock busy") + +// qwpSfErrSegmentFull is returned by qwpSfSegment.tryAppend when +// the requested frame won't fit in the segment's remaining capacity. +// The caller (the ring) is expected to rotate to a fresh segment and +// retry; if the payload still doesn't fit, the ring returns +// qwpSfPayloadTooLarge to its caller. +// +//lint:ignore ST1012 prefix kept for grouping with other qwpSf* errors +var qwpSfErrSegmentFull = errors.New("qwp/sf: segment full") + +// qwpSfSegment is one mmap-backed (or in-memory) SF segment. The +// producer thread (single user goroutine) appends frames into the +// mapping; the I/O thread (single consumer goroutine) reads up to +// publishedOffset() for wire send. No locks; the cursor pair +// (appendCursor / publishedCursor) is the only cross-thread +// coordination, and publishedCursor is the publish barrier — the +// consumer MUST NOT read any byte at offset >= publishedOffset(). +// +// The mapping is sized at construction and never grows. When tryAppend +// returns qwpSfErrSegmentFull the caller must rotate to a fresh +// segment. 
Closing the segment unmaps and closes the file; data +// already written is durable under the page cache (and recoverable +// across process restarts) — call msync for OS-crash durability. +type qwpSfSegment struct { + path string + sizeBytes int64 + memoryBacked bool + + // file is nil for memory-backed segments. For file-backed segments + // it is held for the segment's lifetime so munmap can run before + // close. POSIX guarantees the mapping persists after close, but + // holding the handle keeps the contract uniform with Windows. + file *os.File + + // buf is the mmap'd or malloc'd backing store; len(buf) == sizeBytes. + buf []byte + + // appendCursor is written only by the producer — it's the + // reservation cursor. Plain int64; the producer is single-threaded + // against this segment. + appendCursor int64 + + // baseSeq is provisional at create time, finalized by rebaseSeq() + // at rotation time. Mutable to support the segment manager's + // hot-spare design — spares are pre-created before the producer + // knows what baseSeq the new active will need. Plain field; + // rebaseSeq() is called on the producer thread before any + // cross-thread reader can observe the new identity. + baseSeq int64 + + // frameCount: number of frames successfully appended. Single + // writer (the producer thread in tryAppend); read cross-thread by + // the I/O thread via the ring's findSegmentContaining and lastSeq + // computations on the active segment. Atomic for cross-thread + // visibility. + frameCount atomic.Int64 + + // publishedCursor: written by producer, read by consumer (I/O + // thread). Atomic because the consumer must see writes in + // publication order — once the producer bumps publishedCursor, + // every byte before it is fully written. + publishedCursor atomic.Int64 + + // tornTailBytes is the byte count between the last valid frame and + // the file end that look like an attempted-but-invalid frame write + // (non-zero bytes at the bail-out position). 
Zero for fresh + // segments and for cleanly partially-filled segments (uninitialised + // tail). Set only by qwpSfOpenSegment; visible to recovery callers + // for diagnostics. Final after construction. + tornTailBytes int64 +} + +// qwpSfCreateSegment creates a fresh segment file at path, +// pre-allocating exactly sizeBytes and mmapping it RW. The 24-byte +// header is written in-place; the cursor lands at qwpSfHeaderSize. +// Returns an error on any I/O failure (openCleanRW, disk full, mmap +// rejected). +// +// Pre-allocation goes through qwpSfAllocate, which owns the +// cross-platform "extend + reserve real disk blocks + never shrinks" +// contract (see qwp_sf_allocate.go). For this call path the file is +// freshly O_TRUNC'd so currentSize == 0 and qwpSfAllocate reserves +// blocks for [0, sizeBytes) and advances EOF to sizeBytes in one +// step. Without the reservation a later store into the mmap'd region +// after the filesystem fills up would deliver SIGBUS (POSIX) / +// STATUS_IN_PAGE_ERROR (Windows), tearing down the process — +// sf-client.md §6 marks block reservation a core invariant of the +// create path. +func qwpSfCreateSegment(path string, baseSeq, sizeBytes int64) (*qwpSfSegment, error) { + if sizeBytes < qwpSfHeaderSize+qwpSfFrameHeaderSize+1 { + return nil, fmt.Errorf("qwp/sf: sizeBytes too small for header + one minimal frame: %d", sizeBytes) + } + // O_TRUNC discards any prior content at the same path — segment + // files are write-once-then-fixed, so reusing a stale file is + // always an error in the recovery code path; here, on a fresh + // create, truncation is the documented behavior. The post-open + // EOF is 0, which is the precondition qwpSfAllocate's macOS + // reservation (F_PEOFPOSMODE — allocates the requested length + // immediately beyond EOF) needs in order to cover [0, sizeBytes). 
+ f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0o644) + if err != nil { + return nil, fmt.Errorf("qwp/sf: openCleanRW %s: %w", path, err) + } + if err := qwpSfAllocate(f, sizeBytes); err != nil { + _ = f.Close() + _ = os.Remove(path) + return nil, err + } + buf, err := qwpSfMmapRW(f, sizeBytes) + if err != nil { + _ = f.Close() + _ = os.Remove(path) + return nil, err + } + s := &qwpSfSegment{ + path: path, + sizeBytes: sizeBytes, + memoryBacked: false, + file: f, + buf: buf, + appendCursor: qwpSfHeaderSize, + baseSeq: baseSeq, + } + s.publishedCursor.Store(qwpSfHeaderSize) + s.writeHeader(baseSeq) + return s, nil +} + +// qwpSfCreateInMemorySegment creates a memory-backed segment with the +// same on-the-wire layout as qwpSfCreateSegment but without any file. +// Used by the non-SF async ingest path (memory mode) — same cursor +// architecture, no disk involvement; the slice is freed when the +// segment is closed and goes out of scope (the GC reclaims it). +func qwpSfCreateInMemorySegment(baseSeq, sizeBytes int64) (*qwpSfSegment, error) { + if sizeBytes < qwpSfHeaderSize+qwpSfFrameHeaderSize+1 { + return nil, fmt.Errorf("qwp/sf: sizeBytes too small for header + one minimal frame: %d", sizeBytes) + } + buf := make([]byte, sizeBytes) + s := &qwpSfSegment{ + path: "", + sizeBytes: sizeBytes, + memoryBacked: true, + file: nil, + buf: buf, + appendCursor: qwpSfHeaderSize, + baseSeq: baseSeq, + } + s.publishedCursor.Store(qwpSfHeaderSize) + s.writeHeader(baseSeq) + return s, nil +} + +// qwpSfOpenSegment opens an existing segment file for recovery. mmaps +// it RW, validates the header magic / version, then scans frames +// forward verifying each CRC. The first bad CRC (or a frame whose +// declared length runs past the file end) is treated as a torn tail; +// both cursors are positioned at the start of that frame. Returns the +// segment ready for further appends. 
+// +// If recovery observes a torn tail (bytes at the bail-out position +// are non-zero, indicating an attempted-but-failed frame write), the +// byte count is exposed via tornTailBytes() so operators can detect +// silent truncation from corruption or partial writes. +func qwpSfOpenSegment(path string) (*qwpSfSegment, error) { + st, err := os.Stat(path) + if err != nil { + return nil, fmt.Errorf("qwp/sf: stat %s: %w", path, err) + } + fileSize := st.Size() + if fileSize < qwpSfHeaderSize { + return nil, fmt.Errorf("qwp/sf: file shorter than header: %s size=%d", path, fileSize) + } + f, err := os.OpenFile(path, os.O_RDWR, 0) + if err != nil { + return nil, fmt.Errorf("qwp/sf: openRW %s: %w", path, err) + } + buf, err := qwpSfMmapRW(f, fileSize) + if err != nil { + _ = f.Close() + return nil, err + } + magic := binary.LittleEndian.Uint32(buf[0:4]) + if magic != qwpSfFileMagic { + _ = qwpSfMunmap(buf) + _ = f.Close() + return nil, fmt.Errorf("qwp/sf: bad magic in %s: 0x%x", path, magic) + } + version := buf[4] + if version != qwpSfSegmentVersion { + _ = qwpSfMunmap(buf) + _ = f.Close() + return nil, fmt.Errorf("qwp/sf: unsupported version in %s: %d", path, version) + } + baseSeq := int64(binary.LittleEndian.Uint64(buf[8:16])) + // FSNs are non-negative by construction. A negative baseSeq on disk + // means bit-rot or a hand-edited file — refuse so qwpSfOpenRing's + // per-file skip handles it like any other unreadable .sfa rather + // than feeding the bad value into the unsigned-comparison sort and + // contiguity check (which would place the segment last and trip the + // FSN-gap error, taking the whole recovery down). 
+ if baseSeq < 0 { + _ = qwpSfMunmap(buf) + _ = f.Close() + return nil, fmt.Errorf("qwp/sf: bad baseSeq in %s: %d", path, baseSeq) + } + lastGood := qwpSfScanFrames(buf, fileSize) + count := qwpSfCountFrames(buf, lastGood) + tornTail := qwpSfDetectTornTail(buf, lastGood, fileSize) + s := &qwpSfSegment{ + path: path, + sizeBytes: fileSize, + memoryBacked: false, + file: f, + buf: buf, + appendCursor: lastGood, + baseSeq: baseSeq, + tornTailBytes: tornTail, + } + s.publishedCursor.Store(lastGood) + s.frameCount.Store(count) + return s, nil +} + +// writeHeader populates the 24-byte file header at offset 0. +// Producer-only; called from constructors and rebaseSeq. +func (s *qwpSfSegment) writeHeader(baseSeq int64) { + binary.LittleEndian.PutUint32(s.buf[0:4], qwpSfFileMagic) + s.buf[4] = qwpSfSegmentVersion + s.buf[5] = 0 // flags + binary.LittleEndian.PutUint16(s.buf[6:8], 0) // reserved + binary.LittleEndian.PutUint64(s.buf[8:16], uint64(baseSeq)) + binary.LittleEndian.PutUint64(s.buf[16:24], uint64(time.Now().UnixMicro())) +} + +// address returns a slice view of the underlying mapped buffer. The +// returned slice's length == sizeBytes; reads past publishedOffset() +// are not safe (the producer may be mid-write). +func (s *qwpSfSegment) address() []byte { + return s.buf +} + +// segmentBaseSeq returns the segment's current baseSeq. Called +// cross-thread by the I/O loop; safe because baseSeq is set at +// construction or rebaseSeq() (producer thread) before the segment +// becomes visible to readers — and is never further mutated. +func (s *qwpSfSegment) segmentBaseSeq() int64 { + return s.baseSeq +} + +// capacityRemaining returns bytes available for further appends, +// accounting for the per-frame 8-byte envelope a future tryAppend +// would also write. This is payload bytes the caller can still fit, +// NOT raw remaining-mapping bytes. 
+func (s *qwpSfSegment) capacityRemaining() int64 { + left := s.sizeBytes - s.appendCursor - qwpSfFrameHeaderSize + if left < 0 { + return 0 + } + return left +} + +// isFull reports whether tryAppend would refuse any non-empty frame. +func (s *qwpSfSegment) isFull() bool { + return s.capacityRemaining() <= 0 +} + +// publishedOffset returns the bytes safely written and visible to the +// consumer. Reading any byte at offset >= publishedOffset() from the +// mapping is undefined — the producer may be mid-write. +func (s *qwpSfSegment) publishedOffset() int64 { + return s.publishedCursor.Load() +} + +// segmentFrameCount returns the number of frames written since +// create (or recovered by openExisting). Used by the ring to compute +// lastSeq = baseSeq + frameCount - 1 for ACK / trim decisions. +func (s *qwpSfSegment) segmentFrameCount() int64 { + return s.frameCount.Load() +} + +// rebaseSeq re-stamps the segment's baseSeq, both in memory and in +// the on-disk header at offset 8. Used by the ring at rotation time +// to pin the segment's identity once the active's frame count is +// final (the segment manager pre-creates spares with a provisional +// baseSeq that may be stale by rotation time). Returns an error if +// any frames have already been appended — a rebase after first append +// would corrupt the FSN sequence. +func (s *qwpSfSegment) rebaseSeq(newBaseSeq int64) error { + if s.frameCount.Load() > 0 { + return fmt.Errorf("qwp/sf: cannot rebase: segment has %d frame(s) already appended", + s.frameCount.Load()) + } + s.baseSeq = newBaseSeq + binary.LittleEndian.PutUint64(s.buf[8:16], uint64(newBaseSeq)) + return nil +} + +// tryAppend appends one frame: writes [crc32c | u32 payloadLen | payload] +// starting at the current append cursor, then advances both cursors +// (publishedCursor last via atomic store, so the consumer never sees +// a partial frame). 
Returns the offset of the appended frame on +// success, or qwpSfErrSegmentFull if the remaining capacity cannot +// fit qwpSfFrameHeaderSize + payloadLen. +// +// This is the producer thread's hot path. No syscall, no allocation; +// just a CRC pass and a copy into the mapped region. +func (s *qwpSfSegment) tryAppend(payload []byte) (int64, error) { + payloadLen := int64(len(payload)) + if payloadLen < 0 { + return 0, fmt.Errorf("qwp/sf: negative payloadLen: %d", payloadLen) + } + // The on-disk length is a u32 read back as int32 by the recovery + // scanner (qwpSfScanFrames), so any value with bit 31 set would + // round-trip as negative and be rejected as a torn tail. Bracket + // the writer to the reader's tolerance so a too-large frame fails + // here instead of corrupting the segment. + if payloadLen > math.MaxInt32 { + return 0, fmt.Errorf("qwp/sf: payloadLen exceeds int32: %d", payloadLen) + } + total := qwpSfFrameHeaderSize + payloadLen + offset := s.appendCursor + if offset+total > s.sizeBytes { + return 0, qwpSfErrSegmentFull + } + // Frame layout: [u32 crc][u32 payloadLen][payload]. + // Length goes first so the CRC pass can include it without + // recomputing offsets. + binary.LittleEndian.PutUint32(s.buf[offset+4:offset+8], uint32(payloadLen)) + if payloadLen > 0 { + copy(s.buf[offset+qwpSfFrameHeaderSize:offset+total], payload) + } + // CRC32C over (payloadLen, payload). Recovery scans validate each + // frame by recomputing this CRC over the on-disk bytes. + crc := crc32.Update(0, qwpSfCrcTable, s.buf[offset+4:offset+8]) + if payloadLen > 0 { + crc = crc32.Update(crc, qwpSfCrcTable, s.buf[offset+qwpSfFrameHeaderSize:offset+total]) + } + binary.LittleEndian.PutUint32(s.buf[offset:offset+4], crc) + s.appendCursor = offset + total + s.frameCount.Add(1) + // Publish last. Until this atomic store retires, the consumer + // cannot see any of the bytes we just wrote. 
+ s.publishedCursor.Store(s.appendCursor) + return offset, nil +} + +// msync synchronously flushes dirty pages of [HEADER_SIZE, +// publishedOffset()) to disk via msync(MS_SYNC). Off the hot path — +// call only when the user has opted into OS-crash durability. No-op +// for memory-backed segments. +func (s *qwpSfSegment) msync() error { + if s.memoryBacked { + return nil + } + pub := s.publishedCursor.Load() + if pub > qwpSfHeaderSize { + return qwpSfMsync(s.buf, pub) + } + return nil +} + +// close unmaps the buffer and closes the underlying file. Safe to +// call on a segment that has been partially constructed (e.g. after +// a failed mmap during qwpSfOpenSegment); fields that were never +// initialised are nil and we skip them. +func (s *qwpSfSegment) close() error { + var firstErr error + if !s.memoryBacked && s.buf != nil { + if err := qwpSfMunmap(s.buf); err != nil { + firstErr = err + } + } + s.buf = nil + if s.file != nil { + if err := s.file.Close(); err != nil && firstErr == nil { + firstErr = err + } + s.file = nil + } + return firstErr +} + +// segmentPath returns the file path the segment was created from / +// opened against. Empty for memory-backed segments. +func (s *qwpSfSegment) segmentPath() string { + return s.path +} + +// segmentSize returns the configured segment size in bytes — the +// total allocation, not the published portion. +func (s *qwpSfSegment) segmentSize() int64 { + return s.sizeBytes +} + +// segmentTornTailBytes returns the byte count between the last valid +// frame and the file end that look like an attempted-but-invalid +// frame write — set by qwpSfOpenSegment when recovery observes +// non-zero bytes past the bail-out point. Zero for fresh segments, +// memory-backed segments, and cleanly partially-filled recovered +// segments. Operators / tests can read this to tell silent +// truncation (corruption) from a normal partial fill (no incident). 
+func (s *qwpSfSegment) segmentTornTailBytes() int64 { + return s.tornTailBytes +} + +// qwpSfScanFrames is a forward scan that returns the offset just past +// the last frame whose CRC verifies. A torn-tail frame (declared +// length runs past EOF, or CRC mismatch) leaves both cursors at the +// start of that frame; the next tryAppend will overwrite it. The +// scan only reads from the mapping — no syscalls. +func qwpSfScanFrames(buf []byte, fileSize int64) int64 { + pos := qwpSfHeaderSize + for pos+qwpSfFrameHeaderSize <= fileSize { + crcRead := binary.LittleEndian.Uint32(buf[pos : pos+4]) + payloadLen := int64(int32(binary.LittleEndian.Uint32(buf[pos+4 : pos+8]))) + // Defensive: a corrupt length field could be enormous or + // negative, both of which would otherwise overrun the mapping. + if payloadLen < 0 || pos+qwpSfFrameHeaderSize+payloadLen > fileSize { + return pos + } + crcCalc := crc32.Update(0, qwpSfCrcTable, buf[pos+4:pos+8]) + if payloadLen > 0 { + crcCalc = crc32.Update(crcCalc, qwpSfCrcTable, buf[pos+qwpSfFrameHeaderSize:pos+qwpSfFrameHeaderSize+payloadLen]) + } + if crcCalc != crcRead { + return pos + } + pos += qwpSfFrameHeaderSize + payloadLen + } + return pos +} + +// qwpSfDetectTornTail distinguishes "torn tail" (writer attempted a +// write past the last valid frame and failed — partial write, +// mid-stream corruption, bit rot) from clean unwritten space +// (manager-allocated segment with zero-filled tail). Returns the byte +// count from lastGood to fileSize when the bytes at the bail-out +// frame header are non-zero, else 0. +// +// Heuristic but robust for the common cases: qwpSfCreateSegment +// truncates the file to size, leaving the tail zero-filled; the +// writer only writes non-zero bytes via tryAppend, which writes the +// CRC and length fields together. So a non-zero byte at the +// failed-frame position implies an attempted write — exactly the +// case operators want flagged. 
+func qwpSfDetectTornTail(buf []byte, lastGood, fileSize int64) int64 { + if lastGood >= fileSize { + return 0 + } + probe := qwpSfFrameHeaderSize + if fileSize-lastGood < probe { + probe = fileSize - lastGood + } + for i := int64(0); i < probe; i++ { + if buf[lastGood+i] != 0 { + return fileSize - lastGood + } + } + return 0 +} + +// qwpSfCountFrames counts frames in [HEADER_SIZE, lastGood). Walks +// the framing in lockstep with qwpSfScanFrames (which already +// validated CRCs); so this is just length-driven traversal, no CRC +// re-check. +func qwpSfCountFrames(buf []byte, lastGood int64) int64 { + pos := qwpSfHeaderSize + count := int64(0) + for pos < lastGood { + payloadLen := int64(int32(binary.LittleEndian.Uint32(buf[pos+4 : pos+8]))) + pos += qwpSfFrameHeaderSize + payloadLen + count++ + } + return count +} diff --git a/qwp_sf_segment_test.go b/qwp_sf_segment_test.go new file mode 100644 index 00000000..9ec3e40a --- /dev/null +++ b/qwp_sf_segment_test.go @@ -0,0 +1,487 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ******************************************************************************/ + +package questdb + +import ( + "encoding/binary" + "errors" + "hash/crc32" + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestQwpSfSegmentCreateRoundtrip(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "sf-test.sfa") + + const segSize int64 = 4096 + seg, err := qwpSfCreateSegment(path, 100, segSize) + require.NoError(t, err) + defer func() { _ = seg.close() }() + + assert.Equal(t, int64(100), seg.segmentBaseSeq()) + assert.Equal(t, int64(0), seg.segmentFrameCount()) + assert.Equal(t, qwpSfHeaderSize, seg.publishedOffset()) + assert.Equal(t, segSize, seg.segmentSize()) + assert.False(t, seg.isFull()) + assert.Equal(t, int64(0), seg.segmentTornTailBytes()) + + // On-disk header must be readable and well-formed even before any + // frames are appended. + f, err := os.Open(path) + require.NoError(t, err) + hdr := make([]byte, qwpSfHeaderSize) + _, err = f.Read(hdr) + require.NoError(t, err) + require.NoError(t, f.Close()) + assert.Equal(t, qwpSfFileMagic, binary.LittleEndian.Uint32(hdr[0:4])) + assert.Equal(t, qwpSfSegmentVersion, hdr[4]) + assert.Equal(t, byte(0), hdr[5]) + assert.Equal(t, uint16(0), binary.LittleEndian.Uint16(hdr[6:8])) + assert.Equal(t, uint64(100), binary.LittleEndian.Uint64(hdr[8:16])) +} + +func TestQwpSfSegmentTryAppend(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "sf-append.sfa") + + seg, err := qwpSfCreateSegment(path, 0, 4096) + require.NoError(t, err) + defer func() { _ = seg.close() }() + + payload := []byte("hello qwp sf") + off, err := seg.tryAppend(payload) + require.NoError(t, err) + assert.Equal(t, qwpSfHeaderSize, off) + assert.Equal(t, int64(1), seg.segmentFrameCount()) + expectedPub := qwpSfHeaderSize + qwpSfFrameHeaderSize + int64(len(payload)) + assert.Equal(t, expectedPub, seg.publishedOffset()) + + // Verify on-disk frame 
layout: [crc32c | u32 len | payload]. + buf := seg.address() + storedLen := binary.LittleEndian.Uint32(buf[off+4 : off+8]) + assert.Equal(t, uint32(len(payload)), storedLen) + storedCrc := binary.LittleEndian.Uint32(buf[off : off+4]) + + expectedCrc := crc32.Update(0, qwpSfCrcTable, buf[off+4:off+8]) + expectedCrc = crc32.Update(expectedCrc, qwpSfCrcTable, payload) + assert.Equal(t, expectedCrc, storedCrc) + assert.Equal(t, payload, buf[off+qwpSfFrameHeaderSize:off+qwpSfFrameHeaderSize+int64(len(payload))]) +} + +func TestQwpSfSegmentTryAppendUntilFull(t *testing.T) { + const segSize int64 = 256 + seg, err := qwpSfCreateInMemorySegment(0, segSize) + require.NoError(t, err) + defer func() { _ = seg.close() }() + + payload := []byte("abcdefgh") // 8 bytes + want := int64(0) + for { + _, err := seg.tryAppend(payload) + if errors.Is(err, qwpSfErrSegmentFull) { + break + } + require.NoError(t, err) + want++ + } + assert.Equal(t, want, seg.segmentFrameCount()) + assert.True(t, seg.isFull()) + // Subsequent attempts keep returning the sentinel without + // corrupting state. + _, err = seg.tryAppend(payload) + assert.ErrorIs(t, err, qwpSfErrSegmentFull) + assert.Equal(t, want, seg.segmentFrameCount()) +} + +func TestQwpSfSegmentInMemoryHasNoFile(t *testing.T) { + seg, err := qwpSfCreateInMemorySegment(42, 4096) + require.NoError(t, err) + defer func() { _ = seg.close() }() + + assert.True(t, seg.memoryBacked) + assert.Equal(t, "", seg.segmentPath()) + assert.Nil(t, seg.file) + // Header must still be readable from the malloc'd buffer. 
+ buf := seg.address() + assert.Equal(t, qwpSfFileMagic, binary.LittleEndian.Uint32(buf[0:4])) + assert.Equal(t, uint64(42), binary.LittleEndian.Uint64(buf[8:16])) +} + +func TestQwpSfSegmentRebaseSeq(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "sf-rebase.sfa") + seg, err := qwpSfCreateSegment(path, 0, 4096) + require.NoError(t, err) + defer func() { _ = seg.close() }() + + require.NoError(t, seg.rebaseSeq(7777)) + assert.Equal(t, int64(7777), seg.segmentBaseSeq()) + // Header on disk must reflect the rebase. + buf := seg.address() + assert.Equal(t, uint64(7777), binary.LittleEndian.Uint64(buf[8:16])) + + // Once a frame is appended, rebase must reject. + _, err = seg.tryAppend([]byte{1, 2, 3}) + require.NoError(t, err) + err = seg.rebaseSeq(9999) + require.Error(t, err) + assert.Contains(t, err.Error(), "cannot rebase") +} + +func TestQwpSfSegmentRecovery(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "sf-recover.sfa") + + { + seg, err := qwpSfCreateSegment(path, 50, 4096) + require.NoError(t, err) + for i := 0; i < 3; i++ { + _, err := seg.tryAppend([]byte{byte(i), byte(i + 1), byte(i + 2)}) + require.NoError(t, err) + } + require.NoError(t, seg.close()) + } + + seg, err := qwpSfOpenSegment(path) + require.NoError(t, err) + defer func() { _ = seg.close() }() + + assert.Equal(t, int64(50), seg.segmentBaseSeq()) + assert.Equal(t, int64(3), seg.segmentFrameCount()) + // publishedOffset should point past the third frame. + expectedPub := qwpSfHeaderSize + 3*(qwpSfFrameHeaderSize+3) + assert.Equal(t, expectedPub, seg.publishedOffset()) + assert.Equal(t, int64(0), seg.segmentTornTailBytes()) +} + +func TestQwpSfSegmentRecoveryRejectsBadMagic(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "sf-badmagic.sfa") + + // Create a file with a wrong magic. 
+ require.NoError(t, os.WriteFile(path, make([]byte, 4096), 0o644)) + seg, err := qwpSfOpenSegment(path) + require.Error(t, err) + assert.Contains(t, err.Error(), "bad magic") + assert.Nil(t, seg) +} + +func TestQwpSfSegmentRecoveryRejectsBadVersion(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "sf-badver.sfa") + + { + seg, err := qwpSfCreateSegment(path, 0, 4096) + require.NoError(t, err) + // Poke a bad version byte before close. + seg.address()[4] = 99 + require.NoError(t, seg.close()) + } + + seg, err := qwpSfOpenSegment(path) + require.Error(t, err) + assert.Contains(t, err.Error(), "unsupported version") + assert.Nil(t, seg) +} + +func TestQwpSfSegmentRecoveryHandlesTornTail(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "sf-torntail.sfa") + + { + seg, err := qwpSfCreateSegment(path, 0, 4096) + require.NoError(t, err) + _, err = seg.tryAppend([]byte("good frame")) + require.NoError(t, err) + // Simulate a torn write: corrupt the bytes immediately past + // the last good frame so detectTornTail flags it. We write + // non-zero garbage into what looks like a frame header. + buf := seg.address() + off := seg.publishedOffset() + binary.LittleEndian.PutUint32(buf[off:off+4], 0xDEADBEEF) + binary.LittleEndian.PutUint32(buf[off+4:off+8], 0x1000) // claims a 4 KiB payload + require.NoError(t, seg.close()) + } + + seg, err := qwpSfOpenSegment(path) + require.NoError(t, err) + defer func() { _ = seg.close() }() + + assert.Equal(t, int64(1), seg.segmentFrameCount()) + assert.Greater(t, seg.segmentTornTailBytes(), int64(0)) + // publishedOffset must land at the start of the broken frame so + // future appends overwrite it. 
+ expected := qwpSfHeaderSize + qwpSfFrameHeaderSize + int64(len("good frame")) + assert.Equal(t, expected, seg.publishedOffset()) +} + +func TestQwpSfSegmentRecoveryHandlesCleanPartialFill(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "sf-clean.sfa") + + { + seg, err := qwpSfCreateSegment(path, 0, 4096) + require.NoError(t, err) + _, err = seg.tryAppend([]byte("partial fill")) + require.NoError(t, err) + require.NoError(t, seg.close()) + } + + seg, err := qwpSfOpenSegment(path) + require.NoError(t, err) + defer func() { _ = seg.close() }() + + // Trailing zero bytes are NOT a torn tail. + assert.Equal(t, int64(0), seg.segmentTornTailBytes()) +} + +func TestQwpSfSegmentRecoveryRejectsNegativeBaseSeq(t *testing.T) { + // FSNs are non-negative by construction. A negative baseSeq on disk + // means bit-rot or a hand-edited file; recovery must refuse it + // rather than feeding the bad value into the unsigned-comparison + // sort and contiguity check, which would place the segment last + // and trip the FSN-gap error. + dir := t.TempDir() + path := filepath.Join(dir, "sf-badbase.sfa") + { + seg, err := qwpSfCreateSegment(path, 0, 4096) + require.NoError(t, err) + require.NoError(t, seg.close()) + } + // Rewrite the on-disk baseSeq field at offset 8 to a negative + // value (sign bit set). + f, err := os.OpenFile(path, os.O_RDWR, 0) + require.NoError(t, err) + var bad [8]byte + binary.LittleEndian.PutUint64(bad[:], 0xFFFFFFFFFFFFFFFF) // int64(-1) + _, err = f.WriteAt(bad[:], 8) + require.NoError(t, err) + require.NoError(t, f.Close()) + + seg, err := qwpSfOpenSegment(path) + require.Error(t, err) + assert.Nil(t, seg) + assert.Contains(t, err.Error(), "bad baseSeq") +} + +func TestQwpSfSegmentRecoveryRejectsOversizedLength(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "sf-bad.sfa") + + { + seg, err := qwpSfCreateSegment(path, 0, 256) + require.NoError(t, err) + // Write a frame that claims a payload larger than the file. 
+ buf := seg.address() + binary.LittleEndian.PutUint32(buf[qwpSfHeaderSize:qwpSfHeaderSize+4], 0xAAAAAAAA) + binary.LittleEndian.PutUint32(buf[qwpSfHeaderSize+4:qwpSfHeaderSize+8], 0xFFFFFFFF) + require.NoError(t, seg.close()) + } + + seg, err := qwpSfOpenSegment(path) + require.NoError(t, err) + defer func() { _ = seg.close() }() + + // Corrupt frame is treated as a torn tail; recovery stops at the + // header position, so frameCount is 0 and lastGood == HEADER_SIZE. + assert.Equal(t, int64(0), seg.segmentFrameCount()) + assert.Equal(t, qwpSfHeaderSize, seg.publishedOffset()) +} + +func TestQwpSfSegmentMsync(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "sf-msync.sfa") + seg, err := qwpSfCreateSegment(path, 0, 4096) + require.NoError(t, err) + defer func() { _ = seg.close() }() + + _, err = seg.tryAppend([]byte("durable")) + require.NoError(t, err) + require.NoError(t, seg.msync()) +} + +func TestQwpSfSegmentMsyncMemoryBackedIsNoop(t *testing.T) { + seg, err := qwpSfCreateInMemorySegment(0, 4096) + require.NoError(t, err) + defer func() { _ = seg.close() }() + + _, err = seg.tryAppend([]byte("ram")) + require.NoError(t, err) + require.NoError(t, seg.msync()) +} + +func TestQwpSfSegmentTooSmallSize(t *testing.T) { + _, err := qwpSfCreateInMemorySegment(0, qwpSfHeaderSize) + require.Error(t, err) + assert.Contains(t, err.Error(), "too small") +} + +func TestQwpSfFlockExclusive(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, ".lock") + + f1, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0o644) + require.NoError(t, err) + defer func() { _ = f1.Close() }() + require.NoError(t, qwpSfFlockExclusive(f1)) + + f2, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0o644) + require.NoError(t, err) + defer func() { _ = f2.Close() }() + err = qwpSfFlockExclusive(f2) + assert.ErrorIs(t, err, qwpSfErrLockBusy) + + require.NoError(t, f1.Close()) + // Re-acquire on f2 now that f1 has released. 
+ require.NoError(t, qwpSfFlockExclusive(f2)) +} + +// TestQwpSfSegmentGoldenFileJavaConformance is the Java<->Go .sfa +// golden-file conformance guard for CLAUDE.md's on-disk compatibility +// claim: a segment file written by either client must be byte-readable +// by the other. The "golden" is a canonical .sfa image laid out by hand +// from the format documented on the Java MmapSegment.java (FILE_MAGIC, +// HEADER_SIZE, FRAME_HEADER_SIZE, VERSION, baseSeq, CRC32C over +// (payloadLen, payload)) — built independently of the production +// qwpSfSegment codec so it pins all three directions of drift: +// +// 1. The format constants still equal the Java MmapSegment literals. +// 2. The Go reader (qwpSfOpenSegment) recovers a hand-built image. +// 3. The Go writer (qwpSfCreateSegment + tryAppend) reproduces the +// image byte-for-byte, except the non-deterministic createdMicros +// header field. +// +// CRC32C (Castagnoli) is a standardised checksum, so the in-test stdlib +// crc32 and the Java client's Crc32c necessarily agree on the same +// bytes; the conformance therefore rests on the byte layout, which this +// test pins explicitly. A switch to a different polynomial or endianness +// on either side trips the reader or writer sub-test. +func TestQwpSfSegmentGoldenFileJavaConformance(t *testing.T) { + // 1. Format constants must equal the Java MmapSegment.java literals. + assert.Equal(t, uint32(0x31304653), qwpSfFileMagic, "'SF01' little-endian") + assert.Equal(t, int64(24), qwpSfHeaderSize) + assert.Equal(t, int64(8), qwpSfFrameHeaderSize) + assert.Equal(t, byte(1), qwpSfSegmentVersion) + + // Canonical input: a non-zero baseSeq and two frames of differing + // length so a length-handling drift is visible. + const goldenBaseSeq = int64(7) + // A fixed createdMicros keeps the golden image deterministic; the + // production writer stamps time.Now(), checked separately below. 
+ const goldenCreatedMicros = int64(1_700_000_000_000_000) + goldenFrames := [][]byte{[]byte("hello"), []byte("QWP!")} + + crcTable := crc32.MakeTable(crc32.Castagnoli) + + // Build the golden .sfa image by hand from the documented layout. + golden := make([]byte, qwpSfHeaderSize) + binary.LittleEndian.PutUint32(golden[0:4], 0x31304653) // magic 'SF01' + golden[4] = 1 // version + golden[5] = 0 // flags + binary.LittleEndian.PutUint16(golden[6:8], 0) // reserved + binary.LittleEndian.PutUint64(golden[8:16], uint64(goldenBaseSeq)) + binary.LittleEndian.PutUint64(golden[16:24], uint64(goldenCreatedMicros)) + for _, p := range goldenFrames { + frame := make([]byte, qwpSfFrameHeaderSize+int64(len(p))) + binary.LittleEndian.PutUint32(frame[4:8], uint32(len(p))) + copy(frame[8:], p) + // CRC32C covers (payloadLen, payload) — frame[4:] here. + crc := crc32.Update(0, crcTable, frame[4:]) + binary.LittleEndian.PutUint32(frame[0:4], crc) + golden = append(golden, frame...) + } + + // 2. Reader: a hand-built (cross-impl) image must be recovered intact. + t.Run("Go reader accepts the golden image", func(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "golden.sfa") + require.NoError(t, os.WriteFile(path, golden, 0o644)) + + seg, err := qwpSfOpenSegment(path) + require.NoError(t, err) + defer func() { _ = seg.close() }() + + assert.Equal(t, goldenBaseSeq, seg.segmentBaseSeq()) + assert.Equal(t, int64(len(goldenFrames)), seg.segmentFrameCount()) + assert.Equal(t, int64(0), seg.segmentTornTailBytes(), + "a clean golden image must report no torn tail") + assert.Equal(t, int64(len(golden)), seg.publishedOffset(), + "recovery must position the cursor just past the last valid frame") + + // Walk the frames back out of the mapping and confirm payloads. 
+ buf := seg.address() + off := qwpSfHeaderSize + for i, p := range goldenFrames { + payloadLen := int64(binary.LittleEndian.Uint32(buf[off+4 : off+8])) + require.Equalf(t, int64(len(p)), payloadLen, "frame %d payloadLen", i) + got := buf[off+qwpSfFrameHeaderSize : off+qwpSfFrameHeaderSize+payloadLen] + assert.Equalf(t, p, got, "frame %d payload", i) + off += qwpSfFrameHeaderSize + payloadLen + } + }) + + // 3. Writer: the production writer must reproduce the golden image, + // modulo the non-deterministic createdMicros header field. + t.Run("Go writer reproduces the golden image", func(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "written.sfa") + const segSize int64 = 4096 + + seg, err := qwpSfCreateSegment(path, goldenBaseSeq, segSize) + require.NoError(t, err) + for _, p := range goldenFrames { + _, err := seg.tryAppend(p) + require.NoError(t, err) + } + require.NoError(t, seg.close()) + + written, err := os.ReadFile(path) + require.NoError(t, err) + require.Equal(t, int(segSize), len(written), + "create pre-allocates the full segment size") + + // Header: everything except createdMicros[16:24] is deterministic. + assert.Equal(t, golden[0:16], written[0:16], + "magic/version/flags/reserved/baseSeq must match the golden header") + gotMicros := int64(binary.LittleEndian.Uint64(written[16:24])) + assert.Greaterf(t, gotMicros, int64(1_600_000_000_000_000), + "createdMicros must be a plausible recent timestamp, got %d", gotMicros) + + // Frames must be byte-identical to the golden image (CRC + len + + // payload). This is what a Java reader would parse. + assert.Equal(t, golden[qwpSfHeaderSize:], written[qwpSfHeaderSize:len(golden)], + "frame bytes (crc + len + payload) must match the golden image") + + // The pre-allocated tail past the last frame is zero-filled. 
+ tail := written[len(golden):] + assert.Equal(t, make([]byte, len(tail)), tail, + "the reserved tail beyond the written frames must be zero-filled") + }) +} diff --git a/qwp_sf_send_loop.go b/qwp_sf_send_loop.go new file mode 100644 index 00000000..a7ba7eec --- /dev/null +++ b/qwp_sf_send_loop.go @@ -0,0 +1,1565 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "context" + "encoding/binary" + "errors" + "fmt" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/coder/websocket" +) + +// qwpSf send-loop tunables. The reconnect and backoff defaults match +// the Java CursorWebSocketSendLoop spec. +const ( + // qwpSfDefaultParkInterval caps how long senderLoop sleeps when the + // engine has no new frame and the producer doorbell (wakeSender) + // has not fired. The doorbell drives every steady-state send, so + // this timer only bounds the recovery time of a missed wakeup and + // never gates send latency. 
The Java spec parks 50µs via + // LockSupport.parkNanos, a cheap futex-style park; re-arming a Go + // time.Timer that often costs a sizable fraction of a core per idle + // sender, so the Go port parks 1ms. Parity of the constant is not + // parity of cost. + qwpSfDefaultParkInterval = 1 * time.Millisecond + qwpSfDefaultReconnectMaxDuration = 5 * time.Minute + qwpSfDefaultReconnectInitialBackoff = 100 * time.Millisecond + qwpSfDefaultReconnectMaxBackoff = 5 * time.Second + qwpSfReconnectLogThrottleInterval = 5 * time.Second // throttle "attempt N failed" logs +) + +// qwpSfMaxSilentConnStrikes is the number of consecutive ACK-less +// connections the never-ACKed terminal heuristic in run() tolerates +// before declaring the server incompatible and stopping retries. A +// single connection that sends frames and is met with silence is +// indistinguishable from a routine server restart or LB RST landing +// in the window between a fresh sender's first frame and its first +// ACK, so that strike triggers an ordinary reconnect+replay. Reaching +// this many strikes means at least one full reconnect+replay cycle +// has also met nothing but silence — strong evidence the server isn't +// speaking our wire-format dialect. Go-only: there is no Java +// counterpart. +const qwpSfMaxSilentConnStrikes = 2 + +// qwpSfReconnectFactory is invoked by the send loop on a wire +// failure to obtain a fresh connected+upgraded transport. idx is +// the host index PickNext returned (see failover.md §2); the +// factory owns the mapping idx → URL, auth headers, and TLS config. +// Single-host factories may ignore idx — they always dial the same +// address. +// +// Implementations should return immediately on terminal errors +// (auth rejection, version mismatch) and let transient errors +// surface as ordinary errors so the caller can apply backoff. 
The +// "terminal vs transient" classification is delegated to +// qwpSfIsTerminalUpgradeError, which sniffs the error chain for +// the "WebSocket upgrade failed:" sentinel coder/websocket +// produces on non-101 responses. +type qwpSfReconnectFactory func(ctx context.Context, idx int) (*qwpTransport, error) + +// qwpSfSendLoop owns one I/O goroutine that: +// 1. Polls the engine's publishedFsn and walks newly-published +// frames from the engine's segments, sending each as one +// WebSocket binary frame to the server. +// 2. Polls the WebSocket for server ACK frames; on each ACK with +// cumulative wire sequence N, calls +// engine.engineAcknowledge(fsnAtZero+N) so the segment +// manager can trim fully-acked segments. +// 3. On wire failure, runs the configured reconnect policy: +// backoff with jitter up to reconnectMaxDuration, with +// auth-style failures (401/403/non-101 upgrade reject) +// treated as terminal. On reconnect success, repositions the +// cursor at ackedFsn+1 and replays. +// +// No locks on the steady-state path. The producer goroutine writes +// into the engine; the I/O goroutine reads. publishedFsn is the +// volatile publish barrier. +// +// Errors are reported via lastError(); the I/O goroutine sets it +// and exits. Producers polling checkError() surface the failure. +type qwpSfSendLoop struct { + engine *qwpSfCursorEngine + + // transport is the active connection. Replaced on reconnect. + // Loaded by both the send and receive goroutines; the outer + // loop is the only writer (single-writer pattern). + transport atomic.Pointer[qwpTransport] + + // parkInterval bounds how long senderLoop sleeps when the engine + // has no new frame. The common case is event-driven via the wakeup + // doorbell; this is the defense-in-depth fallback poll. See + // qwpSfDefaultParkInterval for why it need not be tight. 
+ parkInterval time.Duration + + // wakeup is a single-slot doorbell rung by the producer (through + // the ring's sendLoopWakeup callback) after each publish so an + // idle senderLoop reacts immediately instead of spinning at + // parkInterval. Mirrors qwpSfSegmentManager.wakeup. Buffered so a + // publish never blocks on a busy/parked loop; extra rings + // coalesce into the one slot (senderLoop drains all ready frames + // per wake, so one token suffices for any backlog). + wakeup chan struct{} + + // reconnectFactory is non-nil when reconnect is enabled. A nil + // factory makes wire failures immediately terminal (legacy, + // matches the Java client's "no reconnect" mode). + reconnectFactory qwpSfReconnectFactory + + reconnectMaxDuration time.Duration + reconnectInitialBackoff time.Duration + reconnectMaxBackoff time.Duration + + // tracker drives the failover.md §13.6 round-walk. Constructed at + // sendLoopSetHostTracker time with the host count, client zone, and + // target filter — both inert on this ingress path, which does not + // route by server role or zone (see qwp_sender_cursor.go). When + // tracker is nil (legacy single-host tests), connectWithBackoff + // falls back to a synthetic 1-host tracker on first need so the + // round-walk machinery is the only code path. + tracker *qwpHostTracker + + // previousIdx is this loop's private slot for the §2.3 + // per-caller mid-stream-demote pattern. After a successful + // connect it holds the bound endpoint index; on pump exit the + // outer run() loop leaves it as-is so the next connectWithBackoff + // can invoke RecordMidStreamFailure(previousIdx) before PickNext. + // connectWithBackoff resets it to the new bound idx on success + // and to -1 after consuming the mid-stream slot. Single-writer + // (the I/O goroutine). + previousIdx int + + // policyResolver chooses Halt vs DropAndContinue per Category. + // Non-nil; defaults are baked in via qwpSfDefaultPolicyFor. 
+ // Atomic pointer because setters can run concurrently with the + // receiver goroutine that reads it on every classified rejection. + policyResolver atomic.Pointer[qwpSfPolicyResolver] + + // dispatcher delivers SenderError payloads asynchronously to the + // user-supplied SenderErrorHandler. Non-nil; uses the default + // loud-not-silent handler if the user did not configure one. + // Atomic pointer for the same reason as policyResolver. + dispatcher atomic.Pointer[qwpSfErrorDispatcher] + + // fsnAtZero is the FSN that wireSeq=0 maps to on the current + // connection. After a reconnect it's set to engine.ackedFsn()+1 + // so server-side ACK math stays aligned with the disk state. + // Producer-side state, single-writer (the send loop), read + // during ACK handling. + fsnAtZero atomic.Int64 + // nextWireSeq is the next wire sequence the send goroutine will + // emit; each frame's wireSeq/fsnSent derive from it. Reset to 0 on + // every reconnect. The send path and the reset paths + // (positionCursorForStart at startup, swapClient on reconnect) are + // serialized — never concurrent — but run on different goroutines, + // so it is atomic for safe publication across those handoffs. + nextWireSeq atomic.Int64 + // highestFullySent is the highest wire sequence whose sendMessage + // has fully returned, or -1 when no frame has finished sending on + // the current connection. Reset to -1 on every reconnect. The + // receiver clamps every incoming ACK's sequence to this ceiling, + // so a non-compliant server's early or forged ACK cannot advance + // ackedFsn over a frame the send goroutine is still reading out of + // the mmap'd segment — which would let the segment manager munmap + // that buffer mid-read (SIGSEGV) — nor over a frame a wire failure + // dropped before delivery (silent loss). nextWireSeq is bumped + // BEFORE the wire write, so it sits one frame too high to serve as + // this ceiling; highestFullySent advances only AFTER the write + // completes. 
The send goroutine writes it concurrently with the + // receiver goroutine's reads (and the reset paths re-seed it + // between connections), so it must be atomic. + highestFullySent atomic.Int64 + // serverAckedSeq is the highest cumulative wire sequence the server + // has OK-ACK'd on the current connection, or -1 before the first + // ACK. Reset to -1 on every (re)connect alongside highestFullySent. + // Written by the receiver goroutine; read in applyAckWatermark. + // Paired with highestFullySent: the engine's ACK cursor advances to + // the lesser of the two (see applyAckWatermark), reconciling the + // receiver's ACK against the sender's send-completion no matter which + // of the two — written on separate goroutines — lands last. + serverAckedSeq atomic.Int64 + // sendingSegment / sendOffset track the cursor inside the + // engine's segment chain. Producer-only state. + sendingSegment *qwpSfSegment + sendOffset int64 + // replayTargetFsn: snapshot of publishedFsn at swapClient time. + // Frames at FSN ≤ this value are post-reconnect replays; we + // count them via totalFramesReplayed and reset replayTargetFsn + // to -1 once we cross the boundary. Producer-only state. + replayTargetFsn int64 + + // running gates the outer reconnect loop. close() flips it to + // false; inner goroutines observe it via ctx.Done. + running atomic.Bool + + // ctx is the loop's master context; cancel() forces both + // inner goroutines out of any blocking transport calls. + ctx context.Context + cancel context.CancelFunc + + // done is closed when run() returns. + done chan struct{} + wg sync.WaitGroup + + // lastError holds the first terminal error. Atomic pointer so + // the producer can sample it from any goroutine. + lastError atomic.Pointer[error] + + // lastTerminalServerError is the typed-payload sibling to + // lastError. 
Set when recordFatalServerError is called with a + // fully-populated *SenderError (server-rejection path, WS + // terminal close, auth-terminal upgrade, reconnect-budget + // exhaustion). Independent of lastError so QwpSender accessors + // can return the typed payload without an errors.As walk. + lastTerminalServerError atomic.Pointer[SenderError] + + // Counters. + totalFramesSent atomic.Int64 + totalAcks atomic.Int64 + totalServerErrors atomic.Int64 + totalReconnects atomic.Int64 + totalReconnectAttempts atomic.Int64 + totalFramesReplayed atomic.Int64 + + // framesSentOnConn counts frames written to the wire on the + // current connection (reset on every connection swap). Paired + // with the lifetime totalAcks counter in the silent-drop guard + // in run(): a connection that sends frames yet sees no ACK while + // totalAcks == 0 is a candidate for the "server up but doesn't + // speak our protocol" classification. + framesSentOnConn atomic.Int64 + + // silentConnStrikes counts consecutive connections that sent at + // least one frame and ended while totalAcks was still 0 — i.e. + // ACK-less drops on a sender that has never once been ACK'd. The + // silent-drop guard in run() declares the server incompatible + // (and stops retrying) once this reaches qwpSfMaxSilentConnStrikes; + // a lone restart/RST in the first-frame→first-ACK window stays + // below the threshold and reconnects+replays. No reset is needed: + // the guard's totalAcks == 0 precondition makes this counter + // unreachable — and thus frozen — the moment any ACK lands. + silentConnStrikes atomic.Int64 + + // Reconnect-loop status, exposed so engineAppendBlocking can + // distinguish "wire publishing but slow" from "wire is in the + // retry loop" when the backpressure deadline fires (spec §16). + // outageStartUnixNano is non-zero iff connectWithBackoff is + // currently running; reconnectAttempts is the per-outage counter + // (resets at the start of each connectWithBackoff call). 
+ outageStartUnixNano atomic.Int64 + reconnectAttempts atomic.Int64 + + // onTransportSwap, when non-nil, is invoked from swapClient with + // the freshly bound transport so the sender can refresh + // connection-derived state (currently: the auto_flush_bytes + // clamp derived from X-QWP-Max-Batch-Size). Atomic pointer so + // the producer-side install in the sender constructor cannot + // race the I/O goroutine's reconnect-time read. nil = no + // callback installed (legacy bench harness / drainers). + onTransportSwap atomic.Pointer[func(*qwpTransport)] +} + +// qwpSfNewSendLoop constructs a send loop bound to the given engine +// and (optional) initial transport. +// +// - When transport is non-nil it must already be connected and +// WebSocket-upgraded; the send loop takes ownership and will +// close it on shutdown. +// - When transport is nil, the loop drives the initial dial on +// its I/O goroutine before serving frames — this is the +// `initial_connect_retry=async` path. A nil transport is only +// valid together with a non-nil factory (otherwise there's no +// way for the loop to obtain a connection). +// +// Reconnect is opt-in: a nil factory keeps the legacy "single +// failure is terminal" behavior; a non-nil factory enables retry +// with backoff, capped by the *Reconnect* knobs. 
+func qwpSfNewSendLoop( + engine *qwpSfCursorEngine, + transport *qwpTransport, + factory qwpSfReconnectFactory, + parkInterval, reconnectMaxDuration, reconnectInitialBackoff, reconnectMaxBackoff time.Duration, +) *qwpSfSendLoop { + // Reject wiring that can never work: no engine, or neither a + // transport nor a factory to dial one. + if engine == nil { + panic("qwp/sf: engine must be non-nil") + } + if transport == nil && factory == nil { + panic("qwp/sf: nil transport requires a non-nil reconnect factory") + } + // Non-positive tunables fall back to the package defaults. + if parkInterval <= 0 { + parkInterval = qwpSfDefaultParkInterval + } + if reconnectMaxDuration <= 0 { + reconnectMaxDuration = qwpSfDefaultReconnectMaxDuration + } + if reconnectInitialBackoff <= 0 { + reconnectInitialBackoff = qwpSfDefaultReconnectInitialBackoff + } + if reconnectMaxBackoff <= 0 { + reconnectMaxBackoff = qwpSfDefaultReconnectMaxBackoff + } + // ctx is the loop's master context; sendLoopClose calls cancel. + ctx, cancel := context.WithCancel(context.Background()) + l := &qwpSfSendLoop{ + engine: engine, + parkInterval: parkInterval, + reconnectFactory: factory, + reconnectMaxDuration: reconnectMaxDuration, + reconnectInitialBackoff: reconnectInitialBackoff, + reconnectMaxBackoff: reconnectMaxBackoff, + ctx: ctx, + cancel: cancel, + done: make(chan struct{}), + wakeup: make(chan struct{}, 1), + replayTargetFsn: -1, + previousIdx: -1, + } + l.policyResolver.Store(&qwpSfPolicyResolver{}) + l.dispatcher.Store(newQwpSfErrorDispatcher(nil, qwpSfDefaultErrorInboxCapacity)) + l.transport.Store(transport) + // Seed the "nothing fully sent yet" / "nothing ACK'd yet" sentinels; + // positionCursorForStart and swapClient re-establish both on every + // (re)connect. + l.highestFullySent.Store(-1) + l.serverAckedSeq.Store(-1) + // Wire the producer's per-publish doorbell. Set here (before + // sendLoopStart and before any producer append) so it satisfies + // the ring's "set once before producing starts" contract, and so + // every construction path — memory and SF — gets it for free.
+ engine.engineSetSendLoopWakeup(l.wakeSender) + return l +} + +// wakeSender pushes a non-blocking token so a parked senderLoop wakes +// on the very next iteration. Cheap; safe to call from any goroutine; +// idempotent (multiple publishes coalesce into the single slot). +// No-op when a token is already pending. Mirrors +// qwpSfSegmentManager.wakeWorker. +func (l *qwpSfSendLoop) wakeSender() { + select { + case l.wakeup <- struct{}{}: + default: + // The single-slot buffer already holds a token; drop this one. + } +} + +// sendLoopSetOnTransportSwap installs a callback fired by swapClient +// after each successful transport bind (initial sync connect on the +// memory-mode path, and every reconnect on either path). The +// sender uses it to refresh state derived from the upgrade +// response — currently the X-QWP-Max-Batch-Size-derived +// auto_flush_bytes clamp. Last call wins: a later call replaces the +// previous callback. Pass nil to clear. Safe to call before +// sendLoopStart or while the loop is running (atomic install). +// +// The callback runs on whichever goroutine triggered the swap: the +// producer goroutine for the constructor's seed call, the I/O +// goroutine for every reconnect. Implementations must be cheap and +// non-blocking — the swap path is on the wire's critical path. +func (l *qwpSfSendLoop) sendLoopSetOnTransportSwap(cb func(*qwpTransport)) { + if cb == nil { + l.onTransportSwap.Store(nil) + return + } + // cb is this call's own parameter copy, so &cb is a distinct + // pointer for every install. + l.onTransportSwap.Store(&cb) +} + +// sendLoopSetHostTracker installs the failover.md §2 host-health +// tracker. Optional — when not called, the loop builds a 1-host +// implicit tracker on first connectWithBackoff entry so all paths +// converge on the round-walk machinery. initialBoundIdx is the +// host index the caller already bound (e.g. from +// qwpSfConnectWithRetry's initial-sync path); pass -1 when no host +// has been bound yet (initial-async path) or for legacy single-host +// tests. MUST be called before sendLoopStart; not safe to call +// concurrently.
+func (l *qwpSfSendLoop) sendLoopSetHostTracker(tracker *qwpHostTracker, initialBoundIdx int) { + // Plain field writes with no synchronization of their own. + l.tracker = tracker + l.previousIdx = initialBoundIdx +} + +// sendLoopSetPolicyResolver replaces the policy resolver used to map +// Categories to Policies. Safe to call any time — the resolver is +// stored atomically and the receiver goroutine picks up the new value +// on its next classified rejection. Pass nil to fall back to spec +// defaults. +func (l *qwpSfSendLoop) sendLoopSetPolicyResolver(r *qwpSfPolicyResolver) { + if r == nil { + // Substitute a zero-value resolver so the stored pointer is + // never nil. + r = &qwpSfPolicyResolver{} + } + l.policyResolver.Store(r) +} + +// sendLoopSetErrorHandler replaces the user-supplied SenderErrorHandler +// and the dispatcher's inbox capacity. Safe to call any time — the +// dispatcher is swapped atomically and the previous one is closed +// (its in-flight goroutine drains briefly, then exits). Passing +// handler=nil reverts to the default loud-not-silent handler; +// capacity ≤ 0 selects qwpSfDefaultErrorInboxCapacity (it does not +// carry over the previous dispatcher's capacity). +// +// Note: any notifications still queued on the previous dispatcher at +// swap time are subject to its drain timeout — extremely fast swap + +// flood scenarios may lose a notification, matching offer's +// best-effort contract. +// +// Safe to call from within a SenderErrorHandler: old.close() detects +// that it is running on the old dispatcher's own loop goroutine and +// returns without joining itself (see qwpSfErrorDispatcher.close). +// +// NOTE(review): sendLoopClose closes whichever dispatcher is installed +// at that moment, so a call made after sendLoopClose installs a fresh +// dispatcher that nothing visible here closes — confirm callers cannot +// do that. +func (l *qwpSfSendLoop) sendLoopSetErrorHandler(handler SenderErrorHandler, capacity int) { + if capacity <= 0 { + capacity = qwpSfDefaultErrorInboxCapacity + } + // Install the replacement first, then close the displaced dispatcher. + old := l.dispatcher.Swap(newQwpSfErrorDispatcher(handler, capacity)) + if old != nil { + old.close() + } +} + +// sendLoopDispatcher exposes the dispatcher for counter accessors on +// the QwpSender public surface. Safe to call concurrently with +// sendLoopSetErrorHandler — returns whatever dispatcher is current +// at the moment of call.
+func (l *qwpSfSendLoop) sendLoopDispatcher() *qwpSfErrorDispatcher { + return l.dispatcher.Load() +} + +// sendLoopStart launches the I/O goroutine. Not idempotent: it is +// meant to be called once, and a second call while the loop is still +// running panics. +// +// NOTE(review): the guard is the running flag, which recordFatal, +// recordFatalServerError and sendLoopClose also clear. A repeat call +// made after the loop has stopped therefore passes the CAS and launches +// run() again instead of panicking — confirm no caller can do that. +func (l *qwpSfSendLoop) sendLoopStart() { + if !l.running.CompareAndSwap(false, true) { + panic("qwp/sf: send loop already started") + } + // Position cursor at the first unsent FSN before the goroutine + // observes any state. If the walk hits a corrupt frame header, + // latch the error and still spin up the goroutine — its first + // iteration sees running=false and exits cleanly, releasing + // wg/done. Producer-side calls then surface the latched error. + if err := l.positionCursorForStart(); err != nil { + l.recordFatal(err) + } + l.wg.Add(1) + go l.run() +} + +// sendLoopClose stops the I/O goroutine and waits for it to exit. +// Idempotent. Safe to call from any goroutine. Returns the first +// terminal error the loop latched, or nil. +func (l *qwpSfSendLoop) sendLoopClose() error { + l.running.Store(false) + l.cancel() + l.wg.Wait() + if t := l.transport.Swap(nil); t != nil { + // Best-effort: the transport close error is discarded; the + // latched loop error is what this method reports. + _ = t.close() + } + if d := l.dispatcher.Load(); d != nil { + d.close() + } + return l.checkErrorOrNil() +} + +// sendLoopCheckError returns the first terminal error the I/O +// goroutine recorded, or nil. Producers should sample this on +// every public API call so wire failures don't stay silent. +func (l *qwpSfSendLoop) sendLoopCheckError() error { + return l.checkErrorOrNil() +} + +// sendLoopDone returns a channel closed when the I/O goroutine exits — +// on graceful sendLoopClose and on every terminal HALT path alike +// (run() closes it via defer). A blocked AwaitAckedFsn selects on it so +// it stops waiting once ackedFsn can no longer advance, rather than +// hanging until its ctx fires.
+func (l *qwpSfSendLoop) sendLoopDone() <-chan struct{} { + return l.done +} + +// checkErrorOrNil returns the latched terminal error, or nil when none +// has been recorded. +func (l *qwpSfSendLoop) checkErrorOrNil() error { + if p := l.lastError.Load(); p != nil { + return *p + } + return nil +} + +// recordFatal latches err as the terminal error unless one is already +// latched (first failure wins), then clears the running flag. A nil +// err is ignored. +func (l *qwpSfSendLoop) recordFatal(err error) { + if err == nil { + return + } + l.lastError.CompareAndSwap(nil, &err) + l.running.Store(false) +} + +// recordFatalServerError latches a typed *SenderError as the terminal +// error. It populates both lastError (so producer-side errors.As +// continues to work) and lastTerminalServerError (so the QwpSender +// accessor can return the typed payload directly without an unwrap +// walk). Idempotent — only the first failure wins, matching +// recordFatal's semantics. +// +// Invariant: callers MUST invoke this before dispatcher.offer(se) on +// any HALT path. The dispatcher delivers asynchronously to user +// handlers that may synchronously probe sendLoopCheckError() or call +// Flush; if the latch is written after offer, those probes race and +// can see nil. See qwp-cursor-error-api.md §120 and the Java +// CursorWebSocketSendLoop comments around recordFatal/dispatchError. +func (l *qwpSfSendLoop) recordFatalServerError(se *SenderError) { + if se == nil { + return + } + // se is known non-nil here, so the interface value below is a + // genuine non-nil error (no typed-nil trap). + var err error = se + // NOTE(review): the two latches are independent CompareAndSwap + // calls. If recordFatal already latched an untyped error, lastError + // keeps it while lastTerminalServerError still takes se, so the + // typed accessor can report a payload that is not the latched + // terminal error — confirm this divergence is intended. + l.lastError.CompareAndSwap(nil, &err) + l.lastTerminalServerError.CompareAndSwap(nil, se) + l.running.Store(false) +} + +// sendLoopLastTerminalServerError returns the typed *SenderError the +// I/O goroutine latched as terminal, or nil if either no terminal +// error has occurred or the terminal error has no typed payload +// (legacy recordFatal path used for transport-only failures). +func (l *qwpSfSendLoop) sendLoopLastTerminalServerError() *SenderError { + return l.lastTerminalServerError.Load() +} + +// sendLoopTotalServerErrors returns the cumulative count of +// SenderError payloads built by the loop (DROP and HALT combined).
func (l *qwpSfSendLoop) sendLoopTotalServerErrors() int64 {
	return l.totalServerErrors.Load()
}

// sendLoopFsnAtZero returns the FSN that wireSeq=0 maps to on the
// current connection. Useful for tests asserting reconnect
// repositioning.
func (l *qwpSfSendLoop) sendLoopFsnAtZero() int64 {
	return l.fsnAtZero.Load()
}

// sendLoopTotalReconnects returns the count of successful
// reconnects since startup. connectWithBackoff bumps it on every
// successful bind, so a successful async initial connect is counted
// as well.
func (l *qwpSfSendLoop) sendLoopTotalReconnects() int64 {
	return l.totalReconnects.Load()
}

// sendLoopTotalReconnectAttempts returns reconnect attempts
// (succeeded + failed).
func (l *qwpSfSendLoop) sendLoopTotalReconnectAttempts() int64 {
	return l.totalReconnectAttempts.Load()
}

// sendLoopReconnectStatus reports whether the I/O loop is currently
// inside connectWithBackoff. When reconnecting is true, attempts is
// the per-outage attempt counter and outageStart is the wall-clock
// time the current outage began. When reconnecting is false,
// attempts is 0 and outageStart is the zero time.Time.
//
// attempts is normally ≥ 1 while reconnecting, but it can read 0:
// connectWithBackoff resets the counter to 0 when the outage starts
// and only bumps it once a dial attempt is made, and the two atomics
// are loaded separately here, so the outage may also end between the
// two loads.
//
// Used by engineAppendBlocking to enrich the backpressure timeout
// error per spec §16: distinguish "publishing but slow" from
// "reconnecting" with attempt count + outage start.
func (l *qwpSfSendLoop) sendLoopReconnectStatus() (reconnecting bool, attempts int64, outageStart time.Time) {
	// outageStartUnixNano doubles as the "reconnecting" flag: zero
	// means no outage is in progress.
	startNanos := l.outageStartUnixNano.Load()
	if startNanos == 0 {
		return false, 0, time.Time{}
	}
	return true, l.reconnectAttempts.Load(), time.Unix(0, startNanos)
}

// sendLoopTotalFramesSent returns the cumulative frame count
// transmitted on the wire. Includes replays.
func (l *qwpSfSendLoop) sendLoopTotalFramesSent() int64 {
	return l.totalFramesSent.Load()
}

// sendLoopTotalAcks returns the cumulative ACK count received. The
// receiver also bumps it for rejections resolved by
// PolicyDropAndContinue.
func (l *qwpSfSendLoop) sendLoopTotalAcks() int64 {
	return l.totalAcks.Load()
}

// sendLoopTotalFramesReplayed returns the cumulative count of
// frames re-emitted on the post-reconnect catch-up window — i.e.
// frames whose FSN was already on the wire before the drop.
func (l *qwpSfSendLoop) sendLoopTotalFramesReplayed() int64 {
	return l.totalFramesReplayed.Load()
}

// positionCursorForStart sets fsnAtZero, nextWireSeq,
// highestFullySent, serverAckedSeq, framesSentOnConn, and the cursor
// (sendingSegment + sendOffset) to the first unsent FSN. sendLoopStart
// calls it on the starting goroutine before the I/O goroutine is
// launched, so the loop never observes a half-initialised cursor.
// Returns a non-nil error if the cursor walk hits a corrupt frame
// header; see positionCursorAt.
func (l *qwpSfSendLoop) positionCursorForStart() error {
	// wireSeq=0 maps to the first FSN the engine has not seen ACKed.
	replayStart := l.engine.engineAckedFsn() + 1
	l.fsnAtZero.Store(replayStart)
	l.nextWireSeq.Store(0)
	// -1 means "nothing sent / nothing ACKed on this connection yet".
	l.highestFullySent.Store(-1)
	l.serverAckedSeq.Store(-1)
	l.framesSentOnConn.Store(0)
	return l.positionCursorAt(replayStart)
}

// positionCursorAt points the cursor (sendingSegment + sendOffset) at
// the frame for targetFsn. It is called at startup and after every
// reconnect, once fsnAtZero has been reset to targetFsn and nextWireSeq
// to 0.
//
// If targetFsn is already published, the cursor lands exactly on that
// frame. If targetFsn is not published yet, the cursor parks at the
// active segment's current tip and the normal send loop waits for the
// producer to publish more bytes.
//
// Returns a non-nil error if the frame walk hits a corrupt header; see
// positionCursorInSegment.
func (l *qwpSfSendLoop) positionCursorAt(targetFsn int64) error {
	seg := l.engine.engineFindSegmentContaining(targetFsn)
	if seg == nil {
		// No segment currently advertises targetFsn. That normally
		// means targetFsn is just past publishedFsn and there is
		// nothing to replay yet, so the cursor resumes from the active
		// tip.
		//
		// The producer runs concurrently with this I/O goroutine,
		// though: it can publish targetFsn after the lookup above
		// returns nil but before (or during) the active-tip snapshot
		// below. publishedOffset() reads publishedCursor, which
		// tryAppend stores AFTER it increments frameCount — so if this
		// read observes the new frame's bytes, the frameCount bump that
		// makes targetFsn discoverable is necessarily visible too, and
		// the re-check below finds it and lands the cursor exactly on
		// targetFsn (keeping wireSeq=0 mapped to targetFsn). Without the
		// re-check we would park at the post-publish tip — one frame
		// past targetFsn — dropping targetFsn and misnumbering every
		// following frame by one, i.e. silent row loss on
		// reconnect-under-load (see Java PR #40). If the producer
		// publishes only later, both lookups miss, sendOffset stays at
		// the old tip, and trySendOne sends the frame normally.
		l.sendingSegment = l.engine.engineActiveSegment()
		if l.sendingSegment == nil {
			// No active segment yet: leave the segment nil and point
			// the offset just past the segment header.
			l.sendOffset = qwpSfHeaderSize
			return nil
		}
		l.sendOffset = l.sendingSegment.publishedOffset()
		if seg = l.engine.engineFindSegmentContaining(targetFsn); seg != nil {
			return l.positionCursorInSegment(seg, targetFsn)
		}
		return nil
	}
	return l.positionCursorInSegment(seg, targetFsn)
}

// positionCursorInSegment points sendingSegment/sendOffset at targetFsn
// inside seg, which the caller has already established contains it.
// Segment frame boundaries are not indexed, so it walks payload strides
// from the segment's baseSeq until it reaches targetFsn.
//
// Returns a non-nil error if a frame header along the walk has a
// payloadLen that is negative or that would push the walk past the
// end of the segment buffer — defense-in-depth against a corrupt
// segment that escaped CRC recovery. Without these bounds a
// corrupt-but-positive length (e.g. 0x7FFFFFFF) would overrun offset
// and panic on the next slice index; the panic fires on the
// unrecovered I/O goroutine and crashes the process, bypassing
// recordFatal. Mirrors the bound in qwpSfScanFrames. tryAppend
// validates payloadLen on write and recovery's CRC scan validates it
// on startup, so this is not expected to fire in practice; the callers
// route the returned error through recordFatal.
//
// Only the frames before targetFsn are walked and validated; the
// header of the frame at targetFsn itself is not read here.
func (l *qwpSfSendLoop) positionCursorInSegment(seg *qwpSfSegment, targetFsn int64) error {
	l.sendingSegment = seg
	// Walk frame-by-frame from HEADER_SIZE until we land on targetFsn.
	offset := qwpSfHeaderSize
	fsn := seg.segmentBaseSeq()
	base := seg.address()
	segLen := int64(len(base))
	for fsn < targetFsn {
		// Bound the header read itself: a prior corrupt stride could
		// have left offset within the buffer but with fewer than
		// qwpSfFrameHeaderSize bytes remaining.
		if offset < qwpSfHeaderSize || offset+qwpSfFrameHeaderSize > segLen {
			return fmt.Errorf("qwp/sf: frame header at offset %d overruns segment size %d baseSeq=%d (corrupt segment)",
				offset, segLen, seg.segmentBaseSeq())
		}
		// payloadLen is read from bytes 4..8 of the frame header as a
		// little-endian signed 32-bit value.
		payloadLen := int64(int32(binary.LittleEndian.Uint32(base[offset+4 : offset+8])))
		// Reject negative and corrupt-but-positive lengths: a stride
		// that runs past the buffer would panic the next iteration's
		// slice index on the unrecovered I/O goroutine.
		if payloadLen < 0 || offset+qwpSfFrameHeaderSize+payloadLen > segLen {
			return fmt.Errorf("qwp/sf: invalid payloadLen %d at offset %d in segment baseSeq=%d size=%d (corrupt segment)",
				payloadLen, offset, seg.segmentBaseSeq(), segLen)
		}
		offset += qwpSfFrameHeaderSize + payloadLen
		fsn++
	}
	l.sendOffset = offset
	return nil
}

// run is the outer reconnect loop. Each iteration runs one
// connection's worth of I/O via runOneConnection; on wire failure
// it backs off and reconnects (if a factory is wired) or records
// the failure as terminal and exits.
//
// When the loop is constructed with a nil transport (the
// `initial_connect_retry=async` path) the very first iteration
// performs the initial dial in-band on this goroutine using the
// same backoff loop as reconnect. Producers that publish before
// the wire is up experience backpressure via engineAppendBlocking;
// terminal initial-connect failures are surfaced via the dispatcher
// and latched as the loop's terminal error.
func (l *qwpSfSendLoop) run() {
	// Deferred calls run last-in-first-out: the transport is released
	// first, then done is closed, then the WaitGroup is released.
	defer l.wg.Done()
	defer close(l.done)
	// Release the active transport on every exit from this loop,
	// including a terminal HALT (recordFatal* + offer, then return)
	// where no reconnect or Close has swapped it out yet. Without this
	// the dead WebSocket — and its server-side connection — would
	// linger until the user eventually calls Close(). Idempotent and
	// nil-safe: on a clean shutdown sendLoopClose has not yet swapped
	// the transport (it does so after wg.Wait), so the swap here wins
	// and its later swap sees nil; close() guards a nil conn and pins
	// one result via closeOnce.
	defer func() {
		if t := l.transport.Swap(nil); t != nil {
			_ = t.close()
		}
	}()

	// Async initial connect: no transport installed yet and the loop
	// has not been stopped or latched an error.
	if l.transport.Load() == nil && l.running.Load() {
		initial := errors.New("async initial connect deferred to I/O goroutine")
		if !l.connectWithBackoff(initial, "initial connect") {
			return
		}
	}

	for l.running.Load() {
		err := l.runOneConnection()
		// running=false here means Close or a latched terminal error;
		// either way there is nothing left to classify.
		if !l.running.Load() {
			return
		}
		// Decide: terminal or recoverable?
		if err == nil {
			return
		}
		// Already-terminal SenderErrors come back here from
		// receiverLoop's classify branch — route them through
		// recordFatalServerError (idempotent) so the typed payload is
		// preserved end-to-end.
		var alreadyTyped *SenderError
		if errors.As(err, &alreadyTyped) {
			l.recordFatalServerError(alreadyTyped)
			return
		}
		// WebSocket close-frame violations (PROTOCOL_ERROR 1002,
		// UNSUPPORTED_DATA 1003, MESSAGE_TOO_BIG 1009, etc.) come up
		// from either inner goroutine via runOneConnection's first-
		// error aggregation. They map to ProtocolViolation+Halt; do
		// not retry — replaying the same bytes will produce the same
		// close frame.
		if code := websocket.CloseStatus(err); qwpSfIsTerminalCloseCode(code) {
			se := l.qwpSfBuildProtocolViolationSE(code, err.Error())
			l.totalServerErrors.Add(1)
			// Latch BEFORE dispatching: a handler that synchronously
			// calls Flush / sendLoopCheckError must observe the typed
			// terminal error. See qwp-cursor-error-api.md §120.
			l.recordFatalServerError(se)
			l.dispatcher.Load().offer(se)
			return
		}
		// Without a reconnect factory every remaining wire failure is
		// terminal; it is latched untyped and not offered to the
		// dispatcher.
		if l.reconnectFactory == nil {
			l.recordFatal(err)
			return
		}
		if qwpSfIsTerminalUpgradeError(err) {
			se := l.qwpSfBuildUpgradeFailureSE(err)
			l.totalServerErrors.Add(1)
			l.recordFatalServerError(se)
			l.dispatcher.Load().offer(se)
			return
		}
		// Detect "server up, accepts the WS upgrade, but doesn't speak
		// our QWP protocol" — the dial succeeds every time, so plain
		// reconnect-with-backoff would hammer the server in a hot
		// loop until reconnectMaxDuration expires (5 min default),
		// burning thousands of ephemeral ports per second.
		//
		// Gate on *lifetime* ACK history (totalAcks), not the per-
		// connection counter: once any ACK has been observed across
		// this sender's life, we have proof the server speaks our
		// wire-format dialect, so a later silent disconnect is a
		// transient outage (LB drain emitting WS 1001 GoingAway, TCP
		// RST surfacing as 1006, proxy reset, graceful 1011/1012/
		// 1013 — none of which are flagged terminal by
		// qwpSfIsTerminalCloseCode) and reconnect is the right
		// reaction. The never-ACK'd case is the terminal candidate
		// here: the port-hammering signature is a fresh sender whose
		// every dial succeeds and every frame is met with silence,
		// repeatedly. The strike-count gate below decides when that
		// pattern has repeated enough to be conclusive.
		if l.framesSentOnConn.Load() > 0 && l.totalAcks.Load() == 0 {
			// This connection finished the WS upgrade and the X-QWP-
			// Version negotiation, sent frames, then closed without
			// ACKing any of them — and no prior connection on this
			// sender has ACK'd anything either.
			//
			// A single such strike is ambiguous: a routine server
			// restart or LB RST landing in the window between a fresh
			// sender's first frame and its first ACK produces the
			// identical signature, so it counts as a strike and falls
			// through to an ordinary reconnect+replay. Reaching
			// qwpSfMaxSilentConnStrikes consecutive ACK-less
			// connections — at least one full reconnect+replay cycle
			// that still met nothing but silence — is conclusive
			// evidence the server isn't speaking our wire-format
			// dialect (most often: a server build older than this
			// client's branch, even if both sides declared the same
			// X-QWP-Version). At that point we fail terminally to
			// avoid hammering the server with thousands of dial
			// attempts per second until reconnectMaxDuration expires.
			if l.silentConnStrikes.Add(1) >= qwpSfMaxSilentConnStrikes {
				reason := fmt.Sprintf(
					"server accepted the WebSocket upgrade but %d consecutive "+
						"connection(s) disconnected without ACKing any of the "+
						"frames we sent — server is likely running an incompatible "+
						"build (won't retry): %s",
					l.silentConnStrikes.Load(), err.Error())
				se := l.qwpSfBuildBudgetExhaustedSE(reason)
				l.totalServerErrors.Add(1)
				l.recordFatalServerError(se)
				l.dispatcher.Load().offer(se)
				return
			}
			// Fall through to reconnect+replay. If the next connection
			// also sends frames and meets silence the strike count
			// crosses the threshold and we HALT then.
		}
		// Reconnect with backoff. connectWithBackoff latches its own
		// terminal error before returning false.
		ok := l.connectWithBackoff(err, "reconnect")
		if !ok {
			return
		}
	}
}

// runOneConnection runs the send + receive goroutines for the
// currently-installed transport until one of them returns.
Returns +// the first error seen, or nil for a clean exit (running=false). +// +// On a successful reconnect, the outer loop calls +// repositionForReconnect to reset wire state and replay window +// before this method runs again. +func (l *qwpSfSendLoop) runOneConnection() error { + connCtx, connCancel := context.WithCancel(l.ctx) + defer connCancel() + + type loopErr struct{ err error } + errCh := make(chan loopErr, 2) + + var inner sync.WaitGroup + inner.Add(2) + go func() { + defer inner.Done() + err := l.senderLoop(connCtx) + errCh <- loopErr{err} + connCancel() + }() + go func() { + defer inner.Done() + err := l.receiverLoop(connCtx) + errCh <- loopErr{err} + connCancel() + }() + inner.Wait() + close(errCh) + var first error + for e := range errCh { + if e.err != nil && first == nil { + first = e.err + } + } + return first +} + +// senderLoop walks the engine's frames and sends each as one +// WebSocket binary message. Returns ctx.Err() on shutdown or the +// transport's send error on wire failure. +func (l *qwpSfSendLoop) senderLoop(ctx context.Context) error { + // A single reusable timer backs the fallback poll, re-armed each + // idle iteration. The doorbell (wakeup) drives the common case, so + // the timer only bounds how long a missed wakeup can stall a ready + // frame; it never gates steady-state latency. + timer := time.NewTimer(l.parkInterval) + defer timer.Stop() + for { + if err := ctx.Err(); err != nil { + return nil // clean shutdown + } + if !l.running.Load() { + return nil + } + didWork, err := l.trySendOne(ctx) + if err != nil { + return err + } + if !didWork { + // Drain a possibly-fired timer before Reset (same + // dance as qwpSfSegmentManager.workerLoop). Wake on + // shutdown, a producer doorbell, or the fallback tick. 
+ if !timer.Stop() { + select { + case <-timer.C: + default: + } + } + timer.Reset(l.parkInterval) + select { + case <-ctx.Done(): + return nil + case <-l.wakeup: + case <-timer.C: + } + } + } +} + +// trySendOne sends at most one frame. Returns (true, nil) if it +// sent a frame, (false, nil) if there's nothing ready, or (false, +// err) on wire failure. +// +// Bounded: at most one frame per call so the receiver goroutine +// gets scheduling fairness. +func (l *qwpSfSendLoop) trySendOne(ctx context.Context) (bool, error) { + if l.sendingSegment == nil { + l.sendingSegment = l.engine.engineActiveSegment() + if l.sendingSegment == nil { + return false, nil + } + l.sendOffset = qwpSfHeaderSize + } + pub := l.sendingSegment.publishedOffset() + if l.sendOffset >= pub { + // Nothing more in the current segment. If it's a sealed + // segment (no longer the live active), advance to the next. + if l.sendingSegment != l.engine.engineActiveSegment() { + next := l.advanceSegment() + if next != l.sendingSegment { + l.sendingSegment = next + l.sendOffset = qwpSfHeaderSize + return true, nil + } + } + return false, nil + } + if l.sendOffset+qwpSfFrameHeaderSize > pub { + return false, nil + } + base := l.sendingSegment.address() + payloadLen := int64(int32(binary.LittleEndian.Uint32(base[l.sendOffset+4 : l.sendOffset+8]))) + if payloadLen < 0 { + return false, fmt.Errorf("qwp/sf: negative payloadLen at offset %d in segment baseSeq=%d", + l.sendOffset, l.sendingSegment.segmentBaseSeq()) + } + frameEnd := l.sendOffset + qwpSfFrameHeaderSize + payloadLen + if frameEnd > pub { + return false, nil // payload not fully published yet + } + transport := l.transport.Load() + if transport == nil { + return false, errors.New("qwp/sf: transport gone mid-loop") + } + payload := base[l.sendOffset+qwpSfFrameHeaderSize : frameEnd] + // wireSeq/fsnSent for this frame derive from nextWireSeq, which + // the send goroutine advances here before the wire write. 
A wire + // failure thus leaves nextWireSeq advanced for a frame that never + // made it out; that is harmless because every reconnect path + // resets it via swapClient/positionCursorForStart. The receiver's + // ACK clamp keys off highestFullySent — advanced only after the + // write below returns — not off nextWireSeq, so a server's + // early/forged ACK cannot ride this pre-bump to cover the + // in-flight frame. + wireSeq := l.nextWireSeq.Load() + fsnSent := l.fsnAtZero.Load() + wireSeq + l.nextWireSeq.Store(wireSeq + 1) + if err := transport.sendMessage(ctx, payload); err != nil { + // Treat ctx-cancelled as a clean shutdown rather than a + // wire failure — runOneConnection will return nil and the + // outer loop sees running=false and exits. + if ctx.Err() != nil { + return false, nil + } + return false, err + } + // The frame is fully on the wire. Publish highestFullySent only + // now, after sendMessage returns: this is what lets the receiver + // safely let an ACK advance ackedFsn over this frame. Until this + // store the receiver clamps any ACK naming this sequence down to + // the previous frame, so the segment manager cannot trim (munmap) + // the segment while the payload slice we handed sendMessage still + // points into it. + l.highestFullySent.Store(wireSeq) + // An ACK for this frame may already have landed and been held back + // while highestFullySent still trailed it; reconcile now that the + // watermark is published so a quiescent last frame — whose ACK has + // no later ACK to re-drive it — does not strand its acknowledgement. + l.applyAckWatermark() + l.sendOffset = frameEnd + l.totalFramesSent.Add(1) + l.framesSentOnConn.Add(1) + if l.replayTargetFsn >= 0 { + l.totalFramesReplayed.Add(1) + if fsnSent >= l.replayTargetFsn { + l.replayTargetFsn = -1 + } + } + return true, nil +} + +// advanceSegment walks to the next segment when the current one is +// sealed and fully drained. 
Mirrors Java's CursorWebSocketSendLoop +// state machine: prefer the next sealed-by-baseSeq segment; fall +// back to the active if no later sealed exists; fall back to the +// oldest remaining sealed if our current was trimmed out from +// under us. +func (l *qwpSfSendLoop) advanceSegment() *qwpSfSegment { + current := l.sendingSegment + liveActive := l.engine.engineActiveSegment() + if current == liveActive { + return current + } + next := l.engine.engineNextSealedAfter(current) + if next != nil { + return next + } + first := l.engine.engineFirstSealed() + if first != nil && first.segmentBaseSeq() > current.segmentBaseSeq() { + return first + } + return liveActive +} + +// applyAckWatermark advances the engine's ACK cursor to the lesser of +// the server's cumulative ACK sequence (serverAckedSeq, owned by the +// receiver) and the highest wire sequence whose send has fully returned +// (highestFullySent, owned by the sender), mapped through fsnAtZero. +// Both inputs are monotonic within a connection but written on separate +// goroutines, so it is called from both: by the receiver as each ACK +// lands, and by the sender right after it publishes a fresh +// highestFullySent. Whichever store completes last observes both values +// and drives the advance — closing the race where the ACK for the only +// in-flight frame arrives before the send completes and would otherwise +// be stranded (no later ACK to re-drive it, leaving engineAckedFsn +// below publishedFsn forever). +// +// The min is the munmap-safety clamp: capping at highestFullySent keeps +// ackedFsn off any frame the send goroutine is still reading out of the +// mmap'd segment, and off a frame a wire failure dropped before +// delivery — so a non-compliant server's early or forged ACK cannot +// move the watermark past what we have actually put on the wire. 
// engineAcknowledge is monotonic, idempotent, and clamps to
// publishedFsn internally, so the concurrent calls from the two
// goroutines are safe and a stale-lower min is ignored.
func (l *qwpSfSendLoop) applyAckWatermark() {
	sent := l.highestFullySent.Load()
	acked := l.serverAckedSeq.Load()
	// Either watermark still at its -1 reset value means there is
	// nothing to acknowledge on this connection yet.
	if sent < 0 || acked < 0 {
		return
	}
	if acked > sent {
		acked = sent
	}
	l.engine.engineAcknowledge(l.fsnAtZero.Load() + acked)
}

// receiverLoop reads ACKs from the WebSocket and routes them to
// the engine. Returns nil on shutdown (context cancelled or
// running=false), the typed *SenderError when a server rejection
// resolves to PolicyHalt, or the transport's read error on wire
// failure.
func (l *qwpSfSendLoop) receiverLoop(ctx context.Context) error {
	for {
		if err := ctx.Err(); err != nil {
			return nil
		}
		if !l.running.Load() {
			return nil
		}
		transport := l.transport.Load()
		if transport == nil {
			return errors.New("qwp/sf: transport gone mid-loop")
		}
		status, data, err := transport.readAck(ctx)
		if err != nil {
			// A read error caused by cancellation is a clean shutdown,
			// not a wire failure.
			if ctx.Err() != nil {
				return nil
			}
			return err
		}
		if status == QwpStatusDurableAck {
			// Per-table fsync confirmation. Cursor SF doesn't
			// currently surface durable-ack progress to the
			// producer, but receiving one is not an error — match
			// the Java client and silently ignore.
			continue
		}
		seq := parseAckSequence(data)
		if status != QwpStatusOK {
			// Application-layer rejection by the server. Classify the
			// status byte, resolve the policy, surface a typed
			// SenderError. Halt latches and exits the receiver loop;
			// DropAndContinue advances ackedFsn past the rejected
			// span and keeps draining (the bytes on disk are the
			// bytes the server rejected — reconnect/replay cannot
			// fix them; only dropping moves us past them).
			//
			// Sanity clamp: do not trust a rejection wireSeq beyond the
			// frames whose sendMessage has fully returned. Without this
			// clamp the DROP path can advance ackedFsn over an in-flight
			// or never-delivered frame, which makes the segment manager
			// trim (munmap) a segment the I/O thread is still reading.
			// Mirrors handleServerRejection in the Java client. The
			// clamp only feeds the FSN math; the reported MessageSequence
			// is the raw server-sent seq so it round-trips verbatim
			// against server-side logs.
			highestSent := l.highestFullySent.Load()
			_, _, msg := parseAckErrorPayload(data)
			cat := qwpSfClassify(status)
			pol := l.policyResolver.Load().resolve(cat)
			if highestSent < 0 {
				// Pre-send rejection: no frame has finished sending on
				// this connection yet, so the server emitted the error
				// frame before it could have received one of ours
				// (typical right after a fresh swapClient — auth failure,
				// server-initiated halt, etc.). The server-named
				// wireSeq does not correspond to any frame we delivered,
				// so clamping to 0 and acknowledging fsnAtZero would
				// silently advance ackedFsn past a real unsent batch
				// (fsnAtZero == ackedFsn + 1 right after a swap).
				// Attribute the failure to the unacked
				// [ackedFsn+1, publishedFsn] window — the same span
				// the protocol-violation close path uses — and skip
				// the watermark advance entirely; there is nothing
				// on this connection to drop. Still surface the
				// typed error so HALT latches and the handler fires.
				// Mirrors handlePreSendRejection in the Java client.
				from := l.engine.engineAckedFsn() + 1
				to := l.engine.enginePublishedFsn()
				if to < from {
					to = from
				}
				se := &SenderError{
					Category:         cat,
					AppliedPolicy:    pol,
					ServerStatusByte: int(status),
					ServerMessage:    msg,
					MessageSequence:  seq,
					FromFsn:          from,
					ToFsn:            to,
					DetectedAt:       time.Now(),
				}
				l.totalServerErrors.Add(1)
				if pol == PolicyHalt {
					// Latch before offering, as on every HALT path.
					l.recordFatalServerError(se)
					l.dispatcher.Load().offer(se)
					return se
				}
				l.dispatcher.Load().offer(se)
				continue
			}
			cappedSeq := seq
			if cappedSeq > highestSent {
				cappedSeq = highestSent
			}
			fsn := l.fsnAtZero.Load() + cappedSeq
			se := &SenderError{
				Category:         cat,
				AppliedPolicy:    pol,
				ServerStatusByte: int(status),
				ServerMessage:    msg,
				MessageSequence:  seq,
				FromFsn:          fsn,
				ToFsn:            fsn,
				DetectedAt:       time.Now(),
			}
			l.totalServerErrors.Add(1)
			if pol == PolicyHalt {
				// Latch BEFORE dispatching: a handler that
				// synchronously calls Flush / sendLoopCheckError
				// must observe the typed terminal error. See
				// qwp-cursor-error-api.md §120.
				l.recordFatalServerError(se)
				l.dispatcher.Load().offer(se)
				return se
			}
			l.dispatcher.Load().offer(se)
			// PolicyDropAndContinue: advance past the rejected span
			// via the same engine entry the success branch uses. The
			// segment manager will trim the now-acked range on its
			// next maintenance pass. Bump totalAcks for parity with
			// the success path so producer-visible counters reflect
			// "the server has resolved this batch".
			l.engine.engineAcknowledge(fsn)
			l.totalAcks.Add(1)
			continue
		}
		// Record the server's cumulative ACK sequence, then reconcile it
		// against highestFullySent. applyAckWatermark caps the advance at
		// the last fully-sent frame, so a malformed, early, or forged
		// server response can never move ackedFsn over an in-flight frame
		// (a trim would munmap a segment the I/O thread is still reading)
		// nor over a frame a wire failure dropped before delivery. The
		// matching call on the send side re-drives the same reconciliation
		// so an ACK that arrives before its frame's send completes is not
		// stranded when no later ACK follows it.
		l.serverAckedSeq.Store(seq)
		l.totalAcks.Add(1)
		l.applyAckWatermark()
	}
}

// connectWithBackoff runs the failover.md §13.6 round-walk through
// qwpSfRunRoundWalk: each iteration demotes a just-failed host
// (previousIdx), picks the highest-priority unattempted endpoint,
// dials it, and classifies the outcome. Round-boundary sleep pays
// equal-jitter exponential backoff for transport rounds and a
// non-doubling InitialBackoff for role-reject rounds. Returns true
// on a successful bind (caller resumes the pump loop), false on
// terminal failure / budget exhaustion / shutdown.
//
// Every false return except shutdown latches a terminal error first
// (recordFatal or recordFatalServerError). A successful bind bumps
// totalReconnects regardless of phase.
//
// Shared between the reconnect path (phase="reconnect") and the
// async-initial-connect path (phase="initial connect"); the phase
// string only flavors the log/error message — control flow is
// identical.
func (l *qwpSfSendLoop) connectWithBackoff(initial error, phase string) bool {
	if l.tracker == nil {
		// Legacy single-host path (tests that didn't call
		// sendLoopSetHostTracker). Synthesize an implicit 1-host
		// tracker so the round-walk machinery handles every code
		// path uniformly.
		l.tracker = newQwpHostTracker(1, "", qwpTargetAny)
	}
	// Publish the outage window for sendLoopReconnectStatus and clear
	// it again on every exit from this function.
	outageStart := time.Now()
	l.outageStartUnixNano.Store(outageStart.UnixNano())
	l.reconnectAttempts.Store(0)
	defer func() {
		l.outageStartUnixNano.Store(0)
		l.reconnectAttempts.Store(0)
	}()

	// Snapshot the entering previousIdx and consume it for this
	// connect cycle. The round-walk calls RecordMidStreamFailure
	// internally; we reset our slot so a subsequent successful
	// bind starts clean.
	enteringPreviousIdx := l.previousIdx
	l.previousIdx = -1

	params := qwpSfRoundWalkParams{
		Factory:        l.reconnectFactory,
		Tracker:        l.tracker,
		MaxDuration:    l.reconnectMaxDuration,
		InitialBackoff: l.reconnectInitialBackoff,
		MaxBackoff:     l.reconnectMaxBackoff,
		OnAttempt: func() {
			l.reconnectAttempts.Add(1)
			l.totalReconnectAttempts.Add(1)
		},
	}
	result := qwpSfRunRoundWalk(l.ctx, nil, params, enteringPreviousIdx)

	if result.Transport != nil {
		// Successful bind. Remember the idx so a subsequent
		// pump-exit can mid-stream-demote.
		l.previousIdx = result.Idx
		if swapErr := l.swapClient(result.Transport); swapErr != nil {
			// Cursor positioning detected segment corruption —
			// not retryable; reconnecting won't fix bad bytes
			// in the on-disk segment.
			l.recordFatal(swapErr)
			return false
		}
		l.totalReconnects.Add(1)
		return true
	}
	if result.Terminal != nil {
		se := l.qwpSfBuildUpgradeFailureSE(result.Terminal)
		l.totalServerErrors.Add(1)
		l.recordFatalServerError(se)
		l.dispatcher.Load().offer(se)
		return false
	}
	if result.Cancelled != nil {
		// ctx cancelled (close), or the round-walk reported a
		// configuration error. The latter is rare and benign at
		// shutdown; sample running to distinguish.
		if !l.running.Load() {
			return false
		}
		l.recordFatal(fmt.Errorf("%s aborted: %w", phase, result.Cancelled))
		return false
	}
	// Budget exhausted. Surface the underlying error chain to the
	// dispatcher; reach into qwpSfBuildBudgetExhaustedSE so the
	// SenderError carries the per-host snapshot. `initial` is the
	// caller-supplied entry error (the mid-stream failure that
	// triggered this connectWithBackoff); attach it as context.
	reason := fmt.Sprintf("%s failed: %v (after entry error: %v)",
		phase, result.Exhausted, initial)
	se := l.qwpSfBuildBudgetExhaustedSE(reason)
	l.totalServerErrors.Add(1)
	l.recordFatalServerError(se)
	l.dispatcher.Load().offer(se)
	return false
}

// swapClient replaces the active transport, realigns fsnAtZero to
// the next unacked FSN, restarts wire sequencing from 0 (clearing the
// fully-sent and server-ACK'd watermarks), and repositions the cursor
// so the next trySendOne call replays the first unacked frame. Returns
// a non-nil error if the cursor walk hits a corrupt frame header; see
// positionCursorAt.
//
// On success, fires onTransportSwap (if installed) with the new
// transport so the sender can refresh connection-derived state
// (the auto_flush_bytes clamp). The callback runs after the
// transport is published via atomic.Swap and after the cursor is
// repositioned, so any sender side effect (e.g. an updated
// effective threshold) is in place before the next trySendOne can
// publish a frame on the new connection.
func (l *qwpSfSendLoop) swapClient(newTransport *qwpTransport) error {
	old := l.transport.Swap(newTransport)
	if old != nil {
		_ = old.close()
	}
	replayStart := l.engine.engineAckedFsn() + 1
	l.fsnAtZero.Store(replayStart)
	l.nextWireSeq.Store(0)
	l.highestFullySent.Store(-1)
	l.serverAckedSeq.Store(-1)
	l.framesSentOnConn.Store(0)
	// Frames in [replayStart, pubAtSwap] are counted as replays by
	// trySendOne; -1 means there is nothing to replay.
	pubAtSwap := l.engine.enginePublishedFsn()
	if pubAtSwap >= replayStart {
		l.replayTargetFsn = pubAtSwap
	} else {
		l.replayTargetFsn = -1
	}
	if err := l.positionCursorAt(replayStart); err != nil {
		return err
	}
	if cb := l.onTransportSwap.Load(); cb != nil {
		(*cb)(newTransport)
	}
	return nil
}

// qwpSfIsTerminalUpgradeError reports whether err indicates any
// server-side WebSocket-upgrade reject that won't fix itself on
// retry — auth or protocol-mismatch alike.
Kept for backwards +// compatibility; callers that need the auth-vs-protocol split +// should use qwpSfIsAuthFailure / qwpSfIsProtocolUpgradeFailure +// instead. +func qwpSfIsTerminalUpgradeError(err error) bool { + return qwpSfIsAuthFailure(err) || qwpSfIsProtocolUpgradeFailure(err) +} + +// qwpSfIsAuthFailure reports whether err indicates the server +// rejected the WebSocket upgrade with an auth-related HTTP status +// (401 unauthorized, 403 forbidden). These map to +// CategorySecurityError on the SenderError surface. +// +// Preferred path: the transport surfaces a typed *QwpUpgradeRejectError +// with the parsed status code. Falls back to substring matching on +// coder/websocket's free-form text so any code path that bypasses the +// typed reject (e.g. a future change in the dial library) still +// classifies cleanly. +func qwpSfIsAuthFailure(err error) bool { + if err == nil { + return false + } + var rej *QwpUpgradeRejectError + if errors.As(err, &rej) { + return rej.StatusCode == 401 || rej.StatusCode == 403 + } + msg := strings.ToLower(err.Error()) + for _, marker := range []string{ + "got 401", "got 403", + "unauthorized", "forbidden", + } { + if strings.Contains(msg, marker) { + return true + } + } + return false +} + +// qwpSfIsProtocolUpgradeFailure reports whether err indicates the +// server rejected the WebSocket upgrade with a protocol-related +// HTTP status (404 not found — wrong endpoint; 426 upgrade required +// — wrong protocol version). These map to +// CategoryProtocolViolation on the SenderError surface. +// +// The round-walk (failover.md §6) treats 404/426 as transient and +// routes them through RecordTransportError so a misconfig on one +// peer does not lock the client out of healthy siblings. This +// helper remains as a defensive fallback for the run()-level outer +// branch; typed `*QwpUpgradeRejectError`s originate from the factory +// and are consumed by the round-walk, so they do not reach this +// branch in normal operation. 
+func qwpSfIsProtocolUpgradeFailure(err error) bool { + if err == nil { + return false + } + var rej *QwpUpgradeRejectError + if errors.As(err, &rej) { + return rej.StatusCode == 404 || rej.StatusCode == 426 + } + msg := strings.ToLower(err.Error()) + for _, marker := range []string{ + "got 404", "got 426", + } { + if strings.Contains(msg, marker) { + return true + } + } + return false +} + +// qwpSfBuildUpgradeFailureSE constructs a typed *SenderError for an +// upgrade-failure terminal: SecurityError for auth (401/403), +// ProtocolViolation for protocol (404/426). Callers must have +// already determined the err is one of those two via the helpers +// above. +func (l *qwpSfSendLoop) qwpSfBuildUpgradeFailureSE(err error) *SenderError { + cat := CategoryProtocolViolation + if qwpSfIsAuthFailure(err) { + cat = CategorySecurityError + } + from := l.engine.engineAckedFsn() + 1 + to := l.engine.enginePublishedFsn() + if to < from { + to = from + } + return &SenderError{ + Category: cat, + AppliedPolicy: PolicyHalt, + ServerStatusByte: NoStatusByte, + ServerMessage: "ws-upgrade-failed: " + err.Error(), + MessageSequence: NoMessageSequence, + FromFsn: from, + ToFsn: to, + DetectedAt: time.Now(), + } +} + +// qwpSfBuildProtocolViolationSE constructs a typed *SenderError for +// a terminal WebSocket close frame (PROTOCOL_ERROR / +// UNSUPPORTED_DATA / etc.). The FSN span is the unacked window at +// close time. 
+func (l *qwpSfSendLoop) qwpSfBuildProtocolViolationSE(code websocket.StatusCode, reason string) *SenderError { + from := l.engine.engineAckedFsn() + 1 + to := l.engine.enginePublishedFsn() + if to < from { + to = from + } + return &SenderError{ + Category: CategoryProtocolViolation, + AppliedPolicy: PolicyHalt, + ServerStatusByte: NoStatusByte, + ServerMessage: fmt.Sprintf("ws-close[%d]: %s", code, reason), + MessageSequence: NoMessageSequence, + FromFsn: from, + ToFsn: to, + DetectedAt: time.Now(), + } +} + +// qwpSfBuildBudgetExhaustedSE constructs a typed *SenderError for +// reconnect-budget exhaustion. Treated as a ProtocolViolation since +// the wire is gone — the FSN span is the unacked window. +func (l *qwpSfSendLoop) qwpSfBuildBudgetExhaustedSE(reason string) *SenderError { + from := l.engine.engineAckedFsn() + 1 + to := l.engine.enginePublishedFsn() + if to < from { + to = from + } + return &SenderError{ + Category: CategoryProtocolViolation, + AppliedPolicy: PolicyHalt, + ServerStatusByte: NoStatusByte, + ServerMessage: reason, + MessageSequence: NoMessageSequence, + FromFsn: from, + ToFsn: to, + DetectedAt: time.Now(), + } +} + +// qwpSfConnectWithRetry runs the failover.md §13.6 round-walk on +// the calling goroutine for the InitialConnectSync path. The walk +// retries with backoff against every host in the tracker until +// success, terminal AuthError (401/403), or budget exhaustion. +// Returns the connected transport plus the bound endpoint index so +// the caller can seed qwpSfSendLoop's previousIdx. +// +// tracker may be nil — the function synthesizes a 1-host implicit +// tracker so legacy single-host tests don't need to construct one. +// In that mode the returned idx is always 0. +// +// factory is invoked once per dial attempt; idx is the host index +// PickNext returned. Single-host callers may ignore idx. 
func qwpSfConnectWithRetry(
	ctx context.Context,
	factory qwpSfReconnectFactory,
	tracker *qwpHostTracker,
	maxDuration, initialBackoff, maxBackoff time.Duration,
) (*qwpTransport, int, error) {
	// Non-positive durations fall back to the package defaults.
	if maxDuration <= 0 {
		maxDuration = qwpSfDefaultReconnectMaxDuration
	}
	if initialBackoff <= 0 {
		initialBackoff = qwpSfDefaultReconnectInitialBackoff
	}
	if maxBackoff <= 0 {
		maxBackoff = qwpSfDefaultReconnectMaxBackoff
	}
	// No tracker supplied: synthesize a single-host one.
	if tracker == nil {
		tracker = newQwpHostTracker(1, "", qwpTargetAny)
	}
	params := qwpSfRoundWalkParams{
		Factory:        factory,
		Tracker:        tracker,
		MaxDuration:    maxDuration,
		InitialBackoff: initialBackoff,
		MaxBackoff:     maxBackoff,
	}
	result := qwpSfRunRoundWalk(ctx, nil, params, -1)
	// The result fields are inspected in priority order: a connected
	// transport wins, then a terminal (non-retryable) rejection, then
	// cancellation, then budget exhaustion. Every failure path returns
	// idx -1 alongside a nil transport.
	if result.Transport != nil {
		return result.Transport, result.Idx, nil
	}
	if result.Terminal != nil {
		return nil, -1, fmt.Errorf("qwp/sf: WebSocket upgrade failed (won't retry): %w", result.Terminal)
	}
	if result.Cancelled != nil {
		// Returned unwrapped so callers can compare it directly.
		return nil, -1, result.Cancelled
	}
	if result.Exhausted == nil {
		// None of the four outcomes was populated; report it rather
		// than dereferencing a nil Exhausted below.
		return nil, -1, errors.New("qwp/sf: round-walk returned no result")
	}
	// NOTE(review): assumes Exhausted.LastError is non-nil whenever
	// Exhausted is set; a nil value would render as "%!w(<nil>)" —
	// confirm against qwpSfRunRoundWalk.
	return nil, -1, fmt.Errorf("qwp/sf: connect failed after %s / %d attempts: %w",
		result.Exhausted.Elapsed, result.Exhausted.Attempts, result.Exhausted.LastError)
}
diff --git a/qwp_sf_send_loop_test.go b/qwp_sf_send_loop_test.go
new file mode 100644
index 00000000..70952524
--- /dev/null
+++ b/qwp_sf_send_loop_test.go
@@ -0,0 +1,1521 @@
/*+*****************************************************************************
 * ___ _ ____ ____
 * / _ \ _ _ ___ ___| |_| _ \| __ )
 * | | | | | | |/ _ \/ __| __| | | | _ \
 * | |_| | |_| | __/\__ \ |_| |_| | |_) |
 * \__\_\\__,_|\___||___/\__|____/|____/
 *
 * Copyright (c) 2014-2019 Appsicle
 * Copyright (c) 2019-2026 QuestDB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package questdb

import (
	"context"
	"errors"
	"fmt"
	"net"
	"net/http"
	"net/http/httptest"
	"strings"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"github.com/coder/websocket"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// qwpSfTestServerOpts shapes the fake QWP server's behavior across
// the various reconnect / failure scenarios.
type qwpSfTestServerOpts struct {
	// closeAfterFrames > 0 → drop the FIRST connection (connection
	// id 1) once it has received N frames. The handler gates this on
	// the first connection and a per-connection frame counter, so
	// later connections are unaffected and the reconnect can drain.
	// Used to exercise reconnect.
	closeAfterFrames int
	// rejectStatus, when non-zero, causes the server to respond
	// with an error ACK carrying the given status. Used to exercise
	// terminal-server-error.
	rejectStatus QwpStatusCode
	// upgradeStatus, when non-zero, causes the server to respond
	// with that HTTP status code on the WebSocket upgrade request,
	// rejecting the connection. Used to exercise auth-terminal.
	upgradeStatus int
	// silentDropAfterFrames > 0 → on EVERY connection, read N frames
	// then close the WebSocket without sending any ACK. Models a
	// server that accepts the upgrade but doesn't speak our wire
	// protocol (version/config mismatch). This is what
	// TestQwpSfSendLoopProtocolMismatchIsTerminal exercises.
	silentDropAfterFrames int
	// silentDropUntilConn, when > 0, scopes silentDropAfterFrames to
	// connections with myConnID < silentDropUntilConn; connections at
	// or beyond that id ACK normally. Models a *transient* ACK-less
	// drop (a server restart or LB RST in the first-frame→first-ACK
	// window) on the first connection(s), after which a healthy
	// server resumes ACKing — the case the never-ACKed terminal
	// heuristic must NOT mistake for an incompatible build.
	silentDropUntilConn int
	// silentAcks → read frames forever and never write any ACK
	// back. Connection stays alive so the send loop does not go
	// terminal; the producer's Close drain-wait is what surfaces
	// the missing ACKs. Used by close-drain-timeout tests.
	silentAcks bool
	// rejectFirstNFrames > 0, in combination with rejectStatus,
	// causes only the first N frames on the very first connection to
	// receive an error ACK; everything after gets OK. Used to test
	// DROP-and-continue semantics where the loop must keep draining
	// past the rejected span.
	rejectFirstNFrames int
	// rejectFromConn > 0, in combination with rejectStatus, causes
	// only connections with myConnID >= rejectFromConn to issue
	// rejection ACKs. Connections below that threshold ACK OK
	// normally. Used to model "server transient close → reconnect
	// succeeds → next batch hits a rejection".
	rejectFromConn int
	// recordFrames → capture every frame's payload bytes, keyed by
	// the connection that received it, into qwpSfTestServer. Lets a
	// test reconstruct exactly which rows reached the server on each
	// connection so it can assert gap-free, correctly-anchored replay
	// after a mid-flush drop. Off by default so the other suites pay
	// nothing for the bookkeeping.
	recordFrames bool
	// unsolicitedRejectAtConnect, when non-zero, makes the server
	// emit a single error ACK (sequence 0) immediately on connection
	// accept, BEFORE reading any frame from the client. Models a
	// server that rejects the connection (auth halt, server-side
	// circuit breaker, transient validation failure on
	// reconnect) right after the WS upgrade — exercises the
	// receiver's pre-send rejection guard.
	unsolicitedRejectAtConnect QwpStatusCode
	// forgedAckAtConnect, when non-nil, is written verbatim to the
	// client as a single WebSocket binary message immediately on
	// connect — before and without reading any frame — after which the
	// handler falls through to its normal read loop (which blocks,
	// since tests using this don't run the sender). Lets a test inject
	// an early / forged ACK whose sequence names a frame the client has
	// not finished sending, exercising the receiver's highestFullySent
	// clamp. Build it with buildAckOK / buildAckError.
	forgedAckAtConnect []byte
}

// qwpSfTestServer is a fake QWP server for send-loop tests. It
// counts received frames across all connections (so tests can
// observe replays after reconnect).
type qwpSfTestServer struct {
	*httptest.Server
	// totalFramesReceived counts every frame read on any connection;
	// connCount hands out the 1-based connection ids.
	totalFramesReceived atomic.Int64
	connCount           atomic.Int64
	// kill is closed by tests that want to actively tear down every
	// in-flight WS connection. httptest.Server.Close (and even
	// CloseClientConnections) do not force-close hijacked
	// connections, so handlers select on this channel to exit.
	kill chan struct{}
	// framesMu guards framesByConn. One handler goroutine runs per
	// connection; in the reconnect tests only one is live at a time,
	// but the lock keeps the recorder correct under the shared-handler
	// pattern regardless. Populated only when opts.recordFrames is set.
	framesMu     sync.Mutex
	framesByConn map[int64][]string
}

// recordedFrames returns a deep copy of the per-connection payload
// log, keyed by the 1-based connection id (s.connCount order). Only
// non-empty when the server was built with recordFrames:true.
+func (s *qwpSfTestServer) recordedFrames() map[int64][]string { + s.framesMu.Lock() + defer s.framesMu.Unlock() + out := make(map[int64][]string, len(s.framesByConn)) + for connID, payloads := range s.framesByConn { + cp := make([]string, len(payloads)) + copy(cp, payloads) + out[connID] = cp + } + return out +} + +func newQwpSfTestServer(t *testing.T, opts qwpSfTestServerOpts) *qwpSfTestServer { + t.Helper() + s := &qwpSfTestServer{kill: make(chan struct{})} + s.Server = httptest.NewServer(qwpSfTestServerHandler(t, s, opts)) + return s +} + +// newQwpSfTestServerOnListener builds a test server bound to the +// given pre-existing listener (rather than letting httptest pick a +// free port). Used by tests that need to reserve the port BEFORE +// creating the server — e.g. the async-initial-connect path where +// the producer must dial first and wait for the server to arrive on +// a known address. +// +// Takes ownership of the listener; the server's Close also closes +// the underlying listener. +func newQwpSfTestServerOnListener(t *testing.T, listener net.Listener) *qwpSfTestServer { + t.Helper() + s := &qwpSfTestServer{kill: make(chan struct{})} + s.Server = httptest.NewUnstartedServer(qwpSfTestServerHandler(t, s, qwpSfTestServerOpts{})) + _ = s.Server.Listener.Close() + s.Server.Listener = listener + s.Server.Start() + return s +} + +// qwpSfTestServerHandler returns the WebSocket handler used by the +// fake QWP test server, configured by `opts` and reporting stats on +// `s`. Extracted from newQwpSfTestServer so the same handler can be +// wired onto a pre-existing listener via newQwpSfTestServerOnListener. 
func qwpSfTestServerHandler(t *testing.T, s *qwpSfTestServer, opts qwpSfTestServerOpts) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// upgradeStatus short-circuits before any WebSocket handshake:
		// the client sees a plain HTTP error status.
		if opts.upgradeStatus != 0 {
			w.WriteHeader(opts.upgradeStatus)
			return
		}
		w.Header().Set(qwpHeaderVersion, "1")
		conn, err := websocket.Accept(w, r, nil)
		if err != nil {
			// NOTE(review): t.Logf runs on the server's handler
			// goroutine; the testing package panics if it is called
			// after the test has completed — confirm every test closes
			// the server before returning.
			t.Logf("websocket accept error: %v", err)
			return
		}
		defer conn.CloseNow()
		// killWatcher: if the test fires s.kill, drop this WS.
		// httptest.Server.Close/CloseClientConnections do not force-
		// close hijacked WebSocket conns, so we need our own signal.
		killCtx, cancelKill := context.WithCancel(context.Background())
		defer cancelKill()
		go func() {
			select {
			case <-s.kill:
				_ = conn.CloseNow()
			case <-killCtx.Done():
				// Handler returned first; nothing left to close.
			}
		}()
		// Connection ids are 1-based and shared across the server.
		// localSeq and localFramesReceived are per-connection: both
		// restart from zero on every new connection.
		myConnID := s.connCount.Add(1)
		var localSeq int64
		var localFramesReceived int
		if opts.unsolicitedRejectAtConnect != 0 {
			// Send a single rejection ACK with sequence 0 BEFORE the
			// client has had a chance to send anything. The receiver
			// must observe highestSent < 0 and route through the
			// pre-send rejection guard (no engineAcknowledge advance).
			_ = conn.Write(context.Background(), websocket.MessageBinary,
				buildAckError(opts.unsolicitedRejectAtConnect, 0, "pre-send-reject"))
		}
		if opts.forgedAckAtConnect != nil {
			// Inject a caller-built early / forged ACK before reading
			// any frame, then fall through to the read loop below (which
			// blocks until the client tears the connection down).
			_ = conn.Write(context.Background(), websocket.MessageBinary,
				opts.forgedAckAtConnect)
		}
		// Read loop: each frame is counted, optionally recorded, then
		// checked against the drop / silence / reject options in that
		// order; a frame that survives all of them gets an OK ACK.
		for {
			_, data, err := conn.Read(context.Background())
			if err != nil {
				return
			}
			s.totalFramesReceived.Add(1)
			localFramesReceived++
			if opts.recordFrames {
				// Record BEFORE the closeAfterFrames drop below: a
				// frame the server read but never ACKed (its ACK lost
				// to the drop) still "reached the server" — that is
				// exactly the persisted-but-unacked row the real
				// server dedups when replay re-sends it.
				s.framesMu.Lock()
				if s.framesByConn == nil {
					s.framesByConn = make(map[int64][]string)
				}
				s.framesByConn[myConnID] = append(s.framesByConn[myConnID], string(data))
				s.framesMu.Unlock()
			}
			// closeAfterFrames triggers ONLY on the first connection:
			// we accept N frames and then drop. Subsequent reconnects
			// behave normally so the loop can drain.
			if opts.closeAfterFrames > 0 &&
				myConnID == 1 &&
				localFramesReceived >= opts.closeAfterFrames {
				return
			}
			// silentDropAfterFrames applies to EVERY connection: read N
			// frames then close without ACKing. Models a server that
			// accepts the upgrade but doesn't understand our wire
			// protocol — reconnects would just hammer it. When
			// silentDropUntilConn is set the drop is scoped to the
			// first (silentDropUntilConn-1) connections, so later
			// reconnects ACK normally — a transient drop, not an
			// incompatible build.
			silentDropActive := opts.silentDropAfterFrames > 0
			if silentDropActive && opts.silentDropUntilConn > 0 {
				silentDropActive = myConnID < int64(opts.silentDropUntilConn)
			}
			if silentDropActive &&
				localFramesReceived >= opts.silentDropAfterFrames {
				return
			}
			// silentAcks: keep reading but never answer, and never
			// advance localSeq.
			if opts.silentAcks {
				continue
			}
			if opts.rejectStatus != 0 {
				// Default behavior with no gating: reject every frame.
				rejectThisFrame := true
				// rejectFirstNFrames gates rejection to the first N
				// frames of conn 1 (and silently passes on conn 2+).
				if opts.rejectFirstNFrames > 0 {
					if myConnID == 1 {
						rejectThisFrame = localFramesReceived <= opts.rejectFirstNFrames
					} else {
						rejectThisFrame = false
					}
				}
				// rejectFromConn additively re-enables rejection on
				// conn N+. Combined with rejectFirstNFrames, this models
				// "reject some on conn 1, reject all on conn ≥ N".
				if opts.rejectFromConn > 0 {
					if myConnID >= int64(opts.rejectFromConn) {
						rejectThisFrame = true
					} else if opts.rejectFirstNFrames == 0 {
						rejectThisFrame = false
					}
				}
				if rejectThisFrame {
					// A rejection consumes a sequence number just like
					// an OK ACK does.
					_ = conn.Write(context.Background(), websocket.MessageBinary,
						buildAckError(opts.rejectStatus, localSeq, "rejected"))
					localSeq++
					continue
				}
			}
			_ = conn.Write(context.Background(), websocket.MessageBinary,
				buildAckOK(localSeq))
			localSeq++
		}
	})
}

// qwpSfDialFor builds a transport connected to the given
// httptest server. Used as the qwpSfReconnectFactory for tests.
// The idx parameter is accepted for signature symmetry with
// multi-host factories and ignored — tests use a single host.
func qwpSfDialFor(server *qwpSfTestServer) qwpSfReconnectFactory {
	return func(ctx context.Context, _ int) (*qwpTransport, error) {
		var t qwpTransport
		// server.URL is read on every dial, not when the factory is
		// built; the http(s) scheme prefix becomes ws(s).
		wsURL := "ws" + strings.TrimPrefix(server.URL, "http")
		if err := t.connect(ctx, wsURL, qwpTransportOpts{endpointPath: qwpWritePath}); err != nil {
			return nil, err
		}
		return &t, nil
	}
}

// qwpSfDialAt builds a transport connected to a fixed httptest URL.
+func qwpSfDialAt(url string) qwpSfReconnectFactory { + return func(ctx context.Context, _ int) (*qwpTransport, error) { + var t qwpTransport + wsURL := "ws" + strings.TrimPrefix(url, "http") + if err := t.connect(ctx, wsURL, qwpTransportOpts{endpointPath: qwpWritePath}); err != nil { + return nil, err + } + return &t, nil + } +} + +func TestQwpSfSendLoopHappyPath(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer srv.Close() + + engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + defer func() { _ = engine.engineClose() }() + + transport, err := qwpSfDialFor(srv)(context.Background(), 0) + require.NoError(t, err) + + loop := qwpSfNewSendLoop(engine, transport, qwpSfDialFor(srv), + 100*time.Microsecond, time.Second, 10*time.Millisecond, 100*time.Millisecond) + loop.sendLoopStart() + defer func() { _ = loop.sendLoopClose() }() + + // Append 10 frames. + for i := 0; i < 10; i++ { + _, err := engine.engineAppendBlocking(context.Background(), []byte(fmt.Sprintf("frame-%d", i))) + require.NoError(t, err) + } + + // Wait until ackedFsn catches up. + require.Eventually(t, func() bool { + return engine.engineAckedFsn() >= 9 + }, 2*time.Second, 1*time.Millisecond, "loop did not drain") + assert.Equal(t, int64(10), srv.totalFramesReceived.Load()) + assert.Equal(t, int64(10), loop.sendLoopTotalFramesSent()) + assert.Equal(t, int64(10), loop.sendLoopTotalAcks()) + assert.Equal(t, int64(0), loop.sendLoopTotalReconnects()) + assert.NoError(t, loop.sendLoopCheckError()) +} + +// positionCursorAt walks frame headers on the unrecovered I/O +// goroutine. A corrupt-but-positive payloadLen must be rejected with +// an error (which both callers route through recordFatal) rather than +// overrunning offset and panicking the next slice index — that panic +// would crash the whole process and bypass the typed-error path. 
func TestQwpSfPositionCursorAtRejectsCorruptPayloadLen(t *testing.T) {
	// The loop is never started and never dials in this test, so the
	// factory only has to exist.
	unusedFactory := func(context.Context, int) (*qwpTransport, error) {
		return nil, errors.New("factory not used in this test")
	}

	// Build an engine with a few real frames so a segment exists with
	// baseSeq 0 and FSNs 0..2, then corrupt the first frame's
	// payloadLen field in place and walk past it.
	newCorruptLoop := func(t *testing.T, corruptBytes [4]byte) *qwpSfSendLoop {
		t.Helper()
		engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second)
		require.NoError(t, err)
		t.Cleanup(func() { _ = engine.engineClose() })

		for i := 0; i < 3; i++ {
			_, err := engine.engineAppendBlocking(context.Background(), []byte("payl"))
			require.NoError(t, err)
		}
		seg := engine.engineFindSegmentContaining(0)
		require.NotNil(t, seg)

		// payloadLen of the first frame lives at
		// [qwpSfHeaderSize+4 : qwpSfHeaderSize+8].
		addr := seg.address()
		plOff := qwpSfHeaderSize + 4
		copy(addr[plOff:plOff+4], corruptBytes[:])

		// nil transport: positionCursorAt is called directly below.
		return qwpSfNewSendLoop(engine, nil, unusedFactory,
			time.Millisecond, time.Second, time.Millisecond, time.Millisecond)
	}

	t.Run("corrupt-but-positive payloadLen", func(t *testing.T) {
		// 0x7FFFFFFF little-endian: positive int32, ~2 GiB stride.
		loop := newCorruptLoop(t, [4]byte{0xFF, 0xFF, 0xFF, 0x7F})
		// targetFsn=2 forces a multi-frame walk; pre-fix this panicked
		// on the second iteration's out-of-bounds header read.
		err := loop.positionCursorAt(2)
		require.Error(t, err)
		assert.Contains(t, err.Error(), "corrupt segment")
	})

	t.Run("negative payloadLen", func(t *testing.T) {
		// 0xFFFFFFFF little-endian: int32(-1).
		loop := newCorruptLoop(t, [4]byte{0xFF, 0xFF, 0xFF, 0xFF})
		err := loop.positionCursorAt(2)
		require.Error(t, err)
		assert.Contains(t, err.Error(), "corrupt segment")
	})

	t.Run("valid walk is not a false positive", func(t *testing.T) {
		// Same setup as newCorruptLoop minus the in-place corruption.
		engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second)
		require.NoError(t, err)
		t.Cleanup(func() { _ = engine.engineClose() })

		for i := 0; i < 3; i++ {
			_, err := engine.engineAppendBlocking(context.Background(), []byte("payl"))
			require.NoError(t, err)
		}
		loop := qwpSfNewSendLoop(engine, nil, unusedFactory,
			time.Millisecond, time.Second, time.Millisecond, time.Millisecond)

		require.NoError(t, loop.positionCursorAt(2))
		// Two 4-byte-payload frames walked: HEADER + 2*(8+4).
		assert.Equal(t, qwpSfHeaderSize+2*(qwpSfFrameHeaderSize+4), loop.sendOffset)
	})
}

// TestQwpSfPositionCursorAtReconnectRace is a regression guard for the
// reconnect-under-load row-loss bug (Java PR #40). On reconnect,
// swapClient pins fsnAtZero to targetFsn = ackedFsn+1 and resets
// nextWireSeq to 0, then calls positionCursorAt(targetFsn). For wireSeq=0
// to map back to targetFsn on the new connection, the cursor MUST land on
// the byte offset where targetFsn's frame begins.
//
// The producer runs concurrently with the I/O goroutine: positionCursorAt's
// first findSegmentContaining can miss targetFsn, and the buggy fallback
// then read the active segment's *post-publish* tip and parked one frame
// PAST targetFsn — silently dropping targetFsn and misnumbering every
// later frame by one, which the server trimmed on its next cumulative ACK
// while close() still reported clean delivery.
//
// We can't pin the exact interleaving, so we hammer positionCursorAt
// against a live producer and assert the invariant that must always hold
// post-fix: after positionCursorAt(targetFsn) the cursor sits exactly at
// targetFsn's frame offset, never past it.
// targetFsn is always at most one
// past publishedFsn (just like the reconnect anchor), so its offset is
// fixed whether the frame is already published, published mid-call, or not
// yet published. Pre-fix this trips whenever a publish lands inside the
// lookup→snapshot window; best run under -race.
func TestQwpSfPositionCursorAtReconnectRace(t *testing.T) {
	const (
		frames     = 4000
		payloadLen = 4
	)
	payload := []byte("payl") // payloadLen bytes
	stride := int64(qwpSfFrameHeaderSize + payloadLen)
	// One segment large enough to hold every frame, so baseSeq stays 0 and
	// no rotation perturbs the offset arithmetic.
	segSize := qwpSfHeaderSize + int64(frames)*stride + 1024

	engine, err := qwpSfNewCursorEngine("", segSize, qwpSfUnlimitedTotalBytes, time.Second)
	require.NoError(t, err)
	t.Cleanup(func() { _ = engine.engineClose() })

	// The loop is never started; positionCursorAt is driven directly
	// from the test goroutine, so the factory is never invoked.
	unusedFactory := func(context.Context, int) (*qwpTransport, error) {
		return nil, errors.New("factory not used in this test")
	}
	loop := qwpSfNewSendLoop(engine, nil, unusedFactory,
		time.Millisecond, time.Second, time.Millisecond, time.Millisecond)

	// Frame N begins at this offset; the segment never rotates so it is
	// stable for the whole run.
	expectedOffset := func(fsn int64) int64 { return qwpSfHeaderSize + fsn*stride }

	// Stop + drain the producer before the engine is torn down. t.Cleanup
	// runs LIFO, so this (registered after the engine-close cleanup above)
	// runs first: on a require failure the test goroutine unwinds via
	// Goexit, and this guarantees the producer is no longer appending when
	// engineClose runs — otherwise it would nil-deref on the closed segment
	// and mask the real assertion message with a panic.
	var prodErr atomic.Value // holds error
	stop := make(chan struct{})
	done := make(chan struct{})
	t.Cleanup(func() { close(stop); <-done })
	go func() {
		defer close(done)
		for i := 0; i < frames; i++ {
			// Non-blocking stop check before each append.
			select {
			case <-stop:
				return
			default:
			}
			if _, err := engine.engineAppendBlocking(context.Background(), payload); err != nil {
				prodErr.Store(err)
				return
			}
		}
	}()

positioning:
	for {
		// Keep positioning until the producer goroutine has exited.
		select {
		case <-done:
			break positioning
		default:
		}
		// At most one past what's published right now — either already
		// published, published during the call (the race window), or the
		// very next frame. This mirrors the reconnect anchor
		// targetFsn = ackedFsn+1.
		targetFsn := engine.enginePublishedFsn() + 1
		if targetFsn >= int64(frames) {
			// Every frame is already published; spin until done closes.
			continue
		}
		require.NoError(t, loop.positionCursorAt(targetFsn))
		require.Equalf(t, expectedOffset(targetFsn), loop.sendOffset,
			"positionCursorAt(%d) parked %d stride(s) past the frame — a reconnect here would drop it",
			targetFsn, (loop.sendOffset-expectedOffset(targetFsn))/stride)
	}
	if e := prodErr.Load(); e != nil {
		t.Fatalf("producer failed: %v", e.(error))
	}

	// Producer done: every frame is published. A deterministic position on
	// the last frame must land exactly on it, in the original baseSeq-0
	// segment (no rotation happened).
	require.NoError(t, loop.positionCursorAt(int64(frames-1)))
	require.Equal(t, expectedOffset(int64(frames-1)), loop.sendOffset)
	require.Equal(t, int64(0), loop.sendingSegment.segmentBaseSeq(),
		"single segment expected; a rotation would invalidate the offset math above")
}

// TestQwpSfSendLoopReconnectAfterServerClose runs the
// reconnect-after-drop scenario once per engine backing.
func TestQwpSfSendLoopReconnectAfterServerClose(t *testing.T) {
	// Run over both engine backings so disk-backed reconnect+replay —
	// otherwise exercised only by the jar-gated fuzz workflow — is
	// covered here too. "" selects a memory-backed engine; a TempDir
	// selects disk-backed segments under that slot directory.
	t.Run("memory", func(t *testing.T) { testQwpSfSendLoopReconnectAfterServerClose(t, "") })
	t.Run("disk", func(t *testing.T) { testQwpSfSendLoopReconnectAfterServerClose(t, t.TempDir()) })
}

// testQwpSfSendLoopReconnectAfterServerClose drives one warm-up frame
// plus a ten-frame burst through a server that drops its first
// connection after five frames, and asserts that every frame is
// eventually ACKed after at least one reconnect. sfDir is passed
// straight to qwpSfNewCursorEngine.
func testQwpSfSendLoopReconnectAfterServerClose(t *testing.T, sfDir string) {
	srv := newQwpSfTestServer(t, qwpSfTestServerOpts{closeAfterFrames: 5})
	defer srv.Close()

	engine, err := qwpSfNewCursorEngine(sfDir, 4096, qwpSfUnlimitedTotalBytes, time.Second)
	require.NoError(t, err)
	defer func() { _ = engine.engineClose() }()

	transport, err := qwpSfDialFor(srv)(context.Background(), 0)
	require.NoError(t, err)

	loop := qwpSfNewSendLoop(engine, transport, qwpSfDialFor(srv),
		100*time.Microsecond, time.Second, 10*time.Millisecond, 100*time.Millisecond)
	loop.sendLoopStart()
	defer func() { _ = loop.sendLoopClose() }()

	// Warm-up frame: process one ACK deterministically before the
	// burst so the run() silent-drop guard gates on lifetime
	// totalAcks > 0 and treats the upcoming mid-burst drop as
	// transient (reconnect) rather than as "server doesn't speak our
	// protocol" (terminal halt). Without this, when closeAfterFrames
	// fires, conn.CloseNow runs with the client's still-unread frames
	// in the server's TCP RX buffer — Linux turns that close into a
	// RST, which discards the 4 ACKs the server wrote before the
	// trigger from the OS receive buffer on the client side. The
	// receiver loop never sees them, lifetime totalAcks stays at 0,
	// and run() latches the wrong terminal classification.
	_, err = engine.engineAppendBlocking(context.Background(), []byte("warm-up"))
	require.NoError(t, err)
	require.Eventually(t, func() bool {
		return loop.sendLoopTotalAcks() >= 1
	}, time.Second, time.Millisecond, "warm-up frame should ACK before the burst")

	for i := 0; i < 10; i++ {
		_, err := engine.engineAppendBlocking(context.Background(), []byte(fmt.Sprintf("f-%d", i)))
		require.NoError(t, err)
	}
	// All 11 frames (warm-up + 10 burst) should eventually be ACKed
	// despite the server dropping conn 1 after reading 5 (warm-up +
	// first 4 burst). closeAfterFrames is gated on myConnID == 1 so
	// the reconnect lands on a fresh handler instance that ACKs
	// every frame cleanly; the remaining burst frames hit the server
	// on conn 2.
	require.Eventually(t, func() bool {
		return engine.engineAckedFsn() >= 10
	}, 5*time.Second, 1*time.Millisecond, "loop did not drain after reconnect")
	assert.GreaterOrEqual(t, loop.sendLoopTotalReconnects(), int64(1))
	// fsnAtZero should have advanced past 0 after the swap.
	assert.Greater(t, loop.sendLoopFsnAtZero(), int64(0))
}

// TestQwpSfSendLoopReplayIsGapFree pins the single most important
// correctness property of the cursor/SF architecture: after a
// mid-flush connection drop, the union of frames the server receives
// across all connections covers EVERY appended row with no gap, and
// the post-reconnect replay is FSN-contiguous, anchored exactly at
// the client's fsnAtZero (= engineAckedFsn()+1 at swap time).
//
// This is at-least-once on the wire by design — qwp-cursor-durability
// §"Stated assumptions": "Replay-after-reconnect produces
// duplicates", and the real server dedups by messageSequence; the
// recovery+dedup contract is explicitly out of this repo's scope. So
// the test deliberately *expects* duplicates and asserts none of the
// things server-side dedup handles.
It fails only on a replay GAP +// (permanent data loss) or a MISALIGNED anchor (the client stamping a +// messageSequence the server's dedup can't key on) — the two failure +// modes dedup cannot paper over, and the two that are this client's +// job to guarantee. +// +// Why the scenario has teeth: closeAfterFrames:5 over (warm-up + 10 +// burst) appends means the server reads warm-up + f-0..f-3 on conn 1 +// and never sees f-4..f-9 on conn 1 at all. The ONLY path by which +// f-4..f-9 ever reach the server is the post-reconnect replay, so a +// cursor-repositioning bug that skips any of them is permanent loss +// that neither the global frame counter nor an `ackedFsn >= n` +// liveness check can detect (both are driven off the same client- +// side FSN math the bug would have corrupted). The contiguity+anchor +// assertion additionally catches a skip of warm-up..f-3 (those +// frames the server DID see pre-drop, so the union alone would mask +// their loss). +func TestQwpSfSendLoopReplayIsGapFree(t *testing.T) { + // Run over both engine backings so disk-backed gap-free replay — + // otherwise exercised only by the jar-gated fuzz workflow — is + // covered here too. "" selects a memory-backed engine; a TempDir + // selects disk-backed segments under that slot directory. 
+ t.Run("memory", func(t *testing.T) { testQwpSfSendLoopReplayIsGapFree(t, "") }) + t.Run("disk", func(t *testing.T) { testQwpSfSendLoopReplayIsGapFree(t, t.TempDir()) }) +} + +func testQwpSfSendLoopReplayIsGapFree(t *testing.T, sfDir string) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{ + closeAfterFrames: 5, + recordFrames: true, + }) + defer srv.Close() + + engine, err := qwpSfNewCursorEngine(sfDir, 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + defer func() { _ = engine.engineClose() }() + + transport, err := qwpSfDialFor(srv)(context.Background(), 0) + require.NoError(t, err) + + loop := qwpSfNewSendLoop(engine, transport, qwpSfDialFor(srv), + 100*time.Microsecond, time.Second, 10*time.Millisecond, 100*time.Millisecond) + loop.sendLoopStart() + defer func() { _ = loop.sendLoopClose() }() + + // Warm-up frame: process one ACK deterministically before the + // burst so the run() silent-drop guard gates on lifetime + // totalAcks > 0 and treats the upcoming mid-burst drop as + // transient. See the equivalent block in + // TestQwpSfSendLoopReconnectAfterServerClose for the + // RST-loses-in-flight-ACKs race that this dodges. + _, err = engine.engineAppendBlocking(context.Background(), []byte("warm-up")) + require.NoError(t, err) + require.Eventually(t, func() bool { + return loop.sendLoopTotalAcks() >= 1 + }, time.Second, time.Millisecond, "warm-up frame should ACK before the burst") + + const n = 10 + for i := 0; i < n; i++ { + _, err := engine.engineAppendBlocking( + context.Background(), []byte(fmt.Sprintf("f-%d", i))) + require.NoError(t, err) + } + // FSNs: warm-up=0, f-0..f-9 = 1..10. All-acked = ackedFsn >= n. 
+ require.Eventually(t, func() bool { + return engine.engineAckedFsn() >= int64(n) + }, 5*time.Second, 1*time.Millisecond, + "loop did not drain every frame after reconnect") + require.GreaterOrEqual(t, loop.sendLoopTotalReconnects(), int64(1), + "the mid-flush drop must have forced at least one reconnect") + + frames := srv.recordedFrames() + require.Len(t, frames, 2, + "expected exactly two connections (one drop -> one reconnect)") + conn1, conn2 := frames[1], frames[2] + + // conn 1: the server reads exactly the warm-up + first four + // burst frames (5 total — the closeAfterFrames trigger), in + // order, then drops. This is independent of how many ACKs it + // managed to write before dropping, so this part is race-free. + require.Equal(t, []string{"warm-up", "f-0", "f-1", "f-2", "f-3"}, conn1, + "conn 1 must receive warm-up + first 4 burst frames before the drop") + + // conn 2: the replayed run. Its start depends on how many of + // conn 1's ACKs the receiver had processed before the drop + // surfaced — a benign race: fsnAtZero = engineAckedFsn()+1 at + // swap time, somewhere in [1,4] (warm-up's ACK was waited-on so + // fsnAtZero is at least 1; at most warm-up + f-0..f-2 were ACKed + // before the close, so fsnAtZero is at most 4). Whatever that + // anchor is, the replay MUST begin exactly there, be strictly + // contiguous (no gap, no reorder), and run through the final + // frame. fsnAtZero and the replayed bytes derive from the same + // ackedFsn snapshot, so this assertion is race-robust and is + // precisely the wire<->messageSequence alignment server-side + // dedup keys on. + require.NotEmpty(t, conn2, "reconnect must have replayed frames") + fsnAtZero := loop.sendLoopFsnAtZero() + require.GreaterOrEqual(t, fsnAtZero, int64(1)) + require.LessOrEqual(t, fsnAtZero, int64(4)) + for i, got := range conn2 { + // FSN fsnAtZero+i maps to f-(fsnAtZero-1+i): warm-up holds + // FSN 0, the burst occupies FSN 1..n. 
+ want := fmt.Sprintf("f-%d", fsnAtZero-1+int64(i)) + require.Equalf(t, want, got, + "replayed frame %d not contiguous from the fsnAtZero anchor "+ + "(gap, reorder, or misaligned messageSequence)", i) + } + require.Equalf(t, fmt.Sprintf("f-%d", n-1), conn2[len(conn2)-1], + "replay must run through the final frame f-%d", n-1) + + // THE data-loss guard: every appended burst row reached the + // server at least once across the two connections. f-4..f-9 were + // never seen on conn 1, so only a correct replay puts them in + // this set. + seen := make(map[string]bool, n+1) + for _, payloads := range frames { + for _, p := range payloads { + seen[p] = true + } + } + for i := 0; i < n; i++ { + require.Truef(t, seen[fmt.Sprintf("f-%d", i)], + "row f-%d never reached the server — gap-free replay violated", i) + } + + // Duplicates are expected and correct (at-least-once + server + // dedup). Assert at least one actually occurred so a future change + // that silently stopped replaying can't pass this test trivially. + // Total appended = warm-up + n burst = n+1; anything past that is + // a replayed duplicate. + require.Greaterf(t, srv.totalFramesReceived.Load(), int64(n+1), + "replay must re-send >=1 already-received frame (the dup the "+ + "server dedups); got only %d total for %d rows", + srv.totalFramesReceived.Load(), n+1) +} + +func TestQwpSfSendLoopServerErrorIsTerminal(t *testing.T) { + // Use ParseError, which the spec defaults to Halt — SchemaMismatch + // is Drop and would no longer be terminal under the new policy + // resolver. 
+ srv := newQwpSfTestServer(t, qwpSfTestServerOpts{rejectStatus: QwpStatusParseError}) + defer srv.Close() + + engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + defer func() { _ = engine.engineClose() }() + + transport, err := qwpSfDialFor(srv)(context.Background(), 0) + require.NoError(t, err) + + loop := qwpSfNewSendLoop(engine, transport, qwpSfDialFor(srv), + 100*time.Microsecond, time.Second, 10*time.Millisecond, 100*time.Millisecond) + loop.sendLoopStart() + defer func() { _ = loop.sendLoopClose() }() + + _, err = engine.engineAppendBlocking(context.Background(), []byte("bad")) + require.NoError(t, err) + + // Loop must record a terminal error rather than entering reconnect. + require.Eventually(t, func() bool { + return loop.sendLoopCheckError() != nil + }, 2*time.Second, 1*time.Millisecond) + gotErr := loop.sendLoopCheckError() + require.Error(t, gotErr) + var senderErr *SenderError + assert.True(t, errors.As(gotErr, &senderErr) || strings.Contains(gotErr.Error(), "rejected")) + // reconnects should be 0 — terminal status doesn't trigger + // reconnect (server isn't going to change its mind on retry). + assert.Equal(t, int64(0), loop.sendLoopTotalReconnects()) +} + +// TestQwpSfSendLoopPreSendHaltRejectionDoesNotFabricateFsn verifies +// that a HALT-category rejection ACK arriving BEFORE any frame has +// been sent on the current connection (highestSent < 0, e.g. right +// after a fresh swapClient) surfaces the typed SenderError but does +// NOT attribute it to a fabricated fsnAtZero. The reported span must +// be the unacked [ackedFsn+1, publishedFsn] window — the same span +// the protocol-violation close path uses — not the +// fsnAtZero+cappedSeq(=0) value the old code emitted. Mirrors the +// Java client's handlePreSendRejection guard. +func TestQwpSfSendLoopPreSendHaltRejectionDoesNotFabricateFsn(t *testing.T) { + // ParseError is HALT by default. 
The server fires the rejection + // immediately on connect, before we publish anything into the + // engine, so the receiver sees highestSent < 0. + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{ + unsolicitedRejectAtConnect: QwpStatusParseError, + }) + defer srv.Close() + + engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + defer func() { _ = engine.engineClose() }() + + transport, err := qwpSfDialFor(srv)(context.Background(), 0) + require.NoError(t, err) + + loop := qwpSfNewSendLoop(engine, transport, qwpSfDialFor(srv), + 100*time.Microsecond, time.Second, 10*time.Millisecond, 100*time.Millisecond) + loop.sendLoopStart() + defer func() { _ = loop.sendLoopClose() }() + + require.Eventually(t, func() bool { + return loop.sendLoopCheckError() != nil + }, 2*time.Second, 1*time.Millisecond) + + gotErr := loop.sendLoopCheckError() + require.Error(t, gotErr) + var senderErr *SenderError + require.True(t, errors.As(gotErr, &senderErr), + "expected typed *SenderError, got %T: %v", gotErr, gotErr) + assert.Equal(t, CategoryParseError, senderErr.Category) + assert.Equal(t, PolicyHalt, senderErr.AppliedPolicy) + // Engine is empty: ackedFsn=-1, publishedFsn=-1 → + // FromFsn = 0, ToFsn = max(0, -1) = 0. + assert.Equal(t, int64(0), senderErr.FromFsn) + assert.Equal(t, int64(0), senderErr.ToFsn) + // The fabricated DROP would have advanced the engine watermark to + // fsn 0. Verify it did NOT. 
+ assert.Equal(t, int64(-1), engine.engineAckedFsn(), + "pre-send rejection must not advance the engine's acked watermark") + assert.Equal(t, int64(1), loop.sendLoopTotalServerErrors()) + assert.Equal(t, int64(0), loop.sendLoopTotalReconnects(), + "HALT must not trigger reconnect") +} + +// TestQwpSfSendLoopPreSendDropRejectionDoesNotAdvanceWatermark +// verifies that a DROP_AND_CONTINUE rejection arriving before any +// frame has been sent on the current connection is dispatched but +// does NOT call engineAcknowledge — the old code would have advanced +// ackedFsn past the next-unsent batch (fsnAtZero == ackedFsn+1 right +// after a swap), which would let the segment manager trim sealed +// segments the I/O thread is about to replay. +func TestQwpSfSendLoopPreSendDropRejectionDoesNotAdvanceWatermark(t *testing.T) { + // SchemaMismatch is DROP_AND_CONTINUE by default — this is the + // dangerous case where the old code's fabricated + // engineAcknowledge(fsnAtZero) silently advanced the watermark. + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{ + unsolicitedRejectAtConnect: QwpStatusSchemaMismatch, + }) + defer srv.Close() + + engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + defer func() { _ = engine.engineClose() }() + + transport, err := qwpSfDialFor(srv)(context.Background(), 0) + require.NoError(t, err) + + loop := qwpSfNewSendLoop(engine, transport, qwpSfDialFor(srv), + 100*time.Microsecond, time.Second, 10*time.Millisecond, 100*time.Millisecond) + loop.sendLoopStart() + defer func() { _ = loop.sendLoopClose() }() + + // Wait for the receiver to process the unsolicited rejection. + require.Eventually(t, func() bool { + return loop.sendLoopTotalServerErrors() >= 1 + }, 2*time.Second, 1*time.Millisecond) + + // DROP must not latch — loop stays running, no terminal error. 
+ assert.NoError(t, loop.sendLoopCheckError(), + "DROP policy must not latch a terminal error") + // Critical: the engine watermark must be unchanged. The old code + // would have called engineAcknowledge(fsnAtZero) = engineAcknowledge(0), + // advancing ackedFsn from -1 to 0. + assert.Equal(t, int64(-1), engine.engineAckedFsn(), + "pre-send DROP rejection must not advance the engine's acked watermark") + // And no spurious totalAcks bump either — the old code added one. + assert.Equal(t, int64(0), loop.sendLoopTotalAcks(), + "pre-send DROP rejection must not bump totalAcks") +} + +// TestQwpSfSendLoopSilentDropAfterFrameIsTerminal verifies that when +// the server accepts the WS upgrade but silently disconnects after a +// frame (without sending any ACK) on EVERY connection, the send loop +// classifies it as a server version/config mismatch and fails fast +// instead of entering a hot reconnect loop. Without this guard, every +// dial succeeds and the receiver reset its backoff on each attempt — +// burning thousands of ephemeral ports per second until +// reconnectMaxDuration (5 minutes default) expired. +// +// The guard fires only after qwpSfMaxSilentConnStrikes consecutive +// ACK-less connections — at least one full reconnect+replay cycle +// that still met silence — so this server, which drops on every +// connection, trips it. A single such drop reconnects instead; see +// TestQwpSfSendLoopSilentDropOnFirstConnReconnects. 
+func TestQwpSfSendLoopSilentDropAfterFrameIsTerminal(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{silentDropAfterFrames: 1}) + defer srv.Close() + + engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + defer func() { _ = engine.engineClose() }() + + transport, err := qwpSfDialFor(srv)(context.Background(), 0) + require.NoError(t, err) + + loop := qwpSfNewSendLoop(engine, transport, qwpSfDialFor(srv), + 100*time.Microsecond, 5*time.Second, 10*time.Millisecond, 100*time.Millisecond) + loop.sendLoopStart() + defer func() { _ = loop.sendLoopClose() }() + + _, err = engine.engineAppendBlocking(context.Background(), []byte("frame")) + require.NoError(t, err) + + require.Eventually(t, func() bool { + return loop.sendLoopCheckError() != nil + }, 2*time.Second, 1*time.Millisecond, "loop should have failed fast") + + gotErr := loop.sendLoopCheckError() + require.Error(t, gotErr) + assert.Contains(t, gotErr.Error(), "without ACKing", + "error should explain the no-ACK detection") + + // The whole point: we must NOT hammer the server with thousands + // of reconnects. With qwpSfMaxSilentConnStrikes == 2 the loop + // gives up after exactly one reconnect+replay cycle that still + // met silence — i.e. one reconnect and two connections. + assert.LessOrEqual(t, loop.sendLoopTotalReconnects(), int64(1), + "expected at most one reconnect before terminal classification") + assert.LessOrEqual(t, srv.connCount.Load(), int64(2), + "server should have seen at most 2 connections") +} + +// TestQwpSfSendLoopSilentDropOnFirstConnReconnects verifies that a +// single ACK-less disconnect on the *first* connection — the +// signature of a routine server restart or LB RST landing in the +// window between a fresh sender's first frame and its first ACK — +// reconnects, replays the unacked frame, and recovers once the server +// ACKs. A repeated ACK-less pattern (>= qwpSfMaxSilentConnStrikes +// connections, i.e. 
at least one full reconnect+replay cycle that +// still met silence) is what trips the terminal classification; that +// case is TestQwpSfSendLoopSilentDropAfterFrameIsTerminal. +func TestQwpSfSendLoopSilentDropOnFirstConnReconnects(t *testing.T) { + // Conn 1 reads one frame then closes without ACKing (the + // transient restart/RST); conn 2+ ACK normally. + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{ + silentDropAfterFrames: 1, + silentDropUntilConn: 2, + }) + defer srv.Close() + + engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + defer func() { _ = engine.engineClose() }() + + transport, err := qwpSfDialFor(srv)(context.Background(), 0) + require.NoError(t, err) + + loop := qwpSfNewSendLoop(engine, transport, qwpSfDialFor(srv), + 100*time.Microsecond, 5*time.Second, 1*time.Millisecond, 10*time.Millisecond) + loop.sendLoopStart() + defer func() { _ = loop.sendLoopClose() }() + + // One frame: conn 1 reads it and silently drops; the loop must + // reconnect to conn 2, replay it, and get the ACK. + _, err = engine.engineAppendBlocking(context.Background(), []byte("frame")) + require.NoError(t, err) + + require.Eventually(t, func() bool { + return loop.sendLoopTotalAcks() >= 1 + }, 2*time.Second, 1*time.Millisecond, + "replayed frame should be ACK'd after reconnect to a healthy conn") + + // Crucially: the first ACK-less drop must NOT have latched a + // terminal incompatible-build SenderError. + if gotErr := loop.sendLoopCheckError(); gotErr != nil { + t.Fatalf("loop went terminal on a routine first-connection drop: %v", gotErr) + } + assert.Nil(t, loop.sendLoopLastTerminalServerError(), + "a single ACK-less first-connection drop must not be terminal") + // And we recovered via exactly the reconnect+replay path. 
+ assert.GreaterOrEqual(t, loop.sendLoopTotalReconnects(), int64(1), + "loop should have reconnected past the transient drop") + assert.GreaterOrEqual(t, loop.sendLoopTotalFramesReplayed(), int64(1), + "the unacked frame should have been replayed on the new connection") +} + +// TestQwpSfSendLoopSilentDropAfterPriorAckReconnects pins the +// regression for the silent-drop guard's false-positive failure +// mode: once any ACK has been observed across this sender's +// lifetime, a subsequent silent disconnect is a transient outage +// (LB drain emitting WS 1001 GoingAway, TCP RST surfacing as 1006, +// proxy reset, 1011/1012/1013 service restarts — none of which are +// flagged terminal by qwpSfIsTerminalCloseCode), not an +// incompatible-build mismatch. The loop must keep reconnecting +// rather than latch a terminal SenderError. +func TestQwpSfSendLoopSilentDropAfterPriorAckReconnects(t *testing.T) { + goodSrv := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer goodSrv.Close() + // silentSrv stands in for the LB / proxy that accepts the WS + // upgrade but drops every frame without ACKing — what the old + // per-connection heuristic mistook for "incompatible build". + silentSrv := newQwpSfTestServer(t, qwpSfTestServerOpts{silentDropAfterFrames: 1}) + defer silentSrv.Close() + + engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + defer func() { _ = engine.engineClose() }() + + transport, err := qwpSfDialFor(goodSrv)(context.Background(), 0) + require.NoError(t, err) + + // Reconnect factory points at silentSrv: after goodSrv goes + // away, every reconnect lands on the silent-drop server. + loop := qwpSfNewSendLoop(engine, transport, qwpSfDialAt(silentSrv.URL), + 100*time.Microsecond, 30*time.Second, 1*time.Millisecond, 5*time.Millisecond) + loop.sendLoopStart() + defer func() { _ = loop.sendLoopClose() }() + + // Frame 0: goodSrv ACKs. 
After this, totalAcks >= 1 and the + // silent-drop guard's "never any ACK" precondition is gone. + _, err = engine.engineAppendBlocking(context.Background(), []byte("warm-up")) + require.NoError(t, err) + require.Eventually(t, func() bool { + return loop.sendLoopTotalAcks() >= 1 + }, time.Second, time.Millisecond, "warm-up frame should have been ACK'd by goodSrv") + + // Tear down goodSrv to force the loop into reconnect against silentSrv. + close(goodSrv.kill) + + // Enqueue a frame that silentSrv will read and silently drop, + // driving the silent-drop guard's reconnect cycle. Without + // further work the loop would just park on a quiet silentSrv + // connection forever and we'd observe no reconnects either way. + _, err = engine.engineAppendBlocking(context.Background(), []byte("post-kill")) + require.NoError(t, err) + + // Wait until the loop has accumulated several silent-drop + // reconnect cycles against silentSrv. Under the old heuristic + // the very first cycle would have latched a terminal + // "incompatible build" SenderError, capping connCount at 1. + require.Eventually(t, func() bool { + return silentSrv.connCount.Load() >= 3 + }, 2*time.Second, 1*time.Millisecond, + "loop should have reconnected to silentSrv multiple times") + + // The whole point: no terminal classification. 
+ if gotErr := loop.sendLoopCheckError(); gotErr != nil { + t.Fatalf("loop unexpectedly went terminal after prior-ACK silent drop: %v", gotErr) + } + assert.Nil(t, loop.sendLoopLastTerminalServerError(), + "no terminal SenderError should be latched once totalAcks > 0") +} + +func TestQwpSfSendLoopUpgradeAuthFailureIsTerminal(t *testing.T) { + // First server ACKs at least one frame (so the post-disconnect + // classification is "had a real conversation, try to reconnect" + // rather than the no-ACK protocol-mismatch terminal path); then + // the WS conn is killed and the reconnect factory points at a + // *different* server that rejects the upgrade with 401, which is + // what this test actually exercises. + authSrv := newQwpSfTestServer(t, qwpSfTestServerOpts{upgradeStatus: 401}) + defer authSrv.Close() + dataSrv := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer dataSrv.Close() + + engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + defer func() { _ = engine.engineClose() }() + + transport, err := qwpSfDialFor(dataSrv)(context.Background(), 0) + require.NoError(t, err) + + // Reconnect factory dials the auth-rejecting server. + loop := qwpSfNewSendLoop(engine, transport, qwpSfDialAt(authSrv.URL), + 100*time.Microsecond, time.Second, 10*time.Millisecond, 100*time.Millisecond) + loop.sendLoopStart() + defer func() { _ = loop.sendLoopClose() }() + + _, err = engine.engineAppendBlocking(context.Background(), []byte("hi")) + require.NoError(t, err) + require.Eventually(t, func() bool { + return loop.sendLoopTotalAcks() >= 1 + }, time.Second, time.Millisecond, "expected the warm-up frame to be ACKed by dataSrv") + + // Tear down the live WS so the loop falls into reconnect, where + // it'll hit authSrv and surface the 401. 
+ close(dataSrv.kill) + + require.Eventually(t, func() bool { + return loop.sendLoopCheckError() != nil + }, 2*time.Second, 1*time.Millisecond) + gotErr := loop.sendLoopCheckError() + require.Error(t, gotErr) + // Phase 4 routes 401 → SECURITY_ERROR / Halt SenderError. + var senderErr *SenderError + require.True(t, errors.As(gotErr, &senderErr), + "expected *SenderError, got %T: %v", gotErr, gotErr) + assert.Equal(t, CategorySecurityError, senderErr.Category) + assert.Equal(t, PolicyHalt, senderErr.AppliedPolicy) + assert.Contains(t, senderErr.ServerMessage, "401") +} + +func TestQwpSfSendLoopReconnectBudgetExhausted(t *testing.T) { + // Healthy server first — get a successful ACK on the live + // connection so the disconnect, when it comes, is NOT classified + // as "no ACKs ever, must be a protocol mismatch" by run(). Then + // take the server down so reconnects fail with connection-refused + // and the per-outage budget actually gets exercised. + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + + engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + defer func() { _ = engine.engineClose() }() + + transport, err := qwpSfDialFor(srv)(context.Background(), 0) + require.NoError(t, err) + + loop := qwpSfNewSendLoop(engine, transport, qwpSfDialFor(srv), + 100*time.Microsecond, 200*time.Millisecond /* short cap */, 10*time.Millisecond, 50*time.Millisecond) + loop.sendLoopStart() + defer func() { _ = loop.sendLoopClose() }() + + _, err = engine.engineAppendBlocking(context.Background(), []byte("warm-up")) + require.NoError(t, err) + require.Eventually(t, func() bool { + return loop.sendLoopTotalAcks() >= 1 + }, time.Second, time.Millisecond, "expected the warm-up frame to be ACKed") + + // Tear the live WS conn (kill channel) AND shut down the + // listener (Close) so reconnect attempts fail with connection- + // refused. 
CloseClientConnections / Close do not force-close + // hijacked WS conns, so the kill channel is required. + close(srv.kill) + srv.Close() + + require.Eventually(t, func() bool { + return loop.sendLoopCheckError() != nil + }, 5*time.Second, 10*time.Millisecond) + gotErr := loop.sendLoopCheckError() + require.Error(t, gotErr) + assert.Contains(t, gotErr.Error(), "reconnect failed") + // The loop must have made at least one reconnect attempt before + // giving up; only that lower bound is asserted here. + assert.GreaterOrEqual(t, loop.sendLoopTotalReconnectAttempts(), int64(1)) +} + +// TestQwpSfSendLoopNilFactoryIsTerminalOnFailure verifies that a send +// loop built with a nil reconnect factory latches an error after a +// frame is sent to a server configured with closeAfterFrames: 1, and +// that it records zero reconnect attempts. +func TestQwpSfSendLoopNilFactoryIsTerminalOnFailure(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{closeAfterFrames: 1}) + defer srv.Close() + + engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + defer func() { _ = engine.engineClose() }() + + transport, err := qwpSfDialFor(srv)(context.Background(), 0) + require.NoError(t, err) + + // Nil factory → wire failure is immediately terminal. + loop := qwpSfNewSendLoop(engine, transport, nil, + 100*time.Microsecond, time.Second, 10*time.Millisecond, 100*time.Millisecond) + loop.sendLoopStart() + defer func() { _ = loop.sendLoopClose() }() + + _, err = engine.engineAppendBlocking(context.Background(), []byte("data")) + require.NoError(t, err) + + require.Eventually(t, func() bool { + return loop.sendLoopCheckError() != nil + }, 2*time.Second, 1*time.Millisecond) + // No factory means no reconnect attempt may ever be counted. + assert.Equal(t, int64(0), loop.sendLoopTotalReconnectAttempts()) +} + +// TestQwpSfSendLoopReconnectStatusSnapshot (spec §16) verifies that the +// reconnect-status snapshot the loop exposes is non-empty while +// connectWithBackoff is iterating, so engineAppendBlocking can +// produce the diagnostic-rich +// "reconnecting: attempts=N, outage-elapsed=…" error. +func TestQwpSfSendLoopReconnectStatusSnapshot(t *testing.T) { + // Pre-state: never reconnecting.
+ srv := newQwpSfTestServer(t, qwpSfTestServerOpts{}) + defer srv.Close() + + engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + defer func() { _ = engine.engineClose() }() + + // Factory that dials srv normally until dialFails is set and + // fails every attempt after that, so the loop stays inside + // connectWithBackoff for the duration of the outage budget. The + // initial transport is dialed through the same factory while the + // flag is still false, so the loop runs once, observes the + // close, and enters reconnect — which is the state we want to + // sample. + dialFails := atomic.Bool{} + factory := func(ctx context.Context, idx int) (*qwpTransport, error) { + if dialFails.Load() { + return nil, errors.New("dial: connection refused") + } + return qwpSfDialFor(srv)(ctx, idx) + } + + transport, err := factory(context.Background(), 0) + require.NoError(t, err) + + loop := qwpSfNewSendLoop(engine, transport, factory, + 100*time.Microsecond, 2*time.Second /* outage budget */, 10*time.Millisecond, 30*time.Millisecond) + loop.sendLoopStart() + defer func() { _ = loop.sendLoopClose() }() + + // Pre-reconnect snapshot: not reconnecting, zero attempts. + reconnecting, attempts, _ := loop.sendLoopReconnectStatus() + assert.False(t, reconnecting) + assert.Equal(t, int64(0), attempts) + + // Wait for one ACK on the live connection before forcing the + // outage. + _, err = engine.engineAppendBlocking(context.Background(), []byte("warm-up")) + require.NoError(t, err) + require.Eventually(t, func() bool { + return loop.sendLoopTotalAcks() >= 1 + }, time.Second, time.Millisecond) + + // Now flip the factory to fail and tear the live conn so the + // loop is forced into connectWithBackoff with a short backoff + // cap (30ms) — gives us many attempts inside the 2s budget.
+ dialFails.Store(true) + close(srv.kill) + + require.Eventually(t, func() bool { + r, a, start := loop.sendLoopReconnectStatus() + return r && a >= 1 && !start.IsZero() + }, 1500*time.Millisecond, 5*time.Millisecond, + "expected loop to enter reconnect with attempts ≥ 1 and a non-zero outage start") + + r, a, start := loop.sendLoopReconnectStatus() + require.True(t, r) + assert.GreaterOrEqual(t, a, int64(1)) + assert.WithinDuration(t, time.Now(), start, 2*time.Second) +} + +func TestQwpSfConnectWithRetrySucceedsEventually(t *testing.T) { + // The factory fails its first two attempts with a synthetic + // connection-refused error, then starts a real server and dials + // it from the third attempt on. + var srv *qwpSfTestServer + var startedSrv atomic.Bool + var mu sync.Mutex + factoryAttempts := 0 + factory := func(ctx context.Context, idx int) (*qwpTransport, error) { + mu.Lock() + factoryAttempts++ + myAttempt := factoryAttempts + mu.Unlock() + if myAttempt < 3 { + // Simulated connection-refused: no server exists yet. + return nil, errors.New("dial: connection refused") + } + if startedSrv.CompareAndSwap(false, true) { + srv = newQwpSfTestServer(t, qwpSfTestServerOpts{}) + t.Cleanup(srv.Close) + } + return qwpSfDialFor(srv)(ctx, idx) + } + transport, _, err := qwpSfConnectWithRetry(context.Background(), factory, nil, + 2*time.Second, 5*time.Millisecond, 50*time.Millisecond) + require.NoError(t, err) + require.NotNil(t, transport) + _ = transport.close() + mu.Lock() + defer mu.Unlock() + assert.GreaterOrEqual(t, factoryAttempts, 3) +} + +// TestQwpSfConnectWithRetryTerminalUpgrade verifies that +// qwpSfConnectWithRetry returns an error mentioning +// "WebSocket upgrade failed" when the test server is configured with +// upgradeStatus: 401. +func TestQwpSfConnectWithRetryTerminalUpgrade(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{upgradeStatus: 401}) + defer srv.Close() + + _, _, err := qwpSfConnectWithRetry(context.Background(), qwpSfDialFor(srv), nil, + 200*time.Millisecond, 5*time.Millisecond, 50*time.Millisecond) + require.Error(t, err) + assert.Contains(t, err.Error(), "WebSocket upgrade failed") +} + +// TestQwpSfConnectWithRetryBudgetExhausted verifies that +// qwpSfConnectWithRetry returns an error mentioning "connect failed" +// when the factory fails on every call. +func TestQwpSfConnectWithRetryBudgetExhausted(t *testing.T) { + factory := func(ctx context.Context, _ int) (*qwpTransport,
error) { + return nil, errors.New("dial tcp: connection refused") + } + _, _, err := qwpSfConnectWithRetry(context.Background(), factory, nil, + 100*time.Millisecond, 5*time.Millisecond, 30*time.Millisecond) + require.Error(t, err) + assert.Contains(t, err.Error(), "connect failed") +} + +func TestQwpSfIsTerminalUpgradeError(t *testing.T) { + cases := []struct { + err error + want bool + label string + }{ + {errors.New("got 401 unauthorized"), true, "401"}, + {errors.New("got 403 forbidden"), true, "403"}, + {errors.New("got 426 upgrade required"), true, "426"}, + {errors.New("dial tcp: connection refused"), false, "transient"}, + {errors.New("websocket: bad handshake"), false, "transient"}, + {nil, false, "nil"}, + } + for _, c := range cases { + t.Run(c.label, func(t *testing.T) { + assert.Equal(t, c.want, qwpSfIsTerminalUpgradeError(c.err)) + }) + } +} + +// TestQwpSfRecordFatalServerErrorPopulatesBothFields asserts that +// recordFatalServerError sets both lastError and lastTerminalServerError, +// so producer-side errors.As unwrap and the typed accessor return the +// same payload. +func TestQwpSfRecordFatalServerErrorPopulatesBothFields(t *testing.T) { + l := &qwpSfSendLoop{} + se := &SenderError{ + Category: CategoryParseError, + AppliedPolicy: PolicyHalt, + ServerStatusByte: int(QwpStatusParseError), + ServerMessage: "bad column", + MessageSequence: 9, + FromFsn: 17, + ToFsn: 17, + DetectedAt: time.Now(), + } + l.recordFatalServerError(se) + + require.Equal(t, se, l.sendLoopLastTerminalServerError()) + + gotErr := l.sendLoopCheckError() + require.Error(t, gotErr) + var unwrapped *SenderError + require.True(t, errors.As(gotErr, &unwrapped)) + require.Equal(t, se, unwrapped) +} + +// TestQwpSfRecordFatalServerErrorIdempotent asserts that a second +// recordFatalServerError call does not overwrite the first — only the +// first failure wins, matching recordFatal's CAS semantics. 
+func TestQwpSfRecordFatalServerErrorIdempotent(t *testing.T) { + l := &qwpSfSendLoop{} + first := &SenderError{Category: CategoryWriteError, AppliedPolicy: PolicyHalt} + second := &SenderError{Category: CategorySchemaMismatch, AppliedPolicy: PolicyHalt} + l.recordFatalServerError(first) + l.recordFatalServerError(second) + require.Equal(t, first, l.sendLoopLastTerminalServerError()) +} + +// TestQwpSfRecordFatalServerErrorNilSafe asserts that passing nil is +// a no-op rather than a panic. +func TestQwpSfRecordFatalServerErrorNilSafe(t *testing.T) { + l := &qwpSfSendLoop{} + l.recordFatalServerError(nil) + require.Nil(t, l.sendLoopLastTerminalServerError()) + require.Nil(t, l.sendLoopCheckError()) +} + +// TestQwpSfSendLoopDropAndContinue verifies that a Drop-category +// rejection (SchemaMismatch) advances ackedFsn past the rejected +// frame instead of latching as terminal. The dispatcher receives the +// notification; sendLoopCheckError returns nil; subsequent frames +// continue draining. +func TestQwpSfSendLoopDropAndContinue(t *testing.T) { + // Run over both engine backings so disk-backed DROP-and-advance — + // otherwise exercised only by the jar-gated fuzz workflow — is + // covered here too. "" selects a memory-backed engine; a TempDir + // selects disk-backed segments under that slot directory. + t.Run("memory", func(t *testing.T) { testQwpSfSendLoopDropAndContinue(t, "") }) + t.Run("disk", func(t *testing.T) { testQwpSfSendLoopDropAndContinue(t, t.TempDir()) }) +} + +func testQwpSfSendLoopDropAndContinue(t *testing.T, sfDir string) { + // rejectStatus=SchemaMismatch (default Drop) for the very first + // frame only; subsequent frames get OK ACKs. We need the test + // server to support that mode — see opts.rejectFirstNFrames below. 
+ srv := newQwpSfTestServer(t, qwpSfTestServerOpts{ + rejectStatus: QwpStatusSchemaMismatch, + rejectFirstNFrames: 1, + }) + defer srv.Close() + + engine, err := qwpSfNewCursorEngine(sfDir, 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + defer func() { _ = engine.engineClose() }() + + transport, err := qwpSfDialFor(srv)(context.Background(), 0) + require.NoError(t, err) + + loop := qwpSfNewSendLoop(engine, transport, qwpSfDialFor(srv), + 100*time.Microsecond, time.Second, 10*time.Millisecond, 100*time.Millisecond) + + // Count dispatched SchemaMismatch / DropAndContinue errors so the + // test can assert the handler fired. The handler is installed + // before sendLoopStart. + var dispatched atomic.Int64 + loop.sendLoopSetErrorHandler(func(e *SenderError) { + if e.Category == CategorySchemaMismatch && e.AppliedPolicy == PolicyDropAndContinue { + dispatched.Add(1) + } + }, 8) + loop.sendLoopStart() + defer func() { _ = loop.sendLoopClose() }() + + // First frame is rejected → dropped. Frames 1 and 2 (0-indexed) are OK. + for i := 0; i < 3; i++ { + _, err := engine.engineAppendBlocking(context.Background(), + []byte(fmt.Sprintf("f%d", i))) + require.NoError(t, err) + } + // Three appends occupy FSN 0..2, so ackedFsn >= 2 means all + // three were acknowledged, including the dropped first frame. + require.Eventually(t, func() bool { + return engine.engineAckedFsn() >= 2 + }, 5*time.Second, 1*time.Millisecond, "ackedFsn did not advance past Drop") + + // No terminal error; reconnect did not trigger. + require.NoError(t, loop.sendLoopCheckError()) + require.Equal(t, int64(0), loop.sendLoopTotalReconnects()) + // Dispatcher saw at least one Drop-category SenderError (only a + // lower bound is asserted). + require.GreaterOrEqual(t, dispatched.Load(), int64(1)) + // Counter bumped on the Drop path. + require.GreaterOrEqual(t, loop.sendLoopTotalServerErrors(), int64(1)) +} + +// TestQwpSfSendLoopReceiverClampsForgedAckToFullySent is the +// lying-ACK regression guard. A non-compliant server ACKs a wire +// sequence whose sendMessage has not yet returned (an early or forged +// ACK for an in-flight frame).
The receiver must clamp the watermark +// advance to highestFullySent — the last frame fully on the wire — so +// ackedFsn never covers a frame the send goroutine is still reading +// out of the mmap'd segment (a trim would munmap it mid-read: SIGSEGV) +// nor a frame that never went out (silent loss). nextWireSeq is one +// frame too permissive for this ceiling because it is bumped before +// the wire write. +// +// Layout for both cases: 4 frames published (FSN 0..3). The send +// goroutine has STARTED all four (nextWireSeq=4) but only frames 0..2 +// have FINISHED sending (highestFullySent=2); FSN 3 is mid-sendMessage. +// The server forges an ACK naming wire sequence 3. The clamp must hold +// ackedFsn at FSN 2, never FSN 3. With the clamp keyed off +// nextWireSeq-1 (=3) instead of highestFullySent (=2) the watermark +// jumps to FSN 3 and the test fails. +func TestQwpSfSendLoopReceiverClampsForgedAckToFullySent(t *testing.T) { + const ( + published = 4 // FSN 0..3 live in the engine + fsnAtZero = 0 // fresh connection: wireSeq 0 maps to FSN 0 + started = 4 // nextWireSeq: wireSeq 0..3 all begun + fullySent = 2 // highestFullySent: FSN 0..2 on the wire + forgedSeq = 3 // server ACKs the in-flight FSN 3 + wantAckedFsn = 2 // clamp ceiling, NOT forgedSeq (3) + ) + + // run drives receiverLoop in isolation against a server that + // greets the connection with forgedAck. The producer/sender + // goroutines never run, so the hand-pinned wire state (notably + // highestFullySent) stays put — FSN 3 stuck mid-sendMessage — while + // the receiver processes the single forged ACK. Returns the + // resulting ackedFsn. 
+ run := func(t *testing.T, forgedAck []byte) int64 { + t.Helper() + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{forgedAckAtConnect: forgedAck}) + defer srv.Close() + + engine, err := qwpSfNewCursorEngine("", 4096, qwpSfUnlimitedTotalBytes, time.Second) + require.NoError(t, err) + defer func() { _ = engine.engineClose() }() + for i := 0; i < published; i++ { + _, err := engine.engineAppendBlocking(context.Background(), + []byte(fmt.Sprintf("f%d", i))) + require.NoError(t, err) + } + require.Equal(t, int64(published-1), engine.enginePublishedFsn()) + require.Equal(t, int64(-1), engine.engineAckedFsn()) + + transport, err := qwpSfDialFor(srv)(context.Background(), 0) + require.NoError(t, err) + + loop := qwpSfNewSendLoop(engine, transport, qwpSfDialFor(srv), + 100*time.Microsecond, time.Second, 10*time.Millisecond, 100*time.Millisecond) + // Quiet, non-blocking error sink for the drop-and-continue case. + loop.sendLoopSetErrorHandler(func(*SenderError) {}, 8) + + // Pin wire state as if the send goroutine had begun all four + // frames but only frames 0..2 finished sending. 
+ loop.fsnAtZero.Store(fsnAtZero) + loop.nextWireSeq.Store(started) + loop.highestFullySent.Store(fullySent) + loop.running.Store(true) + + done := make(chan struct{}) + go func() { + defer close(done) + _ = loop.receiverLoop(loop.ctx) + }() + + require.Eventually(t, func() bool { + return engine.engineAckedFsn() != -1 + }, 2*time.Second, time.Millisecond, "receiver never processed the forged ACK") + got := engine.engineAckedFsn() + + _ = loop.sendLoopClose() // running=false, cancel ctx, close transport + dispatcher + <-done + return got + } + + t.Run("OK ACK", func(t *testing.T) { + got := run(t, buildAckOK(forgedSeq)) + assert.Equal(t, int64(wantAckedFsn), got, + "OK-path clamp must hold the watermark at the last fully-sent "+ + "frame (FSN 2); FSN 3 is still mid-sendMessage") + }) + + t.Run("error ACK (drop-and-continue)", func(t *testing.T) { + // SchemaMismatch resolves to DropAndContinue by default, so the + // rejection path advances ackedFsn via engineAcknowledge(fsn) — + // exercising the second clamp site. + got := run(t, buildAckError(QwpStatusSchemaMismatch, forgedSeq, "forged")) + assert.Equal(t, int64(wantAckedFsn), got, + "rejection-path clamp must hold the watermark at the last "+ + "fully-sent frame (FSN 2); FSN 3 is still mid-sendMessage") + }) +} diff --git a/qwp_split_flush_test.go b/qwp_split_flush_test.go new file mode 100644 index 00000000..e8ea22da --- /dev/null +++ b/qwp_split_flush_test.go @@ -0,0 +1,233 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +import ( + "context" + "fmt" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +// TestQwpSplitFlushSegmentCapSendsFitTablesDropsOversize is the core +// per-table-split regression test (review item M6): a multi-table batch +// whose combined frame overruns the per-segment cap must NOT destroy +// every table's rows. enqueueCursor falls back to a per-table split that +// flushes each table whose own frame fits and drops only the table that +// is individually over-cap. Mirrors Java +// QwpWebSocketSender.flushPendingRowsSplit. +func TestQwpSplitFlushSegmentCapSendsFitTablesDropsOversize(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{recordFrames: true}) + defer srv.Close() + + // Memory-mode cursor with a 4 KiB segment and no auto-flush, so the + // whole batch lands in one combined frame at the explicit Flush. + s, engine, _, cleanup := newCursorSenderForTest(t, srv, 0) + defer cleanup() + + // No server cap advertised: the 4 KiB segment is the binding limit. + require.Zero(t, s.serverMaxBatchSize.Load(), + "test precondition: no server cap, so the segment is the binding limit") + + ctx := context.Background() + + // fit_table: a few small rows; its own single-table frame fits the + // 4 KiB segment. 
+ const fitRows = 3 + for i := 0; i < fitRows; i++ { + require.NoError(t, s.Table("fit_table").Int64Column("i", int64(i)).AtNow(ctx), + "fit row %d", i) + } + // big_table: ~20 KiB of column data — far past one 4 KiB segment even + // re-encoded on its own. + const bigRows = 100 + big := strings.Repeat("x", 200) + for i := 0; i < bigRows; i++ { + require.NoError(t, s.Table("big_table").StringColumn("s", big).AtNow(ctx), + "big row %d", i) + } + require.Equal(t, fitRows+bigRows, s.pendingRowCount) + + publishedBefore := engine.enginePublishedFsn() + + // The combined frame overruns the segment cap, so enqueueCursor + // splits per table: fit_table goes out; big_table is irreducible. + err := s.Flush(ctx) + require.Error(t, err, "the irreducible big_table must surface an error") + require.Contains(t, err.Error(), "big_table", "error must name the dropped table") + require.NotContains(t, err.Error(), "fit_table", + "the fit table must not be reported as dropped") + require.Contains(t, err.Error(), fmt.Sprintf("droppedRows=%d", bigRows), + "only big_table's rows are dropped, not the whole batch") + require.Contains(t, err.Error(), "cursor segment") + + // Whole batch resolved; nothing retained. + require.Zero(t, s.pendingRowCount) + require.Zero(t, s.pendingBytes) + + // Exactly one frame (fit_table) was published; big_table never was. + require.Equal(t, publishedBefore+1, engine.enginePublishedFsn(), + "exactly the fit_table frame should have been published") + + // Wait for that frame to reach the server, then assert the server saw + // fit_table and never saw big_table. Captured before the usability + // flush below so only the split's output is in the recording. + require.Eventually(t, func() bool { + return engine.engineAckedFsn() >= engine.enginePublishedFsn() + }, 2*time.Second, time.Millisecond) + + var payloads []string + for _, frames := range srv.recordedFrames() { + payloads = append(payloads, frames...) 
+ } + require.Len(t, payloads, 1, "server should receive exactly the one fit_table frame") + require.Contains(t, payloads[0], "fit_table") + require.NotContains(t, payloads[0], "big_table") + + // Sender stays usable after the partial drop. + require.NoError(t, s.Table("fit_table").Int64Column("i", 99).AtNow(ctx)) + require.NoError(t, s.Flush(ctx)) + require.Zero(t, s.pendingRowCount) +} + +// TestQwpSplitFlushServerCapDropsOnlyOversizeTable is the server-cap +// analogue: when the server-advertised batch cap (not the segment cap) +// is the binding limit, the split still flushes the fit table and drops +// only the individually-over-cap one, reporting just that table's rows. +func TestQwpSplitFlushServerCapDropsOnlyOversizeTable(t *testing.T) { + const serverCap = 256 + srv := newQwpTestServerWithMaxBatch(t, serverCap) + defer srv.Close() + + addr := strings.TrimPrefix(srv.URL, "http://") + ls, err := LineSenderFromConf(context.Background(), + "ws::addr="+addr+";auto_flush=off;") + require.NoError(t, err) + defer ls.Close(context.Background()) + s := ls.(*qwpLineSender) + + ctx := context.Background() + + // The cap rides the upgrade response; wait until the transport-swap + // callback has mirrored it onto the sender. + require.Eventually(t, func() bool { + return s.serverMaxBatchSize.Load() == serverCap + }, 2*time.Second, time.Millisecond) + + // fit_one: a single tiny row — its own frame is well under 256 B. + require.NoError(t, s.Table("fit_one").Int64Column("i", 1).AtNow(ctx)) + // big_many: enough rows that its own frame exceeds 256 B. 
+ const bigRows = 80 + for i := 0; i < bigRows; i++ { + require.NoError(t, s.Table("big_many").Int64Column("i", int64(i)).AtNow(ctx), + "big row %d", i) + } + require.Equal(t, bigRows+1, s.pendingRowCount) + + publishedBefore := s.cursorEngine.enginePublishedFsn() + + err = s.Flush(ctx) + require.Error(t, err) + require.Contains(t, err.Error(), "batch too large for server batch cap") + require.Contains(t, err.Error(), "big_many", "error must name the dropped table") + require.NotContains(t, err.Error(), "fit_one", + "the fit table must not be reported as dropped") + require.Contains(t, err.Error(), fmt.Sprintf("serverMaxBatchSize=%d", serverCap)) + require.Contains(t, err.Error(), fmt.Sprintf("droppedRows=%d", bigRows), + "only big_many's rows are dropped, not the fit table's") + + require.Zero(t, s.pendingRowCount) + require.Equal(t, publishedBefore+1, s.cursorEngine.enginePublishedFsn(), + "only the fit table should have been published") + + // Sender stays usable. + require.NoError(t, s.Table("fit_one").Int64Column("i", 2).AtNow(ctx)) + require.NoError(t, s.Flush(ctx)) +} + +// TestQwpSplitFlushAllFitTablesFlushAcrossFrames pins the all-reducible +// case: a combined frame over the segment cap purely by aggregation (no +// single table is over-cap) flushes every table, one frame per table, +// with no error and nothing dropped. +func TestQwpSplitFlushAllFitTablesFlushAcrossFrames(t *testing.T) { + srv := newQwpSfTestServer(t, qwpSfTestServerOpts{recordFrames: true}) + defer srv.Close() + + s, engine, _, cleanup := newCursorSenderForTest(t, srv, 0) + defer cleanup() + require.Zero(t, s.serverMaxBatchSize.Load()) + + ctx := context.Background() + + // Each table's own frame fits the 4 KiB segment comfortably, but the + // combined frame across all of them overruns it — forcing the split + // without any irreducible table. 
+ const ( + tableCount = 8 + rowsPerTbl = 4 + strBytesLen = 180 + ) + filler := strings.Repeat("y", strBytesLen) + for tbl := 0; tbl < tableCount; tbl++ { + name := fmt.Sprintf("tbl_%d", tbl) + for r := 0; r < rowsPerTbl; r++ { + require.NoError(t, s.Table(name). + StringColumn("s", filler). + Int64Column("i", int64(r)). + AtNow(ctx), "%s row %d", name, r) + } + } + require.Equal(t, tableCount*rowsPerTbl, s.pendingRowCount) + + publishedBefore := engine.enginePublishedFsn() + + // Sanity: the combined frame really does overrun the segment cap, so + // this test exercises the split rather than the single-frame path. + tables, err := s.buildTableEncodeInfo() + require.NoError(t, err) + combined := s.encoder.encodeMultiTableWithDeltaDict(tables, s.globalSymbolList, -1, s.batchMaxSymbolId) + require.Greater(t, int64(len(combined)), s.maxFrameBytes, + "test setup: combined frame must overrun the segment cap") + + require.NoError(t, s.Flush(ctx), "an all-fit batch must flush fully with no error") + require.Zero(t, s.pendingRowCount) + + // One frame per table was published. + require.Equal(t, publishedBefore+int64(tableCount), engine.enginePublishedFsn(), + "each table should be published as its own frame") + + require.Eventually(t, func() bool { + return engine.engineAckedFsn() >= engine.enginePublishedFsn() + }, 2*time.Second, time.Millisecond) + + var payloads []string + for _, frames := range srv.recordedFrames() { + payloads = append(payloads, frames...) + } + require.Len(t, payloads, tableCount, "server should receive one frame per table") +} diff --git a/qwp_transport.go b/qwp_transport.go index caf34c81..6762f589 100644 --- a/qwp_transport.go +++ b/qwp_transport.go @@ -33,37 +33,75 @@ import ( "encoding/binary" "fmt" "io" + "math" "net" "net/http" + "strconv" "strings" + "sync" + "time" "github.com/coder/websocket" ) -// qwpWritePath is the WebSocket endpoint for QWP ingestion. -const qwpWritePath = "/write/v4" +// QWP WebSocket endpoint paths. 
Ingest and egress are separate endpoints; +// they share the version-negotiation headers but otherwise do not overlap. +const ( + qwpWritePath = "/write/v4" // ingest (QwpSender) + qwpReadPath = "/read/v1" // egress (QwpQueryClient) +) -// Version-negotiation HTTP headers (QWP spec §3). +// QWP HTTP headers exchanged on the WebSocket upgrade. The version +// negotiation triple is shared by ingest and egress. The accept-encoding +// / max-batch-rows / content-encoding triple is egress-only — ingest +// never sends or reads them. const ( - qwpHeaderMaxVersion = "X-QWP-Max-Version" - qwpHeaderClientId = "X-QWP-Client-Id" - qwpHeaderVersion = "X-QWP-Version" + qwpHeaderMaxVersion = "X-QWP-Max-Version" + qwpHeaderClientId = "X-QWP-Client-Id" + qwpHeaderVersion = "X-QWP-Version" + qwpHeaderAcceptEncoding = "X-QWP-Accept-Encoding" + qwpHeaderMaxBatchRows = "X-QWP-Max-Batch-Rows" + // qwpHeaderMaxBatchSize is the server-advertised hard cap on a + // single DATA_BATCH wire frame (bytes), echoed in the WebSocket + // upgrade response. Used to clamp the producer's + // auto_flush_bytes trigger down to 90% of this value so a + // soft-flush fires before the encoded batch can exceed the cap + // and trip ws-close[1009]. 0 / absent / unparseable means the + // server did not advertise a cap (older build) and the + // configured auto_flush_bytes is kept verbatim. Mirrors Java + // WebSocketClient.QWP_MAX_BATCH_SIZE_HEADER_NAME. + qwpHeaderMaxBatchSize = "X-QWP-Max-Batch-Size" ) // qwpClientId is sent in X-QWP-Client-Id during the upgrade handshake. // Follows the lang/version convention used by other QuestDB clients // (e.g. java/1.0.2). -const qwpClientId = "go/4.1.0" +const qwpClientId = "go/4.3.0" -// QWP ACK response sizes (spec §13). An OK ACK is exactly -// qwpAckOKSize bytes; an error ACK is exactly -// qwpAckErrorHeaderSize + msg_len bytes. +// QWP ACK response sizes (spec §13). 
All ACKs share a fixed header +// shape, but their tails vary: +// +// OK: [status(1)] [sequence(8)] [tableCount(2)] [entries…] +// DURABLE_ACK: [status(1)] [tableCount(2)] [entries…] +// Error: [status(1)] [sequence(8)] [msg_len(2)] [msg] +// +// Each table entry is [nameLen(2)] [name(nameLen)] [seqTxn(8)]. The +// minimum frame sizes below correspond to a payload with zero entries. const ( - qwpAckOKSize = 9 // status(1) + sequence(8) - qwpAckErrorHeaderSize = 11 // status(1) + sequence(8) + msg_len(2) + qwpAckOKMinSize = 11 // status(1) + sequence(8) + tableCount(2) + qwpAckDurableMinSize = 3 // status(1) + tableCount(2) + qwpAckErrorHeaderSize = 11 // status(1) + sequence(8) + msg_len(2) + qwpAckTableEntryHeader = 10 // nameLen(2) + seqTxn(8) + qwpAckSequenceOffset = 1 // status(1) + qwpAckOKTablesOffset = 9 // status(1) + sequence(8) + qwpAckDurableTablesOff = 1 // status(1) + qwpAckErrorMsgLenOffset = 9 // status(1) + sequence(8) ) -// qwpTransportOpts configures a WebSocket transport connection. +// qwpTransportOpts configures a WebSocket transport connection. The +// same struct drives both ingest (/write/v4) and egress (/read/v1) +// connections; acceptEncoding, maxBatchRows, maxVersion, and +// serverInfoTimeout are egress-only and inert at their zero values. type qwpTransportOpts struct { // tlsMode controls certificate verification. // When true, certificate verification is skipped. @@ -73,13 +111,67 @@ type qwpTransportOpts struct { // header, e.g. "Bearer " or "Basic ". // Empty string means no auth. authorization string + + // endpointPath is the HTTP path used for the WebSocket upgrade. + // Required: ingest callers set qwpWritePath, egress callers set + // qwpReadPath. Empty strings are rejected by connect() so mistakes + // surface loudly instead of dialing the wrong endpoint by default. + endpointPath string + + // acceptEncoding, when non-empty, is sent verbatim as the + // X-QWP-Accept-Encoding upgrade header. Egress-only. 
Matches the + // Java client's WebSocketClient.setQwpAcceptEncoding contract: + // the caller builds the value ("zstd;level=3,raw" etc.); the + // transport just forwards it. Empty string omits the header. + acceptEncoding string + + // maxBatchRows, when > 0, is sent as the X-QWP-Max-Batch-Rows + // upgrade header. Egress-only. Zero omits the header and lets + // the server use its own cap. + maxBatchRows int + + // maxVersion is the value advertised in the X-QWP-Max-Version + // handshake header. Zero means qwpVersion. QWP currently has a + // single protocol version, so both ingest and egress callers + // advertise qwpVersion; the header is retained as the negotiation + // mechanism for a future version bump. The transport accepts any + // echoed X-QWP-Version that is <= maxVersion. + maxVersion byte + + // serverInfoTimeout, when > 0, enables synchronous consumption of + // the SERVER_INFO frame after the upgrade. The egress endpoint + // (/read/v1) appends an unsolicited SERVER_INFO frame to the 101 + // response, so egress callers set this. The ingest endpoint + // (/write/v4) sends no SERVER_INFO and the client never expects + // one — it sends data right after the upgrade and the first inbound + // frame is an ACK — so ingest senders leave it zero. + serverInfoTimeout time.Duration + + // authTimeoutMs is the failover.md §1 per-host upper bound on the + // HTTP upgrade response read (i.e. the wait between writing the + // upgrade request and reading the response headers). It does NOT + // cover TCP connect (OS default), TLS handshake, or the post- + // upgrade SERVER_INFO frame read. Zero defers to the standard + // http.Transport default (effectively unbounded), matching the + // pre-failover-spec behavior; sanitizeQwpConf seeds 15000 for + // QWP-configured callers. + authTimeoutMs int } // qwpTransport wraps a WebSocket connection for sending QWP -// messages and receiving ACK responses. 
It is not safe for -// concurrent use; in sync mode the caller goroutine owns it, -// in async mode the I/O goroutine owns it. +// messages and receiving ACK responses. It is owned by the I/O +// goroutine(s) that drive it — the ingest send loop (qwpSfSendLoop), +// or the egress reader plus dispatcher — and is not safe for +// unrestricted concurrent use. type qwpTransport struct { + // conn is the live WebSocket. A successful connect() assigns it once + // and it is never mutated again for the life of the transport — + // close() shuts the connection down but leaves the field intact. That + // immutability is load-bearing: the egress reader and dispatcher read + // conn lock-free from their own goroutines, and a concurrent close() + // (e.g. a short-ctx Close that returns before those goroutines join) + // must not race them. A closed conn already errors every I/O, so + // nil-ing the field would buy nothing and only reintroduce that race. conn *websocket.Conn // recvBuf is a reusable buffer for reading ACK responses, @@ -89,6 +181,35 @@ type qwpTransport struct { // dumpWriter, when non-nil, records all outgoing TCP bytes // (HTTP upgrade + WebSocket frames). Set before connect(). dumpWriter io.Writer + + // negotiatedVersion is the QWP wire-protocol version selected by + // the server's X-QWP-Version response header. Populated by + // connect(); 0 before connect() has succeeded. Egress callers + // branch on this to decide whether to expect a SERVER_INFO frame. + negotiatedVersion byte + + // serverMaxBatchSize is the server-advertised hard cap on a + // single DATA_BATCH wire frame (bytes), parsed from the + // X-QWP-Max-Batch-Size response header during connect(). 0 + // means the server did not advertise a cap (header absent / + // unparseable / non-positive); callers must treat 0 as "no + // clamp". 
Read by the qwpLineSender's transport-swap callback + // to refresh its effective auto_flush_bytes threshold on every + // successful connect; a rolling upgrade can leave neighbouring + // endpoints with different caps. + serverMaxBatchSize int32 + + // serverInfo holds the SERVER_INFO frame consumed during connect() + // when opts.serverInfoTimeout is > 0. Nil on connections that did + // not opt into SERVER_INFO consumption (ingest senders). + serverInfo *QwpServerInfo + + // closeOnce guards close() so the underlying conn is shut down at + // most once and repeat calls return the same result. It writes no + // field the I/O goroutines read — conn stays immutable (see above), + // so close() never races the lock-free reader/dispatcher. + closeOnce sync.Once + closeErr error } // teeConn wraps a net.Conn, copying all Write calls to a side writer. @@ -103,67 +224,183 @@ func (c *teeConn) Write(p []byte) (int, error) { return c.Conn.Write(p) } +// asyncWritePipeConn wraps the client end of the dump-mode net.Pipe so +// Write queues the bytes and returns immediately, emulating a kernel +// socket's send buffer. A real socket buffers the client's send, so +// sendMessage returns — and the send loop stores highestFullySent — +// before the server reads the frame and replies. net.Pipe is +// synchronous: Write blocks until the peer reads, which lets the fake +// server's OK ACK reach the receiver before highestFullySent is stored. +// The receiver then clamps that ACK away (its highestFullySent < 0 +// guard) and never advances ackedFsn, so Close drains until timeout. +// Queuing the write restores the production ordering. A single pump +// goroutine drains the queue in FIFO order, preserving frame boundaries +// and byte order; Read and all net.Conn metadata pass through to the +// embedded pipe end. 
+type asyncWritePipeConn struct { + net.Conn + mu sync.Mutex + cond *sync.Cond + queued []byte + closed bool +} + +func newAsyncWritePipeConn(c net.Conn) *asyncWritePipeConn { + a := &asyncWritePipeConn{Conn: c} + a.cond = sync.NewCond(&a.mu) + go a.pump() + return a +} + +func (a *asyncWritePipeConn) Write(p []byte) (int, error) { + a.mu.Lock() + defer a.mu.Unlock() + if a.closed { + return 0, net.ErrClosed + } + a.queued = append(a.queued, p...) + a.cond.Signal() + return len(p), nil +} + +// pump owns the only Write to the embedded pipe, so queued chunks reach +// the fake server in order and never interleave. It exits once the conn +// is closed and the queue is drained. +func (a *asyncWritePipeConn) pump() { + for { + a.mu.Lock() + for len(a.queued) == 0 && !a.closed { + a.cond.Wait() + } + if len(a.queued) == 0 && a.closed { + a.mu.Unlock() + return + } + chunk := a.queued + a.queued = nil + a.mu.Unlock() + if _, err := a.Conn.Write(chunk); err != nil { + return + } + } +} + +func (a *asyncWritePipeConn) Close() error { + a.mu.Lock() + a.closed = true + a.cond.Signal() + a.mu.Unlock() + return a.Conn.Close() +} + // connect establishes a WebSocket connection to the QWP endpoint. -// The url should be a ws:// or wss:// URL without the path; the -// /write/v4 path is appended automatically. +// The url should be a ws:// or wss:// URL without the path; the path +// comes from opts.endpointPath, which is required. // // If t.dumpWriter is set, outgoing TCP bytes are recorded. When the // url is empty, an in-process pipe with a fake WebSocket acceptor // is used so the dump includes full HTTP upgrade + WebSocket framing // without requiring a real server. 
func (t *qwpTransport) connect(ctx context.Context, url string, opts qwpTransportOpts) error { - wsURL := url + qwpWritePath + if opts.endpointPath == "" { + return fmt.Errorf("qwp: endpointPath is required") + } + path := opts.endpointPath + wsURL := url + path + advertisedMax := opts.maxVersion + if advertisedMax == 0 { + advertisedMax = qwpVersion + } dialOpts := &websocket.DialOptions{ HTTPHeader: http.Header{ - qwpHeaderMaxVersion: []string{fmt.Sprintf("%d", qwpVersion)}, + qwpHeaderMaxVersion: []string{fmt.Sprintf("%d", advertisedMax)}, qwpHeaderClientId: []string{qwpClientId}, }, } if opts.authorization != "" { dialOpts.HTTPHeader.Set("Authorization", opts.authorization) } + if opts.acceptEncoding != "" { + dialOpts.HTTPHeader.Set(qwpHeaderAcceptEncoding, opts.acceptEncoding) + } + if opts.maxBatchRows > 0 { + dialOpts.HTTPHeader.Set(qwpHeaderMaxBatchRows, fmt.Sprintf("%d", opts.maxBatchRows)) + } + + // Build the http.Transport so we can install ResponseHeaderTimeout + // per failover.md §1 (auth_timeout_ms bounds the upgrade response + // read). The same Transport carries TLS config for wss:// and the + // pipe-DialContext for dump mode. + // + // DisableKeepAlives keeps this one-shot transport from pooling. It is + // built fresh per connect() and discarded after, so there is no reuse + // to gain — and on a non-101 upgrade response (421 role-reject, 503 + // proxy, ...) coder/websocket reads the body to EOF and closes it, + // which would otherwise return the keep-alive TCP conn to this + // transport's idle pool. Nothing reuses the abandoned transport or + // calls CloseIdleConnections on it, so the parked conn plus its + // persistConn read/write goroutines would leak — and role-rejects are + // steady-state in a failover topology, so the leak accumulates. A + // successful 101 hijacks the conn out of pool management, so the flag + // never affects the live WebSocket. 
+ httpTransport := &http.Transport{ + DisableKeepAlives: true, + } + if opts.authTimeoutMs > 0 { + httpTransport.ResponseHeaderTimeout = time.Duration(opts.authTimeoutMs) * time.Millisecond + } if t.dumpWriter != nil { - // Dump mode: use an in-process pipe with a fake server. + // Dump mode: use an in-process pipe with a fake server. The + // client write end is buffered (asyncWritePipeConn) so it + // behaves like a real socket — without it the synchronous pipe + // lets the fake server's ACK race the send loop's bookkeeping. clientConn, serverConn := net.Pipe() go qwpFakeServer(serverConn) - wrapped := &teeConn{Conn: clientConn, w: t.dumpWriter} - dialOpts.HTTPClient = &http.Client{ - Transport: &http.Transport{ - DialContext: func(_ context.Context, _, _ string) (net.Conn, error) { - return wrapped, nil - }, - }, + buffered := newAsyncWritePipeConn(clientConn) + wrapped := &teeConn{Conn: buffered, w: t.dumpWriter} + httpTransport.DialContext = func(_ context.Context, _, _ string) (net.Conn, error) { + return wrapped, nil } // Use a dummy URL so the WS library has something to parse. - wsURL = "ws://dump.local" + qwpWritePath + wsURL = "ws://dump.local" + path - // If Dial fails, close the pipe so the fake server goroutine exits. + // If Dial fails, close the buffered conn so the pump and fake + // server goroutines exit. On success the WebSocket owns wrapped + // and its Close path tears both down. defer func() { if t.conn == nil { - clientConn.Close() + buffered.Close() } }() } else if opts.tlsInsecureSkipVerify { // TLS configuration for wss:// connections. 
- dialOpts.HTTPClient = &http.Client{ - Transport: &http.Transport{ - TLSClientConfig: &tls.Config{ - InsecureSkipVerify: true, - MinVersion: tls.VersionTLS12, - }, - }, + httpTransport.TLSClientConfig = &tls.Config{ + InsecureSkipVerify: true, + MinVersion: tls.VersionTLS12, } } + dialOpts.HTTPClient = &http.Client{Transport: httpTransport} conn, resp, err := websocket.Dial(ctx, wsURL, dialOpts) if err != nil { - if resp != nil && resp.Body != nil { - _ = resp.Body.Close() + // On a non-101 response, build a typed *QwpUpgradeRejectError + // from the captured status + headers so the failover loop can + // classify the host (role-reject / topology / transport) without + // re-parsing string error messages. resp may be nil for TCP/TLS + // dial failures or response-header timeouts; in that case fall + // back to the wrapped dial error. + if resp != nil { + rejectErr := buildUpgradeRejectError(resp, err) + resp.Body.Close() + return rejectErr } return fmt.Errorf("qwp: websocket dial: %w", err) } + if resp != nil && resp.Body != nil { + defer resp.Body.Close() + } // Validate the server-selected QWP version. 
Require the header to // be present and match our version — a missing header signals a @@ -179,22 +416,118 @@ func (t *qwpTransport) connect(ctx context.Context, url string, opts qwpTranspor conn.Close(websocket.StatusProtocolError, "missing version header") return fmt.Errorf("qwp: server did not return %s header", qwpHeaderVersion) } - if serverVersion != fmt.Sprintf("%d", qwpVersion) { + negotiated, err := strconv.Atoi(serverVersion) + if err != nil || negotiated < 1 || negotiated > int(advertisedMax) { conn.Close(websocket.StatusProtocolError, "version mismatch") - return fmt.Errorf("qwp: server selected protocol version %q, client supports %d", serverVersion, qwpVersion) + return fmt.Errorf("qwp: server selected protocol version %q, client supports up to %d", serverVersion, advertisedMax) } - // Remove the default read limit — QWP ACKs are small but - // error payloads can vary. - conn.SetReadLimit(-1) + // Raise — but do not remove — the default read limit. QWP ACKs are + // small, but egress RESULT_BATCH frames can reach qwpMaxBatchSize, + // so the 32 KiB default is too low. A finite ceiling (not -1) is + // load-bearing: this conn is shared by the egress reader and the + // ingest readAck path, and coder/websocket enforces the limit while + // streaming the message — a hostile or buggy server emitting a + // multi-GB frame is cut off mid-read instead of OOMing the host + // before any downstream size check runs. + conn.SetReadLimit(qwpMaxFrameReadLimit) t.conn = conn + t.negotiatedVersion = byte(negotiated) + // Parse the optional X-QWP-Max-Batch-Size advertisement. A + // non-positive or unparseable value is treated as "no cap": + // older servers that don't emit the header leave the configured + // auto_flush_bytes untouched. Mirrors Java + // WebSocketClient.extractMaxBatchSize. 
+ if cap := resp.Header.Get(qwpHeaderMaxBatchSize); cap != "" { + if parsed, perr := strconv.Atoi(cap); perr == nil && parsed > 0 { + if parsed > math.MaxInt32 { + parsed = math.MaxInt32 + } + t.serverMaxBatchSize = int32(parsed) + } + } if t.recvBuf == nil { t.recvBuf = make([]byte, 0, qwpDefaultInitRecvBufSize) } + + // The egress endpoint appends a SERVER_INFO frame to the upgrade + // response (the read endpoint always emits it post-handshake), + // before any client request. Consume it synchronously so the I/O + // goroutines start with a clean recv queue and the user-visible + // ServerInfo() accessor is populated before submit. Egress + // connections opt in via opts.serverInfoTimeout > 0; the ingest + // endpoint sends no SERVER_INFO and the client never expects one, + // so ingest senders leave it zero and read ACKs directly. + if opts.serverInfoTimeout > 0 { + readCtx, cancel := context.WithTimeout(ctx, opts.serverInfoTimeout) + defer cancel() + msgType, payload, err := t.conn.Read(readCtx) + if err != nil { + t.conn.Close(websocket.StatusProtocolError, "SERVER_INFO read failed") + t.conn = nil + return fmt.Errorf("qwp: SERVER_INFO read failed: %w", err) + } + if msgType != websocket.MessageBinary { + t.conn.Close(websocket.StatusProtocolError, "SERVER_INFO non-binary") + t.conn = nil + return fmt.Errorf("qwp: expected SERVER_INFO binary frame, got %v", msgType) + } + info, err := decodeServerInfo(payload, t.negotiatedVersion) + if err != nil { + t.conn.Close(websocket.StatusProtocolError, "SERVER_INFO decode failed") + t.conn = nil + return fmt.Errorf("qwp: SERVER_INFO decode failed: %w", err) + } + t.serverInfo = info + } return nil } +// buildUpgradeRejectError snapshots the relevant fields of a non-101 +// upgrade response into a typed QwpUpgradeRejectError. Reads up to +// qwpUpgradeBodySnippetCap bytes of the body so the error message +// surfaces operator-supplied text (e.g. a reverse-proxy maintenance +// page) without unbounded memory cost. 
The caller is responsible for +// closing resp.Body once this returns. cause is the originating +// websocket.Dial error, retained so it is wrapped (not discarded) — +// notably when StatusCode is 101 but the upgrade still failed. +func buildUpgradeRejectError(resp *http.Response, cause error) *QwpUpgradeRejectError { + role := strings.TrimSpace(resp.Header.Get("X-QuestDB-Role")) + zone := strings.TrimSpace(resp.Header.Get("X-QuestDB-Zone")) + var retryAfter time.Duration + if ra := strings.TrimSpace(resp.Header.Get("Retry-After")); ra != "" { + // Per RFC 7231 §7.1.3, Retry-After is either an HTTP-date or a + // non-negative integer of seconds. We only honour the seconds + // form here — the failover loop's outage budget is the + // authoritative wait bound, so HTTP-date precision adds little. + if secs, perr := strconv.Atoi(ra); perr == nil && secs > 0 { + retryAfter = time.Duration(secs) * time.Second + } + } + var body string + if resp.Body != nil { + buf := make([]byte, qwpUpgradeBodySnippetCap+1) + n, _ := io.ReadFull(resp.Body, buf) + switch { + case n <= 0: + // no body or unreadable; leave empty + case n > qwpUpgradeBodySnippetCap: + body = strings.TrimSpace(string(buf[:qwpUpgradeBodySnippetCap])) + "…" + default: + body = strings.TrimSpace(string(buf[:n])) + } + } + return &QwpUpgradeRejectError{ + StatusCode: resp.StatusCode, + Role: role, + Zone: zone, + RetryAfter: retryAfter, + Body: body, + cause: cause, + } +} + // sendMessage sends a QWP message as a WebSocket binary frame. func (t *qwpTransport) sendMessage(ctx context.Context, data []byte) error { if t.conn == nil { @@ -205,18 +538,35 @@ func (t *qwpTransport) sendMessage(ctx context.Context, data []byte) error { // readAck reads and parses the server's ACK response. It returns // the status code and the full response payload (including the -// status byte). 
The payload is validated against the exact length -// required by §13: OK ACKs must be exactly qwpAckOKSize bytes, error -// ACKs must be exactly qwpAckErrorHeaderSize + msg_len bytes. This +// status byte). The payload is validated against the exact shape +// required by spec §13: OK and DURABLE_ACK frames carry per-table +// watermark entries and must consume the frame exactly; error frames +// must end exactly at status + sequence + msg_len + msg. This // mirrors the Java client's WebSocketResponse.isStructurallyValid -// and fails loudly on any unrecognized shape (e.g. a legacy PARTIAL -// response) instead of decoding it into garbage fields. +// and fails loudly on any unrecognized shape (e.g. a legacy 9-byte +// OK response) instead of decoding it into garbage fields. +// +// - OK ACKs are status(1) + sequence(8) + tableCount(2) + +// tableCount × (nameLen(2) + name + seqTxn(8)). Minimum 11 bytes; +// the trailing per-table entries section must consume the rest of +// the payload exactly. +// +// - DURABLE_ACK frames are unsolicited per-table watermarks. They +// are validated and returned to the caller with status +// QwpStatusDurableAck — the caller decides what to do with them +// (the cursor send loop ignores them and reads on). Servers only +// emit them when the client opts in via the X-QWP-Request-Durable- +// Ack header, which this transport does not set. +// +// - Error ACKs are exactly qwpAckErrorHeaderSize + msg_len bytes. 
// -// ACK layouts: +// OK: [status (0x00)] [sequence: int64 LE] [tableCount: uint16 LE] [entries…] +// DURABLE_ACK: [status (0x02)] [tableCount: uint16 LE] [entries…] +// Error: [status] [sequence: int64 LE] [msg_len: uint16 LE] [msg: UTF-8] // -// OK: [status: uint8 (0x00)] [sequence: int64 LE] -// Error: [status: uint8] [sequence: int64 LE] [msg_len: uint16 LE] [msg: UTF-8] -func (t *qwpTransport) readAck(ctx context.Context) (qwpStatusCode, []byte, error) { +// Each table entry is [nameLen: uint16 LE] [name (nameLen bytes UTF-8)] +// [seqTxn: int64 LE]. nameLen must be > 0 — empty names are rejected. +func (t *qwpTransport) readAck(ctx context.Context) (QwpStatusCode, []byte, error) { if t.conn == nil { return 0, nil, fmt.Errorf("qwp: not connected") } @@ -236,60 +586,112 @@ func (t *qwpTransport) readAck(ctx context.Context) (qwpStatusCode, []byte, erro break } } - if len(data) < qwpAckOKSize { + if len(data) < 1 { return 0, nil, fmt.Errorf("qwp: ack too short: %d bytes", len(data)) } - statusCode := qwpStatusCode(data[0]) - if statusCode == qwpStatusOK { - if len(data) != qwpAckOKSize { - return 0, nil, fmt.Errorf("qwp: malformed OK ack: got %d bytes, want %d", len(data), qwpAckOKSize) + statusCode := QwpStatusCode(data[0]) + switch statusCode { + case QwpStatusOK: + if len(data) < qwpAckOKMinSize { + return 0, nil, fmt.Errorf("qwp: malformed OK ack: got %d bytes, want at least %d", len(data), qwpAckOKMinSize) + } + if err := validateAckTableEntries(data[qwpAckOKTablesOffset:]); err != nil { + return 0, nil, fmt.Errorf("qwp: malformed OK ack: %w", err) + } + return statusCode, data, nil + case QwpStatusDurableAck: + if len(data) < qwpAckDurableMinSize { + return 0, nil, fmt.Errorf("qwp: malformed durable ack: got %d bytes, want at least %d", len(data), qwpAckDurableMinSize) + } + if err := validateAckTableEntries(data[qwpAckDurableTablesOff:]); err != nil { + return 0, nil, fmt.Errorf("qwp: malformed durable ack: %w", err) } return statusCode, data, nil } + 
// Error frame. if len(data) < qwpAckErrorHeaderSize { return 0, nil, fmt.Errorf("qwp: malformed error ack: got %d bytes, want at least %d", len(data), qwpAckErrorHeaderSize) } - msgLen := int(binary.LittleEndian.Uint16(data[9:11])) + msgLen := int(binary.LittleEndian.Uint16(data[qwpAckErrorMsgLenOffset : qwpAckErrorMsgLenOffset+2])) if len(data) != qwpAckErrorHeaderSize+msgLen { return 0, nil, fmt.Errorf("qwp: malformed error ack: status=0x%02X, got %d bytes, want %d", byte(statusCode), len(data), qwpAckErrorHeaderSize+msgLen) } return statusCode, data, nil } -// parseAckError extracts an error message from a non-OK ACK payload. -// The layout is: +// validateAckTableEntries walks the per-table watermark trailer of an +// OK or DURABLE_ACK frame and checks that its declared length consumes +// the buffer exactly. Returns nil on success or a descriptive error +// for any truncation, lying-length entry, empty table name, or +// trailing garbage. +func validateAckTableEntries(tail []byte) error { + if len(tail) < 2 { + return fmt.Errorf("missing table count") + } + tableCount := int(binary.LittleEndian.Uint16(tail[0:2])) + off := 2 + for i := 0; i < tableCount; i++ { + if len(tail) < off+2 { + return fmt.Errorf("truncated table entry %d (header)", i) + } + nameLen := int(binary.LittleEndian.Uint16(tail[off : off+2])) + off += 2 + // Empty names indicate a corrupt or hostile payload — match + // the Java client and reject them. A valid table name is + // never zero bytes. + if nameLen == 0 { + return fmt.Errorf("empty table name in entry %d", i) + } + if len(tail) < off+nameLen+8 { + return fmt.Errorf("truncated table entry %d (body)", i) + } + off += nameLen + 8 + } + if off != len(tail) { + return fmt.Errorf("trailing %d bytes after %d table entries", len(tail)-off, tableCount) + } + return nil +} + +// parseAckError extracts an error message from a non-OK, non-durable +// ACK payload. 
The layout is: // // [statusCode: uint8] [sequence: int64 LE] [errorLength: uint16 LE] [errorMessage: UTF-8] // // Precondition: data has already been validated by readAck, which -// guarantees at least qwpAckErrorHeaderSize bytes for non-OK statuses +// guarantees at least qwpAckErrorHeaderSize bytes for error statuses // and that the trailing bytes match the declared errorLength. func parseAckError(data []byte) string { - const errLenOffset = 9 // 1 (status) + 8 (sequence) - const errMsgOffset = 11 // errLenOffset + 2 (uint16) - errLen := int(binary.LittleEndian.Uint16(data[errLenOffset:errMsgOffset])) - return string(data[errMsgOffset : errMsgOffset+errLen]) + errLen := int(binary.LittleEndian.Uint16(data[qwpAckErrorMsgLenOffset : qwpAckErrorMsgLenOffset+2])) + start := qwpAckErrorHeaderSize + return string(data[start : start+errLen]) } // parseAckSequence extracts the cumulative sequence number from an -// ACK payload. The wire field is signed (int64 LE) and uses -1 as -// a sentinel; matches Java's long semantics. +// OK or error ACK payload. The wire field is signed (int64 LE) and +// uses -1 as a sentinel; matches Java's long semantics. DURABLE_ACK +// frames have no sequence — callers must skip them before calling. // -// Precondition: data has already been validated by readAck, which -// guarantees at least qwpAckOKSize bytes. +// Precondition: data has already been validated by readAck. func parseAckSequence(data []byte) int64 { - return int64(binary.LittleEndian.Uint64(data[1:9])) + return int64(binary.LittleEndian.Uint64(data[qwpAckSequenceOffset : qwpAckSequenceOffset+8])) } -// close sends a graceful WebSocket close frame and cleans up. -func (t *qwpTransport) close(ctx context.Context) error { +// close shuts the WebSocket down with a graceful close frame. 
Idempotent +// and safe to call concurrently with the egress reader/dispatcher: it +// closes the conn — which unblocks and errors their in-flight Read/Write +// — but never mutates the conn field, so it cannot race their lock-free +// reads of it. coder/websocket's Conn.Close is itself safe under +// concurrent and repeated calls; closeOnce additionally pins one result. +func (t *qwpTransport) close() error { if t.conn == nil { return nil } - err := t.conn.Close(websocket.StatusNormalClosure, "") - t.conn = nil - return err + t.closeOnce.Do(func() { + t.closeErr = t.conn.Close(websocket.StatusNormalClosure, "") + }) + return t.closeErr } // --- fake server for dump mode --- @@ -341,6 +743,9 @@ func qwpFakeServer(conn net.Conn) { } // --- WebSocket frame loop --- + // seq is the next batch's cumulative ACK sequence, 0-based: the + // first batch (FSN 0) is acked as sequence 0, matching the real + // server and the producer's FSN numbering. var seq uint64 var hdr [14]byte // max WS header size for { @@ -386,37 +791,23 @@ func qwpFakeServer(conn net.Conn) { conn.Write([]byte{0x88, 0x02, 0x03, 0xE8}) return case 0x02: // Binary frame — send QWP OK ACK. - seq++ - var ack [11]byte - // Unmasked binary frame: FIN+BINARY=0x82, length=9. + var ack [13]byte + // Unmasked binary frame: FIN+BINARY=0x82, payload length=11. ack[0] = 0x82 - ack[1] = 0x09 - // Payload: status OK (0x00) + sequence (uint64 LE). + ack[1] = 0x0B + // Payload: status OK (0x00) + cumulative sequence (uint64 LE) + // + tableCount=0 (uint16 LE). The 2-byte zero-table-count + // trailer is required by the QWP §13 OK ACK shape. The + // sequence is 0-based and built before the post-increment so + // dump mode exercises the same ACK path as production. ack[2] = 0x00 // STATUS_OK binary.LittleEndian.PutUint64(ack[3:], seq) + binary.LittleEndian.PutUint16(ack[11:], 0) if _, err := conn.Write(ack[:]); err != nil { return } + seq++ } // Ignore other opcodes (ping/pong handled by WS library). 
} } - -// sendAndAck sends a QWP message and reads exactly one ACK. -// Returns nil on OK, a *QwpError for server-side rejections, or a -// transport error on connection failure. No retry: the spec defines -// no retriable status, so any non-OK response is terminal. -func (t *qwpTransport) sendAndAck(ctx context.Context, sendFn func() []byte) error { - msg := sendFn() - if err := t.sendMessage(ctx, msg); err != nil { - return err - } - _, data, err := t.readAck(ctx) - if err != nil { - return err - } - if qErr := newQwpErrorFromAck(data); qErr != nil { - return qErr - } - return nil -} diff --git a/qwp_transport_test.go b/qwp_transport_test.go index 1c2ad344..762d6eb2 100644 --- a/qwp_transport_test.go +++ b/qwp_transport_test.go @@ -28,6 +28,9 @@ import ( "bytes" "context" "encoding/binary" + "errors" + "fmt" + "net" "net/http" "net/http/httptest" "strings" @@ -41,16 +44,72 @@ import ( // --- Unit tests for ACK parsing --- -// buildAckOK builds a minimal OK ACK response (9 bytes). +// buildAckOK builds a minimal OK ACK response (11 bytes — status + +// sequence + tableCount=0, no per-table entries). func buildAckOK(seq int64) []byte { - data := make([]byte, 9) - data[0] = byte(qwpStatusOK) + data := make([]byte, qwpAckOKMinSize) + data[0] = byte(QwpStatusOK) binary.LittleEndian.PutUint64(data[1:9], uint64(seq)) + binary.LittleEndian.PutUint16(data[9:11], 0) return data } +// buildAckOKWithTables builds an OK ACK whose tail carries one or +// more per-table watermark entries (nameLen + name + seqTxn). Used by +// tests that exercise the new OK-with-watermark wire shape. 
+func buildAckOKWithTables(seq int64, entries ...struct { + name string + seqTxn int64 +}) []byte { + tail := encodeAckTableEntries(entries) + data := make([]byte, 11+len(tail)) + data[0] = byte(QwpStatusOK) + binary.LittleEndian.PutUint64(data[1:9], uint64(seq)) + binary.LittleEndian.PutUint16(data[9:11], uint16(len(entries))) + copy(data[11:], tail) + return data +} + +// buildAckDurable builds a STATUS_DURABLE_ACK response (status + +// tableCount + entries). +func buildAckDurable(entries ...struct { + name string + seqTxn int64 +}) []byte { + tail := encodeAckTableEntries(entries) + data := make([]byte, 3+len(tail)) + data[0] = byte(QwpStatusDurableAck) + binary.LittleEndian.PutUint16(data[1:3], uint16(len(entries))) + copy(data[3:], tail) + return data +} + +// encodeAckTableEntries serializes per-table watermark entries +// (nameLen(2) + name + seqTxn(8)) without the leading tableCount. +// Caller is responsible for prepending tableCount. +func encodeAckTableEntries(entries []struct { + name string + seqTxn int64 +}) []byte { + size := 0 + for _, e := range entries { + size += 2 + len(e.name) + 8 + } + out := make([]byte, size) + off := 0 + for _, e := range entries { + binary.LittleEndian.PutUint16(out[off:off+2], uint16(len(e.name))) + off += 2 + copy(out[off:], e.name) + off += len(e.name) + binary.LittleEndian.PutUint64(out[off:off+8], uint64(e.seqTxn)) + off += 8 + } + return out +} + // buildAckError builds an error ACK response with message. 
-func buildAckError(status qwpStatusCode, seq int64, errMsg string) []byte { +func buildAckError(status QwpStatusCode, seq int64, errMsg string) []byte { data := make([]byte, 11+len(errMsg)) data[0] = byte(status) binary.LittleEndian.PutUint64(data[1:9], uint64(seq)) @@ -66,7 +125,7 @@ func buildAckError(status qwpStatusCode, seq int64, errMsg string) []byte { func TestQwpParseAckError(t *testing.T) { t.Run("ErrorWithMessage", func(t *testing.T) { errMsg := "bad data" - data := buildAckError(qwpStatusParseError, 1, errMsg) + data := buildAckError(QwpStatusParseError, 1, errMsg) msg := parseAckError(data) if msg != errMsg { @@ -75,7 +134,7 @@ func TestQwpParseAckError(t *testing.T) { }) t.Run("EmptyErrorMessage", func(t *testing.T) { - data := buildAckError(qwpStatusInternalError, 2, "") + data := buildAckError(QwpStatusInternalError, 2, "") msg := parseAckError(data) if msg != "" { t.Fatalf("expected empty, got %q", msg) @@ -83,12 +142,12 @@ func TestQwpParseAckError(t *testing.T) { }) t.Run("AllStatusCodes", func(t *testing.T) { - codes := []qwpStatusCode{ - qwpStatusSchemaMismatch, - qwpStatusParseError, - qwpStatusInternalError, - qwpStatusSecurityError, - qwpStatusWriteError, + codes := []QwpStatusCode{ + QwpStatusSchemaMismatch, + QwpStatusParseError, + QwpStatusInternalError, + QwpStatusSecurityError, + QwpStatusWriteError, } for _, code := range codes { errMsg := "error for status" @@ -111,7 +170,7 @@ func TestQwpParseAckSequence(t *testing.T) { } // Error response should also have sequence. - dataErr := buildAckError(qwpStatusParseError, 99, "err") + dataErr := buildAckError(QwpStatusParseError, 99, "err") seq = parseAckSequence(dataErr) if seq != 99 { t.Fatalf("sequence = %d, want 99", seq) @@ -132,7 +191,7 @@ func TestQwpTransportNotConnected(t *testing.T) { } // close on unconnected should be no-op. 
- if err := tr.close(context.Background()); err != nil { + if err := tr.close(); err != nil { t.Fatalf("close on unconnected: %v", err) } } @@ -153,6 +212,48 @@ func newTestWSServer(t *testing.T, handler func(*websocket.Conn)) *httptest.Serv })) } +// newTestWSServerV2 echoes the negotiated version as the X-QWP-Version +// response header (default qwpVersion; override via opts.version), and +// when serverInfoFrame is non-nil writes it as the first WebSocket +// binary frame after the upgrade. The caller-supplied handler runs +// after the SERVER_INFO frame is sent so tests can drive arbitrary +// post-handshake choreography. +func newTestWSServerV2(t *testing.T, opts testWSServerV2Opts, handler func(*websocket.Conn)) *httptest.Server { + t.Helper() + version := opts.version + if version == 0 { + version = qwpVersion + } + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set(qwpHeaderVersion, fmt.Sprintf("%d", version)) + conn, err := websocket.Accept(w, r, nil) + if err != nil { + t.Logf("websocket accept error: %v", err) + return + } + defer conn.CloseNow() + if opts.serverInfoFrame != nil { + if err := conn.Write(r.Context(), websocket.MessageBinary, opts.serverInfoFrame); err != nil { + t.Logf("server: SERVER_INFO write error: %v", err) + return + } + } + if handler != nil { + handler(conn) + } + })) +} + +type testWSServerV2Opts struct { + // version is the value echoed in X-QWP-Version. Zero defaults to + // qwpVersion. + version byte + // serverInfoFrame, when non-nil, is written as the first binary + // frame after the upgrade. Built via buildServerInfoFrame in + // qwp_server_info_test.go. + serverInfoFrame []byte +} + func TestQwpTransportConnectAndClose(t *testing.T) { srv := newTestWSServer(t, func(conn *websocket.Conn) { // Echo server: just wait for close.
@@ -169,7 +270,7 @@ func TestQwpTransportConnectAndClose(t *testing.T) { wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") var tr qwpTransport - err := tr.connect(context.Background(), wsURL, qwpTransportOpts{}) + err := tr.connect(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath}) if err != nil { t.Fatalf("connect: %v", err) } @@ -178,12 +279,24 @@ func TestQwpTransportConnectAndClose(t *testing.T) { t.Fatal("conn should not be nil after connect") } - err = tr.close(context.Background()) + err = tr.close() if err != nil { t.Fatalf("close: %v", err) } - if tr.conn != nil { - t.Fatal("conn should be nil after close") + // close() shuts the connection down but deliberately leaves the conn + // field intact — it is immutable after connect so the egress reader + // and dispatcher can read it lock-free without racing a concurrent + // close (see qwpTransport.conn). The connection is nonetheless dead: + // I/O on it errors. + if tr.conn == nil { + t.Fatal("conn should be retained after close (immutable post-connect)") + } + if err := tr.sendMessage(context.Background(), []byte{0x00}); err == nil { + t.Fatal("sendMessage should fail on a closed connection") + } + // close() is idempotent: a repeat call returns the same nil result. 
+ if err := tr.close(); err != nil { + t.Fatalf("second close: %v", err) } } @@ -210,10 +323,10 @@ func TestQwpTransportNegotiationHeaders(t *testing.T) { wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") var tr qwpTransport - if err := tr.connect(context.Background(), wsURL, qwpTransportOpts{}); err != nil { + if err := tr.connect(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath}); err != nil { t.Fatalf("connect: %v", err) } - defer tr.close(context.Background()) + defer tr.close() if gotMaxVersion != "1" { t.Errorf("X-QWP-Max-Version = %q, want %q", gotMaxVersion, "1") @@ -243,10 +356,10 @@ func TestQwpTransportVersionMatchAccepted(t *testing.T) { wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") var tr qwpTransport - if err := tr.connect(context.Background(), wsURL, qwpTransportOpts{}); err != nil { + if err := tr.connect(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath}); err != nil { t.Fatalf("connect: %v", err) } - defer tr.close(context.Background()) + defer tr.close() } // TestQwpTransportVersionMissingRejected verifies that a server response @@ -269,9 +382,9 @@ func TestQwpTransportVersionMissingRejected(t *testing.T) { wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") var tr qwpTransport - err := tr.connect(context.Background(), wsURL, qwpTransportOpts{}) + err := tr.connect(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath}) if err == nil { - tr.close(context.Background()) + tr.close() t.Fatal("expected missing-version error") } if !strings.Contains(err.Error(), qwpHeaderVersion) { @@ -302,9 +415,9 @@ func TestQwpTransportVersionMismatchRejected(t *testing.T) { wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") var tr qwpTransport - err := tr.connect(context.Background(), wsURL, qwpTransportOpts{}) + err := tr.connect(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath}) if err == nil { - tr.close(context.Background()) + tr.close() t.Fatal("expected version 
mismatch error") } if !strings.Contains(err.Error(), "version") { @@ -315,6 +428,117 @@ func TestQwpTransportVersionMismatchRejected(t *testing.T) { } } +// TestQwpTransportNegotiationConsumesServerInfo verifies that an +// egress-style connection reads the SERVER_INFO frame the server emits +// post-upgrade, and exposes the decoded fields via tr.serverInfo. The +// recv buffer must be clean for follow-up frames. +func TestQwpTransportNegotiationConsumesServerInfo(t *testing.T) { + frame := buildServerInfoFrame(qwpVersion, 0, + qwpRolePrimary, 17, 0, 1234567890, "alpha", "node-A") + srv := newTestWSServerV2(t, testWSServerV2Opts{ + serverInfoFrame: frame, + }, func(conn *websocket.Conn) { + // Stay alive so the client can close cleanly. + for { + if _, _, err := conn.Read(context.Background()); err != nil { + return + } + } + }) + defer srv.Close() + + wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") + var tr qwpTransport + err := tr.connect(context.Background(), wsURL, qwpTransportOpts{ + endpointPath: qwpReadPath, + maxVersion: qwpVersion, + serverInfoTimeout: 2 * time.Second, + }) + if err != nil { + t.Fatalf("connect: %v", err) + } + defer tr.close() + + if tr.negotiatedVersion != qwpVersion { + t.Errorf("negotiatedVersion = %d, want %d", + tr.negotiatedVersion, qwpVersion) + } + if tr.serverInfo == nil { + t.Fatal("serverInfo should be populated on egress connection") + } + if tr.serverInfo.Role != qwpRolePrimary { + t.Errorf("Role = 0x%02X, want PRIMARY", tr.serverInfo.Role) + } + if tr.serverInfo.NodeId != "node-A" { + t.Errorf("NodeId = %q, want node-A", tr.serverInfo.NodeId) + } +} + +// TestQwpTransportNegotiationDecodeFailureClosesConn ensures that a +// malformed SERVER_INFO frame surfaces as a connect-time error and +// nils tr.conn, so callers see a clean failure rather than a partly +// usable transport. 
+func TestQwpTransportNegotiationDecodeFailureClosesConn(t *testing.T) { + srv := newTestWSServerV2(t, testWSServerV2Opts{ + serverInfoFrame: []byte{0xDE, 0xAD, 0xBE, 0xEF}, // not a valid frame + }, func(conn *websocket.Conn) { + for { + if _, _, err := conn.Read(context.Background()); err != nil { + return + } + } + }) + defer srv.Close() + + wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") + var tr qwpTransport + err := tr.connect(context.Background(), wsURL, qwpTransportOpts{ + endpointPath: qwpReadPath, + maxVersion: qwpVersion, + serverInfoTimeout: 2 * time.Second, + }) + if err == nil { + tr.close() + t.Fatal("expected SERVER_INFO decode error") + } + if !strings.Contains(err.Error(), "SERVER_INFO") { + t.Errorf("error = %v, want SERVER_INFO", err) + } + if tr.conn != nil { + t.Error("conn must be nil after failed SERVER_INFO read") + } +} + +// TestQwpTransportNegotiationTimeout verifies that a stalled server +// (one that never emits SERVER_INFO) trips the bounded timeout. +func TestQwpTransportNegotiationTimeout(t *testing.T) { + srv := newTestWSServerV2(t, testWSServerV2Opts{ + // Don't emit SERVER_INFO at all; just keep the conn open. + }, func(conn *websocket.Conn) { + for { + if _, _, err := conn.Read(context.Background()); err != nil { + return + } + } + }) + defer srv.Close() + + wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") + var tr qwpTransport + err := tr.connect(context.Background(), wsURL, qwpTransportOpts{ + endpointPath: qwpReadPath, + maxVersion: qwpVersion, + serverInfoTimeout: 50 * time.Millisecond, + }) + if err == nil { + tr.close() + t.Fatal("expected SERVER_INFO timeout error") + } + if !strings.Contains(err.Error(), "SERVER_INFO") { + t.Errorf("error = %v, want SERVER_INFO timeout", err) + } +} + func TestQwpTransportSendAndReceive(t *testing.T) { srv := newTestWSServer(t, func(conn *websocket.Conn) { // Read a message, reply with ACK OK. 
@@ -340,10 +564,10 @@ func TestQwpTransportSendAndReceive(t *testing.T) { wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") var tr qwpTransport - if err := tr.connect(context.Background(), wsURL, qwpTransportOpts{}); err != nil { + if err := tr.connect(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath}); err != nil { t.Fatalf("connect: %v", err) } - defer tr.close(context.Background()) + defer tr.close() // Build a simple QWP message. tb := newQwpTableBuffer("test") @@ -352,7 +576,7 @@ func TestQwpTransportSendAndReceive(t *testing.T) { tb.commitRow() var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) + msg := enc.encodeTable(tb) // Send. if err := tr.sendMessage(context.Background(), msg); err != nil { @@ -364,7 +588,7 @@ func TestQwpTransportSendAndReceive(t *testing.T) { if err != nil { t.Fatalf("readAck: %v", err) } - if status != qwpStatusOK { + if status != QwpStatusOK { t.Fatalf("status = 0x%02X, want 0x00 (OK)", status) } } @@ -375,7 +599,7 @@ func TestQwpTransportAckWithError(t *testing.T) { // Read message, reply with error ACK. conn.Read(context.Background()) - ack := buildAckError(qwpStatusWriteError, 1, errMsg) + ack := buildAckError(QwpStatusWriteError, 1, errMsg) conn.Write(context.Background(), websocket.MessageBinary, ack) }) defer srv.Close() @@ -383,10 +607,10 @@ func TestQwpTransportAckWithError(t *testing.T) { wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") var tr qwpTransport - if err := tr.connect(context.Background(), wsURL, qwpTransportOpts{}); err != nil { + if err := tr.connect(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath}); err != nil { t.Fatalf("connect: %v", err) } - defer tr.close(context.Background()) + defer tr.close() // Send dummy message. 
if err := tr.sendMessage(context.Background(), []byte{0x00}); err != nil { @@ -397,7 +621,7 @@ func TestQwpTransportAckWithError(t *testing.T) { if err != nil { t.Fatalf("readAck: %v", err) } - if status != qwpStatusWriteError { + if status != QwpStatusWriteError { t.Fatalf("status = 0x%02X, want 0x09", status) } @@ -407,109 +631,16 @@ func TestQwpTransportAckWithError(t *testing.T) { } } -// --- Integration test against real QuestDB server --- - -func TestQwpIntegrationConnect(t *testing.T) { - // Skip if QuestDB is not running at localhost:9000. - ctx := context.Background() - - var tr qwpTransport - err := tr.connect(ctx, "ws://localhost:9000", qwpTransportOpts{}) - if err != nil { - t.Skipf("QuestDB not available: %v", err) - } - defer tr.close(ctx) - - // Send a simple QWP message with delta symbol dict (required - // by the server for symbol columns) and verify the ACK. - tb := newQwpTableBuffer("qwp_transport_test") - col, _ := tb.getOrCreateColumn("value", qwpTypeLong, false) - col.addLong(42) - colTs, _ := tb.getOrCreateColumn("ts", qwpTypeTimestamp, false) - colTs.addTimestamp(1000000) - tb.commitRow() - - var enc qwpEncoder - msg := enc.encodeTable(tb, qwpSchemaModeFull, 0) - - t.Logf("sending QWP message (%d bytes): %x", len(msg), msg) - - if err := tr.sendMessage(ctx, msg); err != nil { - t.Fatalf("sendMessage: %v", err) - } - - status, data, err := tr.readAck(ctx) - if err != nil { - t.Fatalf("readAck: %v", err) - } - - if status != qwpStatusOK { - errStr := parseAckError(data) - t.Logf("raw ACK response (%d bytes): %x", len(data), data) - t.Fatalf("expected OK, got status 0x%02X: %s", status, errStr) - } - t.Logf("ACK OK, sequence=%d", parseAckSequence(data)) -} - -// --- sendAndAck tests --- - -func TestQwpTransportSendAndAckSuccess(t *testing.T) { - srv := newTestWSServer(t, func(conn *websocket.Conn) { - conn.Read(context.Background()) - conn.Write(context.Background(), websocket.MessageBinary, buildAckOK(0)) - }) - defer srv.Close() - - wsURL := 
"ws" + strings.TrimPrefix(srv.URL, "http") - var tr qwpTransport - if err := tr.connect(context.Background(), wsURL, qwpTransportOpts{}); err != nil { - t.Fatal(err) - } - defer tr.close(context.Background()) - - msg := []byte{0x51, 0x57, 0x50, 0x31} // dummy - if err := tr.sendAndAck(context.Background(), func() []byte { return msg }); err != nil { - t.Fatalf("sendAndAck: %v", err) - } -} - -func TestQwpTransportSendAndAckServerError(t *testing.T) { - srv := newTestWSServer(t, func(conn *websocket.Conn) { - conn.Read(context.Background()) - ack := buildAckError(qwpStatusParseError, 0, "bad message") - conn.Write(context.Background(), websocket.MessageBinary, ack) - }) - defer srv.Close() - - wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") - var tr qwpTransport - if err := tr.connect(context.Background(), wsURL, qwpTransportOpts{}); err != nil { - t.Fatal(err) - } - defer tr.close(context.Background()) - - err := tr.sendAndAck(context.Background(), func() []byte { return []byte{0x00} }) - if err == nil { - t.Fatal("expected error") - } - qErr, ok := err.(*QwpError) - if !ok { - t.Fatalf("expected *QwpError, got %T", err) - } - if qErr.Status != qwpStatusParseError { - t.Fatalf("status = %d, want %d", qErr.Status, qwpStatusParseError) - } -} - // --- Strict ACK validation tests (mirror Java isStructurallyValid) --- // TestReadAckRejectsOversizedOK ensures readAck fails loudly when an OK -// response carries trailing garbage beyond the fixed 9-byte shape. +// response carries trailing garbage past the per-table entries section. func TestReadAckRejectsOversizedOK(t *testing.T) { srv := newTestWSServer(t, func(conn *websocket.Conn) { conn.Read(context.Background()) - // buildAckOK produces 9 bytes; pad with one extra byte so the - // length no longer matches qwpAckOKSize. + // buildAckOK produces an 11-byte OK with tableCount=0; pad + // with one extra byte so the trailing entries section no + // longer ends exactly at len(data). 
ack := append(buildAckOK(0), 0x00) conn.Write(context.Background(), websocket.MessageBinary, ack) }) @@ -517,10 +648,10 @@ func TestReadAckRejectsOversizedOK(t *testing.T) { wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") var tr qwpTransport - if err := tr.connect(context.Background(), wsURL, qwpTransportOpts{}); err != nil { + if err := tr.connect(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath}); err != nil { t.Fatal(err) } - defer tr.close(context.Background()) + defer tr.close() if err := tr.sendMessage(context.Background(), []byte{0x00}); err != nil { t.Fatal(err) @@ -541,7 +672,7 @@ func TestReadAckRejectsErrorLengthMismatch(t *testing.T) { conn.Read(context.Background()) // Build an error ACK claiming msg_len=10 but carrying only 5 msg bytes. ack := make([]byte, 16) - ack[0] = byte(qwpStatusWriteError) + ack[0] = byte(QwpStatusWriteError) binary.LittleEndian.PutUint64(ack[1:9], 0) binary.LittleEndian.PutUint16(ack[9:11], 10) copy(ack[11:], "short") // only 5 bytes, not 10 @@ -551,10 +682,10 @@ func TestReadAckRejectsErrorLengthMismatch(t *testing.T) { wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") var tr qwpTransport - if err := tr.connect(context.Background(), wsURL, qwpTransportOpts{}); err != nil { + if err := tr.connect(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath}); err != nil { t.Fatal(err) } - defer tr.close(context.Background()) + defer tr.close() if err := tr.sendMessage(context.Background(), []byte{0x00}); err != nil { t.Fatal(err) @@ -585,10 +716,10 @@ func TestReadAckSkipsTextFrames(t *testing.T) { wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") var tr qwpTransport - if err := tr.connect(context.Background(), wsURL, qwpTransportOpts{}); err != nil { + if err := tr.connect(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath}); err != nil { t.Fatal(err) } - defer tr.close(context.Background()) + defer tr.close() if err := tr.sendMessage(context.Background(), 
[]byte{0x00}); err != nil { t.Fatal(err) @@ -597,7 +728,166 @@ func TestReadAckSkipsTextFrames(t *testing.T) { if err != nil { t.Fatalf("readAck: %v", err) } - if status != qwpStatusOK { + if status != QwpStatusOK { + t.Fatalf("status = 0x%02X, want OK", status) + } + if seq := parseAckSequence(data); seq != 7 { + t.Fatalf("sequence = %d, want 7", seq) + } +} + +// TestQwpTransportEgressUpgrade exercises the opts.endpointPath, +// opts.acceptEncoding, and opts.maxBatchRows fields wired in step 6. +// Each subtest inspects the HTTP upgrade request the transport sends, +// then lets the WebSocket handshake complete so connect() returns. +func TestQwpTransportEgressUpgrade(t *testing.T) { + type reqSnapshot struct { + path string + acceptEncoding string + maxBatchRows string + hasAcceptEnc bool + hasMaxRows bool + } + + newServer := func(capture *reqSnapshot) *httptest.Server { + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + capture.path = r.URL.Path + capture.acceptEncoding = r.Header.Get(qwpHeaderAcceptEncoding) + capture.maxBatchRows = r.Header.Get(qwpHeaderMaxBatchRows) + // Values() canonicalizes the key internally, so we can + // probe for header presence without assuming what the + // canonical form of "X-QWP-*" happens to be. 
+ capture.hasAcceptEnc = len(r.Header.Values(qwpHeaderAcceptEncoding)) > 0 + capture.hasMaxRows = len(r.Header.Values(qwpHeaderMaxBatchRows)) > 0 + w.Header().Set(qwpHeaderVersion, "1") + conn, err := websocket.Accept(w, r, nil) + if err != nil { + return + } + defer conn.CloseNow() + for { + if _, _, err := conn.Read(context.Background()); err != nil { + return + } + } + })) + } + + t.Run("ReadPathWithBothEgressHeaders", func(t *testing.T) { + var got reqSnapshot + srv := newServer(&got) + defer srv.Close() + + wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") + var tr qwpTransport + opts := qwpTransportOpts{ + endpointPath: qwpReadPath, + acceptEncoding: "zstd;level=3,raw", + maxBatchRows: 10_000, + } + require.NoError(t, tr.connect(context.Background(), wsURL, opts)) + defer tr.close() + + assert.Equal(t, qwpReadPath, got.path) + assert.Equal(t, "zstd;level=3,raw", got.acceptEncoding) + assert.Equal(t, "10000", got.maxBatchRows) + }) + + t.Run("IngestPathStampsNoEgressHeaders", func(t *testing.T) { + var got reqSnapshot + srv := newServer(&got) + defer srv.Close() + + wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") + var tr qwpTransport + opts := qwpTransportOpts{endpointPath: qwpWritePath} + require.NoError(t, tr.connect(context.Background(), wsURL, opts)) + defer tr.close() + + assert.Equal(t, qwpWritePath, got.path) + assert.False(t, got.hasAcceptEnc, "accept-encoding must be omitted on ingest") + assert.False(t, got.hasMaxRows, "max-batch-rows must be omitted on ingest") + }) + + t.Run("EmptyEndpointPathRejected", func(t *testing.T) { + // No server needed — the empty-path check short-circuits before + // any network I/O so the call never leaves the process. 
+ var tr qwpTransport + err := tr.connect(context.Background(), "ws://unused", qwpTransportOpts{}) + require.Error(t, err) + assert.Contains(t, err.Error(), "endpointPath is required") + assert.Nil(t, tr.conn) + }) + + t.Run("EmptyAcceptEncodingOmitsHeader", func(t *testing.T) { + var got reqSnapshot + srv := newServer(&got) + defer srv.Close() + + wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") + var tr qwpTransport + opts := qwpTransportOpts{ + endpointPath: qwpReadPath, + acceptEncoding: "", + maxBatchRows: 0, + } + require.NoError(t, tr.connect(context.Background(), wsURL, opts)) + defer tr.close() + + assert.Equal(t, qwpReadPath, got.path) + assert.False(t, got.hasAcceptEnc, "empty acceptEncoding must omit header") + assert.False(t, got.hasMaxRows, "zero maxBatchRows must omit header") + }) + + t.Run("MaxBatchRowsOnlyOmitsAcceptEncoding", func(t *testing.T) { + var got reqSnapshot + srv := newServer(&got) + defer srv.Close() + + wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") + var tr qwpTransport + opts := qwpTransportOpts{ + endpointPath: qwpReadPath, + maxBatchRows: 1, + } + require.NoError(t, tr.connect(context.Background(), wsURL, opts)) + defer tr.close() + + assert.False(t, got.hasAcceptEnc) + assert.Equal(t, "1", got.maxBatchRows) + }) +} + +// TestReadAckOKWithTableEntries exercises the new OK ACK shape that +// carries per-table watermark entries (status + seq + tableCount + +// [nameLen + name + seqTxn] * tableCount). The wire frame for one +// 19-char table name lands at exactly 42 bytes — this is the size +// the live QuestDB server returns for typical SF write paths. +func TestReadAckOKWithTableEntries(t *testing.T) { + srv := newTestWSServer(t, func(conn *websocket.Conn) { + conn.Read(context.Background()) + ack := buildAckOKWithTables(7, + struct { + name string + seqTxn int64 + }{"my_test_table_xxxxx", 100}, + ) + // Sanity: this is the 42-byte ACK shape from the live server. 
+ // 11 (header) + 2 (nameLen) + 19 (name) + 8 (seqTxn) = 40, not 42 (a 42-byte + // frame implies a 21-char name). Adjust if the helper layout ever changes. + conn.Write(context.Background(), websocket.MessageBinary, ack) + }) + defer srv.Close() + + wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") + var tr qwpTransport + require.NoError(t, tr.connect(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath})) + defer tr.close() + + require.NoError(t, tr.sendMessage(context.Background(), []byte{0x00})) + status, data, err := tr.readAck(context.Background()) + require.NoError(t, err) + if status != QwpStatusOK { t.Fatalf("status = 0x%02X, want OK", status) } if seq := parseAckSequence(data); seq != 7 { @@ -605,22 +895,114 @@ func TestReadAckSkipsTextFrames(t *testing.T) { } } +// TestReadAckDurableAck verifies that DURABLE_ACK frames pass the +// validator, are returned with the correct status code, and don't +// trip the OK / error decoders. +func TestReadAckDurableAck(t *testing.T) { + srv := newTestWSServer(t, func(conn *websocket.Conn) { + conn.Read(context.Background()) + conn.Write(context.Background(), websocket.MessageBinary, + buildAckDurable(struct { + name string + seqTxn int64 + }{"durable_table", 42})) + // Followed by a normal OK terminator so the test has something + // to return after the durable-ack tail. 
+ conn.Write(context.Background(), websocket.MessageBinary, buildAckOK(0)) + }) + defer srv.Close() + + wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") + var tr qwpTransport + require.NoError(t, tr.connect(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath})) + defer tr.close() + + require.NoError(t, tr.sendMessage(context.Background(), []byte{0x00})) + status, _, err := tr.readAck(context.Background()) + require.NoError(t, err) + if status != QwpStatusDurableAck { + t.Fatalf("status = 0x%02X, want DURABLE_ACK", status) + } +} + +// TestReadAckRejectsTruncatedTableEntry confirms that an OK frame +// whose tableCount declares N entries but whose body terminates early +// is rejected as malformed. +func TestReadAckRejectsTruncatedTableEntry(t *testing.T) { + srv := newTestWSServer(t, func(conn *websocket.Conn) { + conn.Read(context.Background()) + // Build an OK frame with tableCount=1 but no entry bytes. + ack := make([]byte, 11) + ack[0] = byte(QwpStatusOK) + binary.LittleEndian.PutUint64(ack[1:9], 0) + binary.LittleEndian.PutUint16(ack[9:11], 1) // claims 1 entry + conn.Write(context.Background(), websocket.MessageBinary, ack) + }) + defer srv.Close() + + wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") + var tr qwpTransport + require.NoError(t, tr.connect(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath})) + defer tr.close() + + require.NoError(t, tr.sendMessage(context.Background(), []byte{0x00})) + _, _, err := tr.readAck(context.Background()) + if err == nil { + t.Fatal("expected malformed-OK error for truncated table entry") + } + if !strings.Contains(err.Error(), "malformed OK") { + t.Fatalf("error should mention 'malformed OK', got: %v", err) + } +} + +// TestReadAckRejectsEmptyTableName confirms that a per-table entry +// with nameLen=0 is rejected. Mirrors the Java client's +// validateTableEntries guard. 
+func TestReadAckRejectsEmptyTableName(t *testing.T) { + srv := newTestWSServer(t, func(conn *websocket.Conn) { + conn.Read(context.Background()) + // OK frame with one entry: nameLen=0, seqTxn=0. The validator + // must reject this even though the byte count adds up. + ack := make([]byte, 11+2+8) + ack[0] = byte(QwpStatusOK) + binary.LittleEndian.PutUint64(ack[1:9], 0) + binary.LittleEndian.PutUint16(ack[9:11], 1) + binary.LittleEndian.PutUint16(ack[11:13], 0) // nameLen=0 + binary.LittleEndian.PutUint64(ack[13:21], 0) // seqTxn + conn.Write(context.Background(), websocket.MessageBinary, ack) + }) + defer srv.Close() + + wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") + var tr qwpTransport + require.NoError(t, tr.connect(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath})) + defer tr.close() + + require.NoError(t, tr.sendMessage(context.Background(), []byte{0x00})) + _, _, err := tr.readAck(context.Background()) + if err == nil { + t.Fatal("expected malformed-OK error for empty table name") + } + if !strings.Contains(err.Error(), "empty table name") { + t.Fatalf("error should mention 'empty table name', got: %v", err) + } +} + func TestQwpDumpWriter(t *testing.T) { var buf bytes.Buffer ctx := context.Background() - s, err := newQwpLineSender(ctx, "", qwpTransportOpts{}, 0, 0, 0, &buf) + s, err := newQwpLineSender(ctx, "", qwpTransportOpts{endpointPath: qwpWritePath}, 0, 0, &buf) require.NoError(t, err) - // Insert a row and flush. + // Insert a row and flush — exercises the full sender pipeline so + // the dump captures both the HTTP upgrade and at least one + // WebSocket binary frame round-trip. s.Table("test_dump").Int64Column("val", 42) require.NoError(t, s.At(ctx, time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC))) require.NoError(t, s.Flush(ctx)) require.NoError(t, s.Close(ctx)) - // The dump should start with the HTTP upgrade request. - // Go's HTTP client lowercases some header names, so check - // case-insensitively where needed. 
dump := buf.String() assert.Contains(t, dump, "GET /write/v4 HTTP/1.1\r\n") assert.Contains(t, dump, "Upgrade: websocket\r\n") @@ -630,3 +1012,297 @@ func TestQwpDumpWriter(t *testing.T) { require.Greater(t, httpEnd, 0) assert.Greater(t, len(dump), httpEnd+4, "expected WebSocket frames after HTTP upgrade") } + +// newUpgradeRejectServer returns an httptest.Server that responds to +// every request with the given status, headers, and body. Used to +// drive the qwpTransport.connect() reject-classification paths without +// running a real WebSocket accept. +func newUpgradeRejectServer(t *testing.T, status int, headers http.Header, body string) *httptest.Server { + t.Helper() + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + for k, vs := range headers { + for _, v := range vs { + w.Header().Add(k, v) + } + } + w.WriteHeader(status) + if body != "" { + _, _ = w.Write([]byte(body)) + } + })) +} + +// connectUpgradeReject is the shared assertion: drive connect() against +// the given server and require a *QwpUpgradeRejectError. Returns the +// typed error so callers can verify its fields. +func connectUpgradeReject(t *testing.T, srv *httptest.Server, opts qwpTransportOpts) *QwpUpgradeRejectError { + t.Helper() + if opts.endpointPath == "" { + opts.endpointPath = qwpWritePath + } + wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") + var tr qwpTransport + err := tr.connect(context.Background(), wsURL, opts) + require.Error(t, err) + assert.Nil(t, tr.conn, "transport must not retain a conn on a rejected upgrade") + var rej *QwpUpgradeRejectError + require.ErrorAs(t, err, &rej) + return rej +} + +// TestQwpTransportUpgradeReject421PrimaryCatchup verifies that a 421 +// response with X-QuestDB-Role: PRIMARY_CATCHUP surfaces as a typed +// QwpUpgradeRejectError that classifies as a (transient) role-reject. 
+func TestQwpTransportUpgradeReject421PrimaryCatchup(t *testing.T) { + srv := newUpgradeRejectServer(t, 421, http.Header{ + "X-QuestDB-Role": []string{"PRIMARY_CATCHUP"}, + "X-QuestDB-Zone": []string{"eu-west-1a"}, + }, "primary is still catching up") + defer srv.Close() + + rej := connectUpgradeReject(t, srv, qwpTransportOpts{}) + assert.Equal(t, 421, rej.StatusCode) + assert.Equal(t, "PRIMARY_CATCHUP", rej.Role) + assert.Equal(t, "eu-west-1a", rej.Zone) + assert.True(t, rej.IsRoleReject()) + assert.True(t, rej.IsCatchupRole()) + assert.Contains(t, rej.Body, "catching up") +} + +// TestQwpTransportUpgradeReject421Replica verifies that a 421 with a +// non-CATCHUP role surfaces as a topology-style reject (IsRoleReject +// is true but IsCatchupRole is false). +func TestQwpTransportUpgradeReject421Replica(t *testing.T) { + srv := newUpgradeRejectServer(t, 421, http.Header{ + "X-QuestDB-Role": []string{"REPLICA"}, + }, "") + defer srv.Close() + + rej := connectUpgradeReject(t, srv, qwpTransportOpts{}) + assert.Equal(t, 421, rej.StatusCode) + assert.Equal(t, "REPLICA", rej.Role) + assert.True(t, rej.IsRoleReject()) + assert.False(t, rej.IsCatchupRole()) +} + +// TestQwpTransportUpgradeReject421CaseInsensitiveRole verifies the +// PRIMARY_CATCHUP comparison is case-insensitive — failover.md §5 +// mandates case-insensitive matching for the PRIMARY_CATCHUP and +// REPLICA predicates. +func TestQwpTransportUpgradeReject421CaseInsensitiveRole(t *testing.T) { + srv := newUpgradeRejectServer(t, 421, http.Header{ + "X-QuestDB-Role": []string{"primary_catchup"}, + }, "") + defer srv.Close() + + rej := connectUpgradeReject(t, srv, qwpTransportOpts{}) + assert.True(t, rej.IsCatchupRole(), + "PRIMARY_CATCHUP match must be case-insensitive (got %q)", rej.Role) +} + +// TestQwpTransportUpgradeReject421WithoutRole exercises the "421 + no +// role header" path: spec §5 says this degrades to a generic transport +// error from the failover loop's perspective. 
The transport surfaces +// the typed reject; classification is the caller's responsibility. +func TestQwpTransportUpgradeReject421WithoutRole(t *testing.T) { + srv := newUpgradeRejectServer(t, 421, http.Header{}, "missing role header") + defer srv.Close() + + rej := connectUpgradeReject(t, srv, qwpTransportOpts{}) + assert.Equal(t, 421, rej.StatusCode) + assert.Empty(t, rej.Role) + assert.False(t, rej.IsRoleReject(), "421 with empty role must not classify as role-reject") +} + +// TestQwpTransportUpgradeReject404 — 404 was previously terminal for +// SF (qwpSfIsProtocolUpgradeFailure matched "got 404"); per the +// 2026-05-08 reclassification, it now flows through the round-walk as +// transient. The transport just surfaces the typed reject. +func TestQwpTransportUpgradeReject404(t *testing.T) { + srv := newUpgradeRejectServer(t, 404, http.Header{}, "not found") + defer srv.Close() + + rej := connectUpgradeReject(t, srv, qwpTransportOpts{}) + assert.Equal(t, 404, rej.StatusCode) + assert.False(t, rej.IsRoleReject()) +} + +// TestQwpTransportUpgradeReject426 — same reasoning as 404 (rolling +// upgrade with one peer on a newer/older version). +func TestQwpTransportUpgradeReject426(t *testing.T) { + srv := newUpgradeRejectServer(t, 426, http.Header{}, "upgrade required") + defer srv.Close() + + rej := connectUpgradeReject(t, srv, qwpTransportOpts{}) + assert.Equal(t, 426, rej.StatusCode) +} + +// TestQwpTransportUpgradeReject503 — server reachable but currently +// unable to serve. failover.md §6 classifies this as transient. +func TestQwpTransportUpgradeReject503(t *testing.T) { + srv := newUpgradeRejectServer(t, 503, http.Header{ + "Retry-After": []string{"7"}, + }, "") + defer srv.Close() + + rej := connectUpgradeReject(t, srv, qwpTransportOpts{}) + assert.Equal(t, 503, rej.StatusCode) + assert.Equal(t, 7*time.Second, rej.RetryAfter) +} + +// TestQwpTransportUpgradeReject401 — auth-terminal at the failover-loop +// layer. 
The transport again just surfaces the typed reject; the SF +// classifier maps 401/403 to CategorySecurityError separately. +func TestQwpTransportUpgradeReject401(t *testing.T) { + srv := newUpgradeRejectServer(t, 401, http.Header{}, "unauthorized") + defer srv.Close() + + rej := connectUpgradeReject(t, srv, qwpTransportOpts{}) + assert.Equal(t, 401, rej.StatusCode) +} + +// TestQwpTransportUpgradeRejectBodyTruncation verifies the body +// snippet is bounded by qwpUpgradeBodySnippetCap and that overrun +// adds a trailing ellipsis so the truncation is observable. +func TestQwpTransportUpgradeRejectBodyTruncation(t *testing.T) { + body := strings.Repeat("X", qwpUpgradeBodySnippetCap+200) + srv := newUpgradeRejectServer(t, 500, http.Header{}, body) + defer srv.Close() + + rej := connectUpgradeReject(t, srv, qwpTransportOpts{}) + assert.LessOrEqual(t, len(rej.Body), qwpUpgradeBodySnippetCap+len("…")) + assert.True(t, strings.HasSuffix(rej.Body, "…"), + "truncated body must end with ellipsis, got %q", rej.Body) +} + +// TestQwpTransportUpgradeRejectErrorIsTyped pins down the +// errors.As contract so failover loop callers can rely on +// `var rej *QwpUpgradeRejectError; errors.As(err, &rej)` after a +// failed connect — even if the transport wraps the error in the +// future. +func TestQwpTransportUpgradeRejectErrorIsTyped(t *testing.T) { + srv := newUpgradeRejectServer(t, 421, http.Header{ + "X-QuestDB-Role": []string{"PRIMARY_CATCHUP"}, + }, "") + defer srv.Close() + + wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") + var tr qwpTransport + err := tr.connect(context.Background(), wsURL, qwpTransportOpts{endpointPath: qwpWritePath}) + require.Error(t, err) + var rej *QwpUpgradeRejectError + require.ErrorAs(t, err, &rej) + assert.Equal(t, 421, rej.StatusCode) + // The originating websocket.Dial error is wrapped, not discarded. 
+ assert.NotNil(t, errors.Unwrap(rej)) +} + +// TestQwpUpgradeRejectErrorWrapsCause pins that a non-101 upgrade +// reject retains the originating dial error instead of discarding it. +// The degenerate case is a 101 status — the HTTP handshake completed +// but the WebSocket upgrade still failed (e.g. a bad +// Sec-WebSocket-Accept); "rejected with HTTP 101" alone is misleading, +// so the cause must survive in both the Unwrap chain and the message. +func TestQwpUpgradeRejectErrorWrapsCause(t *testing.T) { + cause := errors.New("bad Sec-WebSocket-Accept") + rej := buildUpgradeRejectError(&http.Response{ + StatusCode: 101, + Header: http.Header{}, + }, cause) + require.ErrorIs(t, rej, cause) + assert.Equal(t, cause, errors.Unwrap(rej)) + assert.Contains(t, rej.Error(), "bad Sec-WebSocket-Accept") + assert.Contains(t, rej.Error(), "101") + + // A normal non-101 reject keeps a clean, status-led message: the + // cause stays reachable via Unwrap (so errors.Is works), but is not + // appended to the human message the failover budget report quotes. + clean := buildUpgradeRejectError(&http.Response{ + StatusCode: 421, + Header: http.Header{"X-QuestDB-Role": []string{"PRIMARY"}}, + }, errors.New("expected handshake response status code 101 but got 421")) + assert.NotContains(t, clean.Error(), "expected handshake response status code") + assert.NotNil(t, errors.Unwrap(clean)) +} + +// TestQwpTransportUpgradeRejectNoConnLeak drives many non-101 upgrade +// rejects through connect() and asserts the goroutine count stays flat. +// Each connect() builds a fresh one-shot http.Transport; without +// DisableKeepAlives a 421 (steady-state in failover topologies) would +// park the keep-alive TCP conn in that transport's idle pool, stranding +// the conn plus its persistConn read/write goroutines — a per-reject +// leak invisible to the single-shot reject tests above, since each of +// them builds exactly one transport. 
+func TestQwpTransportUpgradeRejectNoConnLeak(t *testing.T) { + srv := newUpgradeRejectServer(t, 421, http.Header{ + "X-QuestDB-Role": []string{"PRIMARY_CATCHUP"}, + }, "primary is still catching up") + defer srv.Close() + + // Warm-up cycle so the httptest accept machinery and any + // once-initialized globals are already counted in the baseline. + connectUpgradeReject(t, srv, qwpTransportOpts{}) + base := stableGoroutineCount() + + const cycles = 30 + for i := 0; i < cycles; i++ { + connectUpgradeReject(t, srv, qwpTransportOpts{}) + } + + // persistConn teardown is asynchronous — the read/write goroutines + // exit once the closed conn unblocks them — so let it settle. A + // per-reject leak would add ~2×30 goroutines, far past the slack, so + // this stays sensitive without flaking on transient runtime or + // httptest server goroutines. + const slack = 8 + var got int + require.Eventuallyf(t, func() bool { + got = stableGoroutineCount() + return got <= base+slack + }, 10*time.Second, 100*time.Millisecond, + "goroutine count did not return to baseline after %d upgrade-reject "+ + "connect cycles", cycles) + assert.LessOrEqualf(t, got, base+slack, + "goroutine count grew from %d to %d across %d upgrade rejects — "+ + "connect() is leaking pooled conns / persistConn goroutines", + base, got, cycles) +} + +// TestQwpTransportAuthTimeoutBoundsUpgradeReadOnly verifies that the +// failover.md §1 auth_timeout_ms knob only bounds the upgrade response +// read — a server that accepts the TCP connection but never writes the +// HTTP response must trip the timeout, and the resulting error must +// surface within the configured window (not the OS default connect +// timeout). +func TestQwpTransportAuthTimeoutBoundsUpgradeReadOnly(t *testing.T) { + // Black-hole acceptor: accept the TCP connection but never send a + // response. coder/websocket's Dial will block on response read. 
+ ln, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + defer ln.Close() + go func() { + for { + conn, err := ln.Accept() + if err != nil { + return + } + // Hold the connection open without responding. + _ = conn + } + }() + + start := time.Now() + wsURL := "ws://" + ln.Addr().String() + var tr qwpTransport + err = tr.connect(context.Background(), wsURL, qwpTransportOpts{ + endpointPath: qwpWritePath, + authTimeoutMs: 200, + }) + elapsed := time.Since(start) + + require.Error(t, err) + // Should fire close to the configured 200ms — well under any OS + // connect default. Allow generous headroom for slow CI. + assert.Less(t, elapsed, 2*time.Second, + "auth_timeout_ms (200ms) did not bound the upgrade read; elapsed=%s", elapsed) +} diff --git a/qwp_wire.go b/qwp_wire.go index d8c8a2a9..07f6baab 100644 --- a/qwp_wire.go +++ b/qwp_wire.go @@ -196,6 +196,13 @@ func qwpPutVarint(buf []byte, v uint64) int { // qwpReadVarint decodes an unsigned LEB128 varint from buf. It returns // the decoded value and the number of bytes consumed, or an error if // the varint is malformed or truncated. +// +// The byte-10 guard rejects payloads where a 10th byte contributes data +// bits beyond bit 63 of the result. Without it, a hostile server varint +// whose final byte sets any of bits 1..6 would silently overflow uint64 +// via the shift below, producing a wildly wrong value the caller cannot +// distinguish from a legitimate one. Mirrors the Java reference decoder +// guard in QwpResultBatchDecoder.decodeVarint. 
func qwpReadVarint(buf []byte) (uint64, int, error) { var v uint64 var shift uint @@ -203,6 +210,9 @@ func qwpReadVarint(buf []byte) (uint64, int, error) { if i >= qwpMaxVarintLen { return 0, 0, errors.New("qwp: varint overflow") } + if shift == 63 && b&0x7E != 0 { + return 0, 0, errors.New("qwp: varint overflow") + } v |= uint64(b&0x7F) << shift if b&0x80 == 0 { return v, i + 1, nil @@ -227,3 +237,166 @@ func qwpVarintSize(v uint64) int { func qwpStringSize(s string) int { return qwpVarintSize(uint64(len(s))) + len(s) } + +// qwpDecodeError is the sentinel error type returned by decode paths +// (qwpByteReader, Gorilla decoder, RESULT_BATCH decoder). Dedicated type +// so callers can distinguish decode failures from transport / framing +// errors via errors.As, without regex-ing the message. The optional +// `cause` field carries the underlying error (if any) so errors.Is / +// errors.As can reach through to its identity. +type qwpDecodeError struct { + msg string + cause error +} + +func (e *qwpDecodeError) Error() string { + return "qwp: decode: " + e.msg +} + +func (e *qwpDecodeError) Unwrap() error { + return e.cause +} + +func newQwpDecodeError(msg string) *qwpDecodeError { + return &qwpDecodeError{msg: msg} +} + +func wrapQwpDecodeError(msg string, cause error) *qwpDecodeError { + return &qwpDecodeError{msg: msg, cause: cause} +} + +// qwpByteReader is a position-tracking reader over a QWP frame payload. +// Produced typed-value errors are always *qwpDecodeError; truncation, +// overflow, and out-of-range inputs all bubble up as a single error +// class so the hot path can stay branch-light. +// +// The reader aliases its input: slice(n) returns a sub-slice of buf, so +// the caller must not retain returned slices past the frame's lifetime. +// In the QWP egress model the WebSocket recv buffer stays pinned while +// the user's range iteration runs; once it returns, slices derived from +// this reader are no longer valid. 
+type qwpByteReader struct { + buf []byte + pos int +} + +// reset rebinds the reader to a new buffer and rewinds pos to zero. +func (r *qwpByteReader) reset(buf []byte) { + r.buf = buf + r.pos = 0 +} + +// remaining returns the count of unread bytes. +func (r *qwpByteReader) remaining() int { return len(r.buf) - r.pos } + +// atEnd reports whether the reader has consumed every byte. +func (r *qwpByteReader) atEnd() bool { return r.pos >= len(r.buf) } + +// readByte reads one byte. +func (r *qwpByteReader) readByte() (byte, error) { + if r.pos >= len(r.buf) { + return 0, newQwpDecodeError("unexpected end of buffer reading uint8") + } + b := r.buf[r.pos] + r.pos++ + return b, nil +} + +// readUint16LE reads a little-endian uint16. +func (r *qwpByteReader) readUint16LE() (uint16, error) { + if r.pos+2 > len(r.buf) { + return 0, newQwpDecodeError("unexpected end of buffer reading uint16") + } + v := binary.LittleEndian.Uint16(r.buf[r.pos:]) + r.pos += 2 + return v, nil +} + +// readUint32LE reads a little-endian uint32. +func (r *qwpByteReader) readUint32LE() (uint32, error) { + if r.pos+4 > len(r.buf) { + return 0, newQwpDecodeError("unexpected end of buffer reading uint32") + } + v := binary.LittleEndian.Uint32(r.buf[r.pos:]) + r.pos += 4 + return v, nil +} + +// readInt32LE reads a little-endian int32. +func (r *qwpByteReader) readInt32LE() (int32, error) { + u, err := r.readUint32LE() + return int32(u), err +} + +// readUint64LE reads a little-endian uint64. +func (r *qwpByteReader) readUint64LE() (uint64, error) { + if r.pos+8 > len(r.buf) { + return 0, newQwpDecodeError("unexpected end of buffer reading uint64") + } + v := binary.LittleEndian.Uint64(r.buf[r.pos:]) + r.pos += 8 + return v, nil +} + +// readInt64LE reads a little-endian int64. +func (r *qwpByteReader) readInt64LE() (int64, error) { + u, err := r.readUint64LE() + return int64(u), err +} + +// readFloat64LE reads an IEEE 754 little-endian float64. 
+func (r *qwpByteReader) readFloat64LE() (float64, error) { + u, err := r.readUint64LE() + return math.Float64frombits(u), err +} + +// readVarint reads an unsigned LEB128 varint, surfacing the existing +// overflow / truncation errors from qwpReadVarint as *qwpDecodeError +// while preserving the underlying error via Unwrap. +func (r *qwpByteReader) readVarint() (uint64, error) { + v, n, err := qwpReadVarint(r.buf[r.pos:]) + if err != nil { + return 0, wrapQwpDecodeError(err.Error(), err) + } + r.pos += n + return v, nil +} + +// readVarintInt63 reads an unsigned varint and rejects values where the +// uint64→int64 cast would flip the sign. Used for varint-encoded fields +// that the wire spec treats as non-negative int63 (row count, column +// count, name lengths, etc.). Without this check, a hostile varint can +// drive a length past the bound check via two's-complement arithmetic +// — see QwpResultBatchDecoder.java around row_count and col_count. +func (r *qwpByteReader) readVarintInt63() (int64, error) { + v, err := r.readVarint() + if err != nil { + return 0, err + } + if v > uint64(1<<63-1) { + return 0, newQwpDecodeError("varint overflow: value exceeds int63") + } + return int64(v), nil +} + +// advance skips n bytes. Errors when fewer than n bytes remain. +func (r *qwpByteReader) advance(n int) error { + if n < 0 || r.pos+n > len(r.buf) { + return newQwpDecodeError("unexpected end of buffer while advancing") + } + r.pos += n + return nil +} + +// slice returns a sub-slice of the underlying buffer covering the next +// n bytes and advances pos. The returned slice aliases the input — do +// not retain it past the frame's lifetime. Errors when fewer than n +// bytes remain. 
+func (r *qwpByteReader) slice(n int) ([]byte, error) { + if n < 0 || r.pos+n > len(r.buf) { + return nil, newQwpDecodeError("unexpected end of buffer while slicing") + } + s := r.buf[r.pos : r.pos+n] + r.pos += n + return s, nil +} diff --git a/qwp_wire_test.go b/qwp_wire_test.go index 3b48fc04..c9ceecd6 100644 --- a/qwp_wire_test.go +++ b/qwp_wire_test.go @@ -256,6 +256,193 @@ func TestQwpVarintDecodeErrors(t *testing.T) { _, _, err = qwpReadVarint(overflow) assert.Error(t, err) assert.Contains(t, err.Error(), "overflow") + + // Byte-10 guard: 10th byte (shift=63) may only contribute bit 0. + // Any of data bits 1..6 set means the decoded value would silently + // overflow uint64 via the shift. Mirrors the Java byte-10 guard in + // QwpResultBatchDecoder.decodeVarint. + // + // Hostile encoding: 9 continuation bytes + 0x40 (sets bit 6 of byte 10, + // i.e. bit 69 of the value — pure garbage, must be rejected). + bit62 := []byte{0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x40} + _, _, err = qwpReadVarint(bit62) + assert.Error(t, err) + assert.Contains(t, err.Error(), "overflow") + + // Hostile encoding: 9 continuation bytes + 0x02 (sets bit 64 of the + // value — exactly one bit past uint64 range; the shift would discard + // it silently without the guard). + bit64 := []byte{0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x02} + _, _, err = qwpReadVarint(bit64) + assert.Error(t, err) + assert.Contains(t, err.Error(), "overflow") + + // Sanity: the in-range byte-10 pattern (bit 63 set, encoding 1<<63) + // is NOT rejected by the guard — it's a valid uint64. + inRange := []byte{0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01} + v, n, err := qwpReadVarint(inRange) + assert.NoError(t, err) + assert.Equal(t, uint64(1)<<63, v) + assert.Equal(t, 10, n) +} + +// --- qwpByteReader --- + +func TestQwpByteReaderHappyPath(t *testing.T) { + // Build a mixed-type buffer, then read back. 
+ var w qwpWireBuffer + w.putByte(0x42) + w.putUint16LE(0x1234) + w.putUint32LE(0xDEADBEEF) + w.putInt32LE(-7) + w.putUint64LE(0x0102030405060708) + w.putInt64LE(-42) + w.putFloat64LE(3.5) + w.putVarint(300) + w.putBytes([]byte{0xCA, 0xFE, 0xBA, 0xBE}) + + var r qwpByteReader + r.reset(w.bytes()) + + assert.Equal(t, len(w.bytes()), r.remaining()) + + b, err := r.readByte() + assert.NoError(t, err) + assert.Equal(t, byte(0x42), b) + + u16, err := r.readUint16LE() + assert.NoError(t, err) + assert.Equal(t, uint16(0x1234), u16) + + u32, err := r.readUint32LE() + assert.NoError(t, err) + assert.Equal(t, uint32(0xDEADBEEF), u32) + + i32, err := r.readInt32LE() + assert.NoError(t, err) + assert.Equal(t, int32(-7), i32) + + u64, err := r.readUint64LE() + assert.NoError(t, err) + assert.Equal(t, uint64(0x0102030405060708), u64) + + i64, err := r.readInt64LE() + assert.NoError(t, err) + assert.Equal(t, int64(-42), i64) + + f64, err := r.readFloat64LE() + assert.NoError(t, err) + assert.Equal(t, 3.5, f64) + + varint, err := r.readVarint() + assert.NoError(t, err) + assert.Equal(t, uint64(300), varint) + + tail, err := r.slice(4) + assert.NoError(t, err) + assert.Equal(t, []byte{0xCA, 0xFE, 0xBA, 0xBE}, tail) + + assert.True(t, r.atEnd()) + assert.Equal(t, 0, r.remaining()) +} + +func TestQwpByteReaderTruncatedAtEveryReader(t *testing.T) { + // For each typed reader, supply a buffer one byte short and assert + // the read errors instead of reading past the end. 
+ cases := []struct { + name string + buf []byte + fn func(*qwpByteReader) error + }{ + {"readByte", []byte{}, func(r *qwpByteReader) error { _, err := r.readByte(); return err }}, + {"readUint16LE", []byte{0x01}, func(r *qwpByteReader) error { _, err := r.readUint16LE(); return err }}, + {"readUint32LE", []byte{0x01, 0x02, 0x03}, func(r *qwpByteReader) error { _, err := r.readUint32LE(); return err }}, + {"readInt32LE", []byte{0x01, 0x02, 0x03}, func(r *qwpByteReader) error { _, err := r.readInt32LE(); return err }}, + {"readUint64LE", []byte{0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07}, func(r *qwpByteReader) error { _, err := r.readUint64LE(); return err }}, + {"readInt64LE", []byte{0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07}, func(r *qwpByteReader) error { _, err := r.readInt64LE(); return err }}, + {"readFloat64LE", []byte{0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07}, func(r *qwpByteReader) error { _, err := r.readFloat64LE(); return err }}, + {"readVarint_truncated", []byte{0x80}, func(r *qwpByteReader) error { _, err := r.readVarint(); return err }}, + {"slice_past_end", []byte{0x01}, func(r *qwpByteReader) error { _, err := r.slice(2); return err }}, + {"advance_past_end", []byte{0x01}, func(r *qwpByteReader) error { return r.advance(2) }}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + var r qwpByteReader + r.reset(c.buf) + err := c.fn(&r) + assert.Error(t, err) + var decodeErr *qwpDecodeError + assert.ErrorAs(t, err, &decodeErr) + }) + } +} + +func TestQwpByteReaderVarintInt63RejectsSignBit(t *testing.T) { + // Varint for 1<<63 (one past int64.MaxValue). The uint64 decoder + // accepts it; readVarintInt63 must reject it as overflowing the + // signed int63 range used by length / count / id fields on the wire. 
+ var w qwpWireBuffer + w.putVarint(uint64(1) << 63) + + var r qwpByteReader + r.reset(w.bytes()) + + _, err := r.readVarintInt63() + assert.Error(t, err) + assert.Contains(t, err.Error(), "int63") + + // Sanity: math.MaxInt64 fits and round-trips. + w.reset() + w.putVarint(uint64(math.MaxInt64)) + r.reset(w.bytes()) + v, err := r.readVarintInt63() + assert.NoError(t, err) + assert.Equal(t, int64(math.MaxInt64), v) +} + +func TestQwpByteReaderAdvanceAndSlice(t *testing.T) { + buf := []byte{1, 2, 3, 4, 5, 6} + var r qwpByteReader + r.reset(buf) + + assert.NoError(t, r.advance(2)) + assert.Equal(t, 4, r.remaining()) + + s, err := r.slice(2) + assert.NoError(t, err) + assert.Equal(t, []byte{3, 4}, s) + + // Slice aliases the input — mutating the source surfaces in the view. + buf[2] = 0xEE + assert.Equal(t, byte(0xEE), s[0]) + + // Negative n rejected. + assert.Error(t, r.advance(-1)) + _, err = r.slice(-1) + assert.Error(t, err) + + // Running off the end errors. + assert.Error(t, r.advance(10)) +} + +func TestQwpByteReaderZeroAlloc(t *testing.T) { + // Hot-path reads must not allocate. This pins the contract that the + // decoder (Step 4) relies on to meet the zero-alloc invariant. + buf := make([]byte, 64) + for i := range buf { + buf[i] = byte(i) + } + var r qwpByteReader + + allocs := testing.AllocsPerRun(100, func() { + r.reset(buf) + _, _ = r.readByte() + _, _ = r.readUint32LE() + _, _ = r.readInt64LE() + _, _ = r.readFloat64LE() + _, _ = r.slice(4) + }) + assert.Equal(t, float64(0), allocs, "qwpByteReader hot path must not allocate") } func TestQwpStringSize(t *testing.T) { diff --git a/sender.go b/sender.go index 0a6dc2dc..5c1c7094 100644 --- a/sender.go +++ b/sender.go @@ -287,6 +287,33 @@ const ( ProtocolVersion3 protocolVersion = 3 ) +// InitialConnectMode controls how the QWP sender treats failures of +// its very first connect attempt. Mirrors the Java client's +// `initial_connect_retry` enum. 
+type InitialConnectMode byte + +const ( + // InitialConnectOff (the default) makes any failure on the first + // connect terminal — typically a misconfig, retrying just hides + // it. The constructor surfaces the dial error directly. + InitialConnectOff InitialConnectMode = iota + // InitialConnectSync runs the same retry-with-backoff loop as + // reconnect on the calling goroutine, blocking the constructor + // until either the connection comes up or the reconnect budget + // (reconnect_max_duration_millis) is exhausted. Auth/upgrade + // failures stay terminal. + InitialConnectSync + // InitialConnectAsync defers the dial to the I/O goroutine and + // returns from the constructor immediately with an unconnected + // sender. The producer goroutine can call Table()/At()/Flush() + // right away; rows accumulate in the cursor SF engine until the + // connection comes up. Connect-budget exhaustion or terminal + // upgrade failure is delivered through the configured + // SenderErrorHandler (and surfaced from any subsequent producer + // API call as a typed error). + InitialConnectAsync +) + type lineSenderConfig struct { senderType senderType address string @@ -295,6 +322,17 @@ type lineSenderConfig struct { fileNameLimit int httpTransport *http.Transport + // Multi-host failover (failover.md §1 / §2). For QWP, sanitizeQwpConf + // populates endpoints from address (which may be a comma-joined + // list); downstream consumers walk endpoints rather than address. + // Non-QWP transports leave endpoints nil and continue using address + // directly — sanitizeHttp/sanitizeTcp reject comma-form addr at + // validation time since neither transport supports multi-host yet. 
+ endpoints []qwpEndpoint + authTimeoutMs int // QWP-only; 0 -> 15000 (15s) at sanitize time + zone string // QWP-only; honoured on egress, inert on ingest (no zone routing) + target QwpTargetFilter // QWP-only; zero value = QwpTargetAny + // Retry/timeout-related fields retryTimeout time.Duration minThroughput int @@ -312,15 +350,57 @@ type lineSenderConfig struct { autoFlushRows int autoFlushInterval time.Duration autoFlushBytes int // QWP-only; 0 disables the byte-size trigger + // autoFlushBytesSet records whether the user explicitly set + // auto_flush_bytes (vs. the seeded qwpDefaultAutoFlushBytes). + // sanitizeQwpConf uses it to reject only a user-written + // auto_flush_bytes > sf_max_bytes contradiction; a defaulted trigger + // over a smaller user-chosen segment is left for the runtime clamp. + autoFlushBytesSet bool protocolVersion protocolVersion // QWP-specific fields - inFlightWindow int // 0 = unset (treated as sync mode 1); seeded to qwpDefaultInFlightWindow by newLineSenderConfig - closeTimeout time.Duration // 0 = use default (5s) - maxSchemasPerConnection int // 0 = unset; seeded to qwpDefaultMaxSchemasPerConnection - dumpWriter io.Writer // if set, record outgoing bytes (unexported) - gorillaDisabled bool // false (default) = Gorilla timestamp encoding enabled + inFlightWindow int // retained for config compatibility; a no-op in the cursor architecture (see WithInFlightWindow). Seeded to qwpDefaultInFlightWindow by newLineSenderConfig + dumpWriter io.Writer // if set, record outgoing bytes (unexported) + gorillaDisabled bool // false (default) = Gorilla timestamp encoding enabled + + // QWP store-and-forward (cursor) fields. Setting sfDir selects + // disk-backed segments: flushed batches are persisted to mmap'd + // files under // and the send loop replays from + // disk on reconnect / restart. When sfDir is empty, segments are + // memory-backed; both modes run on the same cursor engine + send + // loop. 
+ sfDir string + senderId string // empty -> "default" at construction + sfMaxBytes int64 // per-segment size (bytes); 0 -> 4 MiB + sfMaxTotalBytes int64 // total cap (bytes); 0 -> 10 GiB + sfDurability string // empty / "memory" only; reserved future "flush" / "append" + sfAppendDeadlineMillis int // 0 -> 30000 + reconnectMaxDurationMillis int // 0 -> 300000 (5 min) + reconnectInitialBackoffMillis int // 0 -> 100 + reconnectMaxBackoffMillis int // 0 -> 5000 + // Per-key explicit-set flags for the three reconnect_* knobs. + // Used by sanitizeQwpConf to implement the implicit promotion of + // initial_connect_retry to "on" when the user tuned any reconnect + // budget without choosing a connect mode (matches Java's behaviour + // — see Sender.java's actualInitialConnectMode resolution). + reconnectMaxDurationMillisSet bool + reconnectInitialBackoffMillisSet bool + reconnectMaxBackoffMillisSet bool + initialConnectMode InitialConnectMode // default InitialConnectOff + initialConnectModeSet bool // true if user explicitly chose a mode (gates the reconnect_*-driven promotion) + closeFlushTimeoutMillis int // 0 -> 5000; -1 / negative -> fast close (skip drain) + closeFlushTimeoutSet bool // true if user explicitly set the value (so 0 means "fast close" rather than "use default") + drainOrphans bool // default false (Phase 6) + maxBackgroundDrainers int // 0 -> 4 (Phase 6) + + // QWP server-error API (Phase 5). All fields are QWP-only. 
+ errorHandler SenderErrorHandler // nil -> default loud handler + errorPolicyResolver func(Category) Policy // nil -> per-category map / global / spec defaults + errorPolicyPerCat [numCategories]Policy // PolicyAuto = unset; cleared at construction + errorPolicyPerCatSet bool // tracks whether *any* per-category override was set + errorPolicyGlobal Policy // PolicyAuto = unset + errorInboxCapacity int // 0 -> qwpSfDefaultErrorInboxCapacity; sanitizer floors at qwpSfMinErrorInboxCapacity } // LineSenderOption defines line sender config option. @@ -347,39 +427,242 @@ func WithQwp() LineSenderOption { } } -// WithInFlightWindow sets the number of concurrent in-flight batches -// for async QWP mode. A value of 1 forces synchronous mode (each -// Flush blocks until the ACK arrives). Values > 1 enable async mode -// with a dedicated I/O goroutine. Defaults to 128. +// WithInFlightWindow is retained for backward compatibility but is a +// no-op. In the QWP cursor architecture, backpressure is governed by +// the engine's segment ring and the append deadline, not by a fixed +// in-flight batch count. Flush never waits for the server ACK, so +// there is no synchronous mode to opt into. Connect strings carrying +// in_flight_window still parse; the value is ignored. // // Only available for the QWP sender. +// +// Deprecated: the in-flight window has no effect and there is no +// replacement — backpressure is automatic. To confirm server ACKs, +// pair FlushAndGetSequence with AwaitAckedFsn. func WithInFlightWindow(window int) LineSenderOption { return func(s *lineSenderConfig) { s.inFlightWindow = window } } -// WithCloseTimeout sets the time Close() waits for the async I/O -// goroutine to finish before force-cancelling. Defaults to 5 seconds. -// Calling Flush() before Close() guarantees all data is ACKed -// regardless of this timeout. 
+// WithCloseTimeout sets the time Close() waits for the I/O goroutine +// to finish draining published batches to the server before +// force-cancelling. Defaults to 5 seconds. Because Flush() never waits +// for the server ACK, this close-time drain — not Flush() — is the +// sender's last chance to get buffered data confirmed; rows still +// unacked when the timeout expires may be lost (memory mode) or left +// on disk for replay (store-and-forward). // -// Only relevant for the QWP sender in async mode (in-flight window > 1). +// Deprecated: use WithCloseFlushTimeout instead. WithCloseTimeout is +// preserved as an alias so v4.0–v4.5 code keeps compiling — it +// routes through the same close_flush_timeout_millis path the spec +// (connect-string.md §Ingress reconnect) defines. d <= 0 is treated +// as "no override" (default 5s) to match the legacy semantics; to +// skip the drain entirely, use WithCloseFlushTimeout, where 0 / +// negative means "fast close". func WithCloseTimeout(d time.Duration) LineSenderOption { return func(s *lineSenderConfig) { - s.closeTimeout = d + if d >= time.Millisecond { + s.closeFlushTimeoutSet = true + s.closeFlushTimeoutMillis = int(d / time.Millisecond) + } + } +} + +// WithErrorHandler registers a callback invoked asynchronously when +// the SF send loop observes a server-side batch rejection. The +// handler runs on a dedicated dispatcher goroutine; slow handlers +// cannot stall publishing. If the bounded inbox fills up, surplus +// notifications are dropped (visible via +// QwpSender.DroppedErrorNotifications()). +// +// Passing nil reverts to the default loud-not-silent handler that +// logs ERROR for HALT and WARN for DROP. +// +// The handler may call Close() or Flush() on the sender (e.g. to shut +// down on a HALT) without deadlocking — see SenderErrorHandler for the +// re-entrancy contract. +// +// Only available for the QWP sender. 
+func WithErrorHandler(h SenderErrorHandler) LineSenderOption { + return func(s *lineSenderConfig) { + s.errorHandler = h + } +} + +// WithErrorPolicy sets the Policy applied for one Category. Per- +// category overrides take precedence over the connect-string global +// on_server_error and the spec defaults; a programmatic resolver +// registered via WithErrorPolicyResolver still wins over both. +// +// PolicyAuto removes any prior override (falls through to next +// layer). CategoryProtocolViolation and CategoryUnknown are forced +// HALT regardless of this setting. +// +// Only available for the QWP sender. +func WithErrorPolicy(c Category, p Policy) LineSenderOption { + return func(s *lineSenderConfig) { + if int(c) >= len(s.errorPolicyPerCat) { + return + } + s.errorPolicyPerCat[c] = p + s.errorPolicyPerCatSet = false + for _, q := range s.errorPolicyPerCat { + if q != PolicyAuto { + s.errorPolicyPerCatSet = true + break + } + } + } +} + +// WithErrorPolicyResolver registers a programmatic resolver invoked +// for every Category before any per-category map or global default. +// Returning PolicyAuto from the resolver falls through to the next +// layer (per-category map, then global, then spec default). +// +// CategoryProtocolViolation and CategoryUnknown are forced HALT and +// bypass the resolver entirely. +// +// Only available for the QWP sender. +func WithErrorPolicyResolver(r func(Category) Policy) LineSenderOption { + return func(s *lineSenderConfig) { + s.errorPolicyResolver = r + } +} + +// WithErrorInboxCapacity sets the size of the bounded inbox between +// the I/O goroutine and the dispatcher goroutine. Larger values +// tolerate slower handlers at the cost of memory; smaller values +// surface backpressure (drop counter) sooner. Defaults to 256; +// minimum is 16 (sanitized at construction). +// +// Only available for the QWP sender. 
+func WithErrorInboxCapacity(n int) LineSenderOption { + return func(s *lineSenderConfig) { + s.errorInboxCapacity = n + } +} + +// WithSfDir activates the store-and-forward cursor path against +// the given group root. The sender's slot lives at +// `//`; flushed batches are persisted there and +// replayed on reconnect / restart. Setting an empty string is a +// no-op (memory mode). +// +// Only available for the QWP sender. +func WithSfDir(dir string) LineSenderOption { + return func(s *lineSenderConfig) { + s.sfDir = dir } } -// WithMaxSchemasPerConnection caps the number of schema IDs that may -// be registered on a single QWP connection before the sender returns -// an error. Once the cap is hit, the caller should close and re-open -// the sender to start a new schema ID space. Defaults to 65535. +// WithSenderId sets the sub-directory name under sfDir that +// uniquely identifies this sender's slot. Defaults to "default"; +// multi-sender deployments must set distinct IDs to avoid lock +// collisions on the same slot. Only meaningful when sf_dir is set. // // Only available for the QWP sender. -func WithMaxSchemasPerConnection(n int) LineSenderOption { +func WithSenderId(id string) LineSenderOption { return func(s *lineSenderConfig) { - s.maxSchemasPerConnection = n + s.senderId = id + } +} + +// WithSfMaxBytes sets the per-segment cap (bytes) for the cursor +// engine. Defaults to 4 MiB. Lower values rotate segments more +// aggressively; higher values amortize the rotation overhead. +// +// Only available for the QWP sender. +func WithSfMaxBytes(n int64) LineSenderOption { + return func(s *lineSenderConfig) { + s.sfMaxBytes = n + } +} + +// WithSfMaxTotalBytes caps the total cursor allocation (active + +// hot spare + sealed segments) for this sender. The producer is +// backpressured when an append would exceed the cap. Defaults to +// 10 GiB. +// +// Only available for the QWP sender. 
+func WithSfMaxTotalBytes(n int64) LineSenderOption { + return func(s *lineSenderConfig) { + s.sfMaxTotalBytes = n + } +} + +// WithReconnectPolicy configures the per-outage reconnect cap and +// backoff policy. maxDuration bounds the total time spent +// reconnecting before the loop gives up; initialBackoff and +// maxBackoff bound a backoff sleep between attempts (with jitter). +// A zero or negative argument is treated as "leave the default" for +// that knob — it does not register as an explicit user choice and so +// does not trigger the initial_connect_retry promotion. +// +// Only available for the QWP sender. +func WithReconnectPolicy(maxDuration, initialBackoff, maxBackoff time.Duration) LineSenderOption { + return func(s *lineSenderConfig) { + if maxDuration > 0 { + s.reconnectMaxDurationMillis = int(maxDuration / time.Millisecond) + s.reconnectMaxDurationMillisSet = true + } + if initialBackoff > 0 { + s.reconnectInitialBackoffMillis = int(initialBackoff / time.Millisecond) + s.reconnectInitialBackoffMillisSet = true + } + if maxBackoff > 0 { + s.reconnectMaxBackoffMillis = int(maxBackoff / time.Millisecond) + s.reconnectMaxBackoffMillisSet = true + } + } +} + +// WithInitialConnectRetry, when true, applies the same +// retry-with-backoff policy to the initial connect attempt as is +// applied on reconnect. By default an initial connect failure is +// terminal — useful for catching misconfig early. +// +// Equivalent to WithInitialConnectMode(InitialConnectSync) when +// retry is true, or WithInitialConnectMode(InitialConnectOff) when +// retry is false. Use WithInitialConnectMode directly to select +// InitialConnectAsync. +// +// Only available for the QWP sender. 
+func WithInitialConnectRetry(retry bool) LineSenderOption { + return func(s *lineSenderConfig) { + if retry { + s.initialConnectMode = InitialConnectSync + } else { + s.initialConnectMode = InitialConnectOff + } + s.initialConnectModeSet = true + } +} + +// WithInitialConnectMode configures whether the QWP sender's first +// connection attempt may retry on failure, and if so whether the +// retry runs synchronously on the calling thread or asynchronously +// on the I/O goroutine. See InitialConnectMode for value semantics. +// +// Only available for the QWP sender. +func WithInitialConnectMode(mode InitialConnectMode) LineSenderOption { + return func(s *lineSenderConfig) { + s.initialConnectMode = mode + s.initialConnectModeSet = true + } +} + +// WithCloseFlushTimeout bounds Close()'s wait for the cursor +// engine's ackedFsn to catch up to publishedFsn. A zero or +// negative duration skips the drain entirely (fast close). +// Defaults to 5 seconds. +// +// Only meaningful for the QWP sender in cursor mode (sf_dir set). +func WithCloseFlushTimeout(d time.Duration) LineSenderOption { + return func(s *lineSenderConfig) { + s.closeFlushTimeoutSet = true + s.closeFlushTimeoutMillis = int(d / time.Millisecond) } } @@ -409,6 +692,119 @@ func WithQwpDumpWriter(w io.Writer) LineSenderOption { } } +// WithAuthTimeout bounds how long the QWP transport waits for the +// HTTP-upgrade response (the per-host upper bound from failover.md +// §7). A zero or negative duration falls back to the 15s default at +// construction. Equivalent to the connect-string auth_timeout_ms key. +// +// Only available for the QWP sender. +func WithAuthTimeout(d time.Duration) LineSenderOption { + return func(s *lineSenderConfig) { + s.authTimeoutMs = int(d / time.Millisecond) + } +} + +// WithZone sets the failover zone hint used for endpoint locality. +// It is silently stored but inert on the ingestion path, which is +// zone-blind — it never receives SERVER_INFO. 
The egress (query) path +// consults it to prefer same-zone endpoints. Equivalent to the +// connect-string zone key. +// +// Only available for the QWP sender. +func WithZone(zone string) LineSenderOption { + return func(s *lineSenderConfig) { + s.zone = zone + } +} + +// WithTarget constrains failover endpoint selection to servers whose +// advertised role passes the filter (QwpTargetAny / QwpTargetPrimary +// / QwpTargetReplica). Defaults to QwpTargetAny. Equivalent to the +// connect-string target=any|primary|replica key. +// +// The filter is honoured on the query (egress) path, which reads the +// server's role from the SERVER_INFO frame. The ingestion path never +// receives SERVER_INFO (it is role-blind by the wire-protocol spec), +// so the value is accepted but inert there — the server's own role +// reject keeps writes off replicas. Symmetric with WithZone. +// +// Only available for the QWP sender. +func WithTarget(target QwpTargetFilter) LineSenderOption { + return func(s *lineSenderConfig) { + s.target = target + } +} + +// WithSfDurability selects the store-and-forward cursor durability +// mode. Only "memory" (the default when unset) is currently honoured; +// "flush" and "append" are reserved for a deferred follow-up and are +// rejected at construction. Requires sf_dir to be set. Equivalent to +// the connect-string sf_durability key. +// +// Only available for the QWP sender. +func WithSfDurability(mode string) LineSenderOption { + return func(s *lineSenderConfig) { + s.sfDurability = mode + } +} + +// WithSfAppendDeadline bounds how long a producer call blocks waiting +// to append a batch into the store-and-forward cursor engine before +// it returns a backpressure error that wraps ErrBackpressureTimeout +// (match with errors.Is). A zero or negative duration falls back to +// the 30s default at construction. Requires sf_dir to be set. +// Equivalent to the connect-string sf_append_deadline_millis key. +// +// Only available for the QWP sender. 
+func WithSfAppendDeadline(d time.Duration) LineSenderOption { + return func(s *lineSenderConfig) { + s.sfAppendDeadlineMillis = int(d / time.Millisecond) + } +} + +// WithDrainOrphans enables adoption and draining of orphaned +// store-and-forward slots left behind by a crashed or superseded +// sender sharing the same sf_dir group root. Defaults to disabled. +// Requires sf_dir to be set. Equivalent to the connect-string +// drain_orphans key. +// +// Only available for the QWP sender. +func WithDrainOrphans(enabled bool) LineSenderOption { + return func(s *lineSenderConfig) { + s.drainOrphans = enabled + } +} + +// WithMaxBackgroundDrainers caps the number of concurrent +// orphan-drainer goroutines. Defaults to 4. Only meaningful when +// drain_orphans is enabled. Equivalent to the connect-string +// max_background_drainers key. +// +// Only available for the QWP sender. +func WithMaxBackgroundDrainers(n int) LineSenderOption { + return func(s *lineSenderConfig) { + s.maxBackgroundDrainers = n + } +} + +// WithServerErrorPolicy sets the global fallback Policy applied to a +// server-side batch rejection when no higher-precedence layer +// resolves it. Resolution precedence (highest first): the +// WithErrorPolicyResolver resolver → the WithErrorPolicy per-category +// override → the connect-string per-category on_*_error → this global +// policy (connect-string on_server_error) → spec defaults. +// +// PolicyAuto (the zero value) leaves the global layer unset, falling +// through to the spec defaults. CategoryProtocolViolation and +// CategoryUnknown are always HALT regardless of this setting. +// +// Only available for the QWP sender. +func WithServerErrorPolicy(p Policy) LineSenderOption { + return func(s *lineSenderConfig) { + s.errorPolicyGlobal = p + } +} + // WithTls enables TLS connection encryption. 
func WithTls() LineSenderOption { return func(s *lineSenderConfig) { @@ -510,7 +906,7 @@ func WithMaxBufferSize(sizeInBytes int) LineSenderOption { // WithFileNameLimit sets maximum file name length in chars // allowed by the server. Affects maximum table and column name // lengths accepted by the sender. Should be set to the same value -// as on the server. Defaults to 127. +// as on the server. Must be at least 16. Defaults to 127. func WithFileNameLimit(limit int) LineSenderOption { return func(s *lineSenderConfig) { s.fileNameLimit = limit @@ -590,6 +986,7 @@ func WithAutoFlushInterval(interval time.Duration) LineSenderOption { func WithAutoFlushBytes(bytes int) LineSenderOption { return func(s *lineSenderConfig) { s.autoFlushBytes = bytes + s.autoFlushBytesSet = true } } @@ -733,17 +1130,25 @@ func newLineSenderConfig(t senderType) *lineSenderConfig { fileNameLimit: defaultFileNameLimit, } case qwpSenderType: + // retryTimeout deliberately not seeded for QWP: connect- + // string.md does not list retry_timeout as a QWP key + // (it's HTTP-only), and Sender.java rejects it on the + // WebSocket protocol. Leaving the zero value lets + // sanitizeQwpConf detect "user set it" and reject. + // reconnect_max_duration_millis is the QWP analogue. 
return &lineSenderConfig{ - senderType: t, - address: defaultHttpAddress, - retryTimeout: defaultRetryTimeout, - autoFlushRows: qwpDefaultAutoFlushRows, - autoFlushInterval: qwpDefaultAutoFlushInterval, - inFlightWindow: qwpDefaultInFlightWindow, - maxSchemasPerConnection: qwpDefaultMaxSchemasPerConnection, - initBufSize: defaultInitBufferSize, - maxBufSize: defaultMaxBufferSize, - fileNameLimit: defaultFileNameLimit, + senderType: t, + address: defaultHttpAddress, + autoFlushRows: qwpDefaultAutoFlushRows, + autoFlushInterval: qwpDefaultAutoFlushInterval, + autoFlushBytes: qwpDefaultAutoFlushBytes, + inFlightWindow: qwpDefaultInFlightWindow, + initBufSize: defaultInitBufferSize, + maxBufSize: defaultMaxBufferSize, + fileNameLimit: defaultFileNameLimit, + // failover.md §7: 15s upper bound on the HTTP upgrade + // response read. Parser overrides on explicit value. + authTimeoutMs: 15_000, } default: return &lineSenderConfig{ @@ -791,6 +1196,9 @@ func sanitizeTcpConf(conf *lineSenderConfig) error { return err } + if strings.Contains(conf.address, ",") { + return errors.New("multi-host addr is not supported for TCP") + } // validate tcp-specific settings if conf.requestTimeout != 0 { return errors.New("requestTimeout setting is not available in the TCP client") @@ -813,8 +1221,8 @@ func sanitizeTcpConf(conf *lineSenderConfig) error { if conf.maxBufSize != 0 { return errors.New("maxBufferSize setting is not available in the TCP client") } - if conf.maxSchemasPerConnection != 0 { - return errors.New("maxSchemasPerConnection setting is not available in the TCP client") + if err := rejectQwpOnlyOptions(conf); err != nil { + return err } if conf.tcpKey == "" && conf.tcpKeyId != "" { return errors.New("tcpKey is empty and tcpKeyId is not. 
both (or none) must be provided") @@ -839,6 +1247,14 @@ func sanitizeQwpConf(conf *lineSenderConfig) error { if conf.minThroughput != 0 { return errors.New("minThroughput setting is not available in the QWP client") } + if conf.retryTimeout != 0 { + // connect-string.md does not list retry_timeout as a QWP key + // (it's HTTP-only) and Sender.java rejects it on the + // WebSocket protocol. The QWP analogue is the per-outage + // reconnect budget; point the user there. + return errors.New( + "retry_timeout is not supported for QWP; use reconnect_max_duration_millis for the per-outage budget") + } if conf.httpTransport != nil { return errors.New("httpTransport setting is not available in the QWP client") } @@ -855,6 +1271,112 @@ func sanitizeQwpConf(conf *lineSenderConfig) error { if conf.protocolVersion != protocolVersionUnset { return errors.New("protocol_version setting is not available in the QWP client") } + // Multi-host failover (failover.md §1 / §2). The parser populates + // conf.endpoints for connect-string callers; functional-option + // callers go through WithAddress, which writes only conf.address. + // Back-fill endpoints from a single-host conf.address here so the + // downstream code paths can rely on len(endpoints) >= 1. + if len(conf.endpoints) == 0 && conf.address != "" { + eps, err := parseEndpointList(conf.address, qwpDefaultPort) + if err != nil { + return err + } + conf.endpoints = eps + conf.address = eps[0].String() + } + if conf.authTimeoutMs <= 0 { + conf.authTimeoutMs = 15_000 + } + // Implicit promotion of initial_connect_retry. When the user tuned + // any reconnect_* knob but did not pick an initial-connect mode, + // promote to sync — the reconnect budget they wrote should also + // cover the *first* connect attempt. Otherwise the knob name reads + // as a generic retry budget but the underlying path only governs + // reconnects from an established connection, and the budget is + // silently dropped at startup. 
Mirrors the Java client's + // actualInitialConnectMode resolution in Sender.java. + // + // An explicit user choice (any value of initial_connect_retry, or + // either of the With* setters) wins unconditionally — including + // "off" paired with a tuned reconnect budget for users who want + // fail-fast on startup misconfig but a generous post-connect budget. + if !conf.initialConnectModeSet && + (conf.reconnectMaxDurationMillisSet || + conf.reconnectInitialBackoffMillisSet || + conf.reconnectMaxBackoffMillisSet) { + conf.initialConnectMode = InitialConnectSync + } + // Cursor / store-and-forward validation. sf_dir activates cursor + // mode; the sf_*, sender_id, drain_orphans, max_background_drainers + // knobs are only meaningful when cursor mode is on. + if conf.sfDir == "" { + if conf.senderId != "" { + return errors.New("sender_id requires sf_dir to be set") + } + if conf.sfMaxBytes != 0 || conf.sfMaxTotalBytes != 0 || conf.sfDurability != "" || conf.sfAppendDeadlineMillis != 0 { + return errors.New("sf_max_bytes / sf_max_total_bytes / sf_durability / sf_append_deadline_millis require sf_dir to be set") + } + if conf.drainOrphans || conf.maxBackgroundDrainers != 0 { + return errors.New("drain_orphans / max_background_drainers require sf_dir to be set") + } + } + // Validate the sf_durability value space for the functional-option + // path (WithSfDurability). The connect-string parser already + // rejected flush/append/bogus, so this is a harmless re-check + // there; it is the only gate on the option path. + if err := validateSfDurability(conf.sfDurability); err != nil { + return err + } + // Validate the sender_id charset for the functional-option path + // (WithSenderId). The connect-string parser gates the parser path + // (TestSfConfRejectsBadSenderId); this is the only gate on the + // option path. Empty is the "use default" sentinel and resolves + // to qwpSfDefaultSenderId downstream — skip validateSenderId's + // strict non-empty rule for that case. 
Critical: senderId is used + // unmodified as a path segment under sfDir at slotPath + // construction (qwp_sender_cursor.go), so '.', '/' or '\' would + // escape the sf_dir root. + if conf.senderId != "" { + if err := validateSenderId(conf.senderId); err != nil { + return err + } + } + if conf.sfMaxBytes < 0 { + return fmt.Errorf("sf_max_bytes must be > 0: %d", conf.sfMaxBytes) + } + if conf.sfMaxTotalBytes < 0 { + return fmt.Errorf("sf_max_total_bytes must be > 0: %d", conf.sfMaxTotalBytes) + } + if conf.sfMaxBytes > 0 && conf.sfMaxTotalBytes > 0 && conf.sfMaxTotalBytes < conf.sfMaxBytes { + return fmt.Errorf("sf_max_total_bytes (%d) must be >= sf_max_bytes (%d)", + conf.sfMaxTotalBytes, conf.sfMaxBytes) + } + // Reject an explicit auto_flush_bytes that exceeds an explicit + // sf_max_bytes. The byte trigger would let a batch grow until its + // encoded frame can no longer fit a single segment, and such a frame + // can never be flushed — it is dropped at the flush boundary. Gated + // on autoFlushBytesSet so a *defaulted* 8 MiB trigger over a smaller + // user-chosen segment is left to the runtime clamp (which lowers the + // effective trigger to fit); only a user-written contradiction is a + // hard error. sf_max_bytes is the per-segment cap, so the frame must + // actually fit in slightly less than this (header overhead), but the + // trigger clamp already keeps the encoded frame under the segment; + // this check just rejects the self-evidently impossible pairing up front. + if conf.autoFlushBytesSet && conf.sfMaxBytes > 0 && int64(conf.autoFlushBytes) > conf.sfMaxBytes { + return fmt.Errorf( + "auto_flush_bytes (%d) must not exceed sf_max_bytes (%d): a batch that fills the byte trigger could not fit in a single segment", + conf.autoFlushBytes, conf.sfMaxBytes) + } + if conf.maxBackgroundDrainers < 0 { + return fmt.Errorf("max_background_drainers must be >= 0: %d", conf.maxBackgroundDrainers) + } + // Server-error API knobs (Phase 5). 
User-supplied + // errorInboxCapacity must be ≥ qwpSfMinErrorInboxCapacity (16); + // 0 falls back to the default at construction. + if conf.errorInboxCapacity != 0 && conf.errorInboxCapacity < qwpSfMinErrorInboxCapacity { + return fmt.Errorf("error_inbox_capacity must be >= %d: %d", + qwpSfMinErrorInboxCapacity, conf.errorInboxCapacity) + } return nil } @@ -865,6 +1387,9 @@ func sanitizeHttpConf(conf *lineSenderConfig) error { return err } + if strings.Contains(conf.address, ",") { + return errors.New("multi-host addr is not supported for HTTP") + } // validate http-specific settings if (conf.httpUser != "" || conf.httpPass != "") && conf.httpToken != "" { return errors.New("both basic and token authentication cannot be used") @@ -872,22 +1397,80 @@ func sanitizeHttpConf(conf *lineSenderConfig) error { if conf.autoFlushBytes != 0 { return errors.New("autoFlushBytes setting is not available in the HTTP client") } - if conf.maxSchemasPerConnection != 0 { - return errors.New("maxSchemasPerConnection setting is not available in the HTTP client") + if err := rejectQwpOnlyOptions(conf); err != nil { + return err } return nil } -func newQwpLineSenderFromConf(ctx context.Context, conf *lineSenderConfig) (LineSender, error) { - scheme := "ws" - if conf.tlsMode != tlsDisabled { - scheme = "wss" +// rejectQwpOnlyOptions surfaces an error when a QWP-only option was +// set on a non-QWP sender. The connect-string parser already rejects +// each of these keys on non-ws/wss schemas; this mirrors the gate +// for callers that build the config programmatically via With*. 
+func rejectQwpOnlyOptions(conf *lineSenderConfig) error { + if conf.errorHandler != nil || conf.errorPolicyResolver != nil || + conf.errorPolicyPerCatSet || conf.errorPolicyGlobal != PolicyAuto || + conf.errorInboxCapacity != 0 { + return errors.New("server-error API settings are only available in the QWP client") + } + var name string + switch { + case conf.sfDir != "": + name = "sf_dir" + case conf.senderId != "": + name = "sender_id" + case conf.sfMaxBytes != 0: + name = "sf_max_bytes" + case conf.sfMaxTotalBytes != 0: + name = "sf_max_total_bytes" + case conf.sfDurability != "": + name = "sf_durability" + case conf.sfAppendDeadlineMillis != 0: + name = "sf_append_deadline_millis" + case conf.drainOrphans: + name = "drain_orphans" + case conf.maxBackgroundDrainers != 0: + name = "max_background_drainers" + case conf.reconnectMaxDurationMillisSet, + conf.reconnectInitialBackoffMillisSet, + conf.reconnectMaxBackoffMillisSet: + name = "reconnect_*" + case conf.initialConnectModeSet: + name = "initial_connect_retry" + case conf.closeFlushTimeoutSet: + name = "close_flush_timeout_millis" + case conf.gorillaDisabled: + name = "gorilla" + case conf.dumpWriter != nil: + name = "QWP dump writer" + case conf.inFlightWindow != 0: + name = "in_flight_window" + case conf.authTimeoutMs != 0: + name = "auth_timeout_ms" + case conf.zone != "": + name = "zone" + case conf.target != qwpTargetAny: + name = "target" + default: + return nil } - address := scheme + "://" + conf.address + return fmt.Errorf("%s is only available in the QWP client", name) +} +func newQwpLineSenderFromConf(ctx context.Context, conf *lineSenderConfig) (LineSender, error) { opts := qwpTransportOpts{ tlsInsecureSkipVerify: conf.tlsMode == tlsInsecureSkipVerify, + endpointPath: qwpWritePath, + authTimeoutMs: conf.authTimeoutMs, + // QWP has a single protocol version; advertise it. 
+ // serverInfoTimeout stays zero: the ingest endpoint sends no + // SERVER_INFO frame and the client never expects one — it sends + // data right after the upgrade and reads ACKs back. Ingest does + // not route by role or zone, so target= and zone= are accepted + // but inert on ingestion and honoured on the egress connect-walk + // instead. + maxVersion: qwpVersion, } // QWP auth: Basic (username:password) or Bearer (token). // Matches the Java client's buildWebSocketAuthHeader(). @@ -898,34 +1481,15 @@ func newQwpLineSenderFromConf(ctx context.Context, conf *lineSenderConfig) (Line opts.authorization = "Bearer " + conf.httpToken } - window := conf.inFlightWindow - if window <= 0 { - window = 1 - } - - s, err := newQwpLineSender(ctx, address, opts, conf.retryTimeout, - conf.autoFlushRows, conf.autoFlushInterval, conf.dumpWriter, window) - if err != nil { - return nil, err - } - s.maxBufSize = conf.maxBufSize - s.fileNameLimit = conf.fileNameLimit - s.autoFlushBytes = conf.autoFlushBytes - s.maxSchemasPerConnection = conf.maxSchemasPerConnection - if conf.closeTimeout > 0 { - s.closeTimeout = conf.closeTimeout - } - s.encoders[0].gorillaDisabled = conf.gorillaDisabled - s.encoders[1].gorillaDisabled = conf.gorillaDisabled - // Async mode's encoder buffers are pre-sized for the microbatch - // role: max(1 MB, 2 * autoFlushBytes). Matches the Java client's - // MicrobatchBuffer sizing. The 1 MB floor was already applied in - // newQwpLineSender; grow further if autoFlushBytes warrants it. 
- if s.asyncState != nil && conf.autoFlushBytes*2 > qwpDefaultMicrobatchBufSize { - s.encoders[0].wb.preallocate(conf.autoFlushBytes * 2) - s.encoders[1].wb.preallocate(conf.autoFlushBytes * 2) - } - return s, nil + // Both memory mode (no sf_dir) and store-and-forward (sf_dir set) + // run on the cursor engine + send loop, and both must honour the + // multi-host addr= list, the initial_connect_retry mode, and the + // reconnect_* budgets — per the README "Multi-host failover" + // section, those failover knobs apply whether or not sf_dir is set. + // The two modes differ only in the cursor engine's backing store + // (RAM vs mmapped files) and a couple of defaults, which + // newQwpCursorLineSenderFromConf resolves from conf.sfDir. + return newQwpCursorLineSenderFromConf(ctx, conf, opts) } func validateConf(conf *lineSenderConfig) error { @@ -936,8 +1500,8 @@ func validateConf(conf *lineSenderConfig) error { return fmt.Errorf("max buffer size is negative: %d", conf.maxBufSize) } - if conf.fileNameLimit < 0 { - return fmt.Errorf("file name limit is negative: %d", conf.fileNameLimit) + if conf.fileNameLimit < 16 { + return fmt.Errorf("max_name_len must be at least 16 bytes: %d", conf.fileNameLimit) } if conf.retryTimeout < 0 { @@ -956,15 +1520,9 @@ func validateConf(conf *lineSenderConfig) error { if conf.autoFlushInterval < 0 { return fmt.Errorf("auto flush interval is negative: %d", conf.autoFlushInterval) } - if conf.closeTimeout < 0 { - return fmt.Errorf("close timeout is negative: %d", conf.closeTimeout) - } if conf.autoFlushBytes < 0 { return fmt.Errorf("auto flush bytes is negative: %d", conf.autoFlushBytes) } - if conf.maxSchemasPerConnection < 0 { - return fmt.Errorf("max schemas per connection is negative: %d", conf.maxSchemasPerConnection) - } if conf.protocolVersion < protocolVersionUnset || conf.protocolVersion > ProtocolVersion3 { return errors.New("current client only supports protocol version 1 (text format for all datatypes), " + "2 (binary format 
for floats/arrays), 3 (binary decimals) or explicitly unset") diff --git a/sender_error.go b/sender_error.go new file mode 100644 index 00000000..01d75919 --- /dev/null +++ b/sender_error.go @@ -0,0 +1,356 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +// Package questdb provides the QuestDB ingestion clients. +// +// SenderError is the QWP cursor-SF server-error payload. It surfaces in +// two ways: +// +// 1. Asynchronously, to a registered SenderErrorHandler: +// +// opts := []questdb.LineSenderOption{ +// questdb.WithQwp(), +// questdb.WithErrorHandler(func(e *questdb.SenderError) { +// log.Printf("dead-lettering FSN [%d,%d]: %v", e.FromFsn, e.ToFsn, e) +// // ... persist e for replay or alerting ... +// }), +// } +// +// 2. Synchronously, on the next producer-thread API call after a HALT +// policy has been latched: +// +// if err := s.Flush(ctx); err != nil { +// var se *questdb.SenderError +// if errors.As(err, &se) { +// // unpack se.Category, se.ServerMessage, se.FromFsn, ... 
+// } +// } +// +// Both paths deliver the same payload. The producer-side typed error is +// the FSN's-eye-view of "what was rejected"; the async handler is the +// dead-letter channel for DROP_AND_CONTINUE batches. +package questdb + +import ( + "fmt" + "time" +) + +// Category classifies a QWP server-side rejection. Categories align 1:1 +// with stable wire status bytes (SchemaMismatch / ParseError / +// InternalError / SecurityError / WriteError) plus ProtocolViolation +// (WebSocket close-frame violations) and Unknown (forward-compat for +// new server status bytes). +type Category byte + +const ( + // CategoryUnknown is the zero value and the fallback for any + // status byte the client does not recognize. Forced HALT. + CategoryUnknown Category = iota + // CategorySchemaMismatch: column type incompatible with existing + // table, missing column, NOT NULL violation, no such table. + // Wire status 0x03. + CategorySchemaMismatch + // CategoryParseError: QWP-level malformed payload — likely a + // client bug. Wire status 0x05. + CategoryParseError + // CategoryInternalError: catch-all server fault (CairoException + // isCritical, unhandled Throwable). Wire status 0x06. + CategoryInternalError + // CategorySecurityError: authentication or authorization failure. + // Wire status 0x08, also produced by 401/403 on the WebSocket + // upgrade. + CategorySecurityError + // CategoryWriteError: non-critical Cairo error, table not + // accepting writes. Wire status 0x09. + CategoryWriteError + // CategoryProtocolViolation: WebSocket-layer close frame with a + // terminal code (PROTOCOL_ERROR 1002, UNSUPPORTED_DATA 1003, + // INVALID_PAYLOAD_DATA 1007, POLICY_VIOLATION 1008, + // MESSAGE_TOO_BIG 1009, MANDATORY_EXTENSION 1010), or 404/426 + // upgrade rejection. Forced HALT. + CategoryProtocolViolation + + numCategories // sentinel: must be last +) + +// String returns the canonical name of the category. Stable across +// releases — safe to log and grep. 
+func (c Category) String() string { + switch c { + case CategoryUnknown: + return "UNKNOWN" + case CategorySchemaMismatch: + return "SCHEMA_MISMATCH" + case CategoryParseError: + return "PARSE_ERROR" + case CategoryInternalError: + return "INTERNAL_ERROR" + case CategorySecurityError: + return "SECURITY_ERROR" + case CategoryWriteError: + return "WRITE_ERROR" + case CategoryProtocolViolation: + return "PROTOCOL_VIOLATION" + default: + return fmt.Sprintf("Category(%d)", byte(c)) + } +} + +// Policy is the action the SF send loop took when a category fired. +// Resolution precedence (highest first): builder errorPolicyResolver → +// builder per-category errorPolicy → connect-string per-category +// on_*_error → connect-string global on_server_error → spec defaults. +// +// CategoryProtocolViolation and CategoryUnknown are forced HALT; user +// overrides for those categories are ignored. +type Policy byte + +const ( + // PolicyAuto is the zero value, used as a sentinel meaning + // "fall through to the next layer of resolution". Never appears + // on a delivered SenderError — the loop always resolves to a + // concrete policy before building the error. + PolicyAuto Policy = iota + // PolicyDropAndContinue: advance ackedFsn past the rejected + // span and keep draining. The data is dropped from the SF disk + // store; users wanting durability must dead-letter via + // SenderErrorHandler. + PolicyDropAndContinue + // PolicyHalt: latch the error as terminal. The next + // producer-thread API call returns the SenderError; the sender + // does not drain further until the caller closes and rebuilds + // it. + PolicyHalt +) + +// String returns the canonical name of the policy. Stable across +// releases — safe to log and grep. 
+func (p Policy) String() string { + switch p { + case PolicyAuto: + return "AUTO" + case PolicyDropAndContinue: + return "DROP_AND_CONTINUE" + case PolicyHalt: + return "HALT" + default: + return fmt.Sprintf("Policy(%d)", byte(p)) + } +} + +// Sentinel field values on SenderError. Use these instead of literal +// numbers so cross-language users see the same intent. +const ( + // NoStatusByte signals SenderError carries no QWP status byte — + // CategoryProtocolViolation does not come from a server status + // frame. Stored as int because Go has no nullable byte. + NoStatusByte = -1 + // NoMessageSequence signals SenderError carries no per-frame + // sequence number — same case as NoStatusByte. + NoMessageSequence int64 = -1 +) + +// SenderError is the immutable description of a server-side rejection +// of an asynchronously published QWP batch. It is delivered to user +// code via the registered SenderErrorHandler (async) and as the typed +// error returned from the next producer-thread API call after a HALT +// (sync). Both paths carry the same payload. +// +// SenderError implements the error interface, so it can be passed +// directly through error-returning APIs and unwrapped via errors.As: +// +// var se *questdb.SenderError +// if errors.As(err, &se) { ... } +// +// The [FromFsn, ToFsn] span is the load-bearing correlation key — +// join it to whatever the producer logged alongside the value +// returned by FlushAndGetSequence to identify the rejected data. +type SenderError struct { + // Category is the rejection classification. The recommended + // switch target. + Category Category + + // AppliedPolicy is what the loop actually did about the + // rejection — DROP_AND_CONTINUE means the data was dropped + // from disk; HALT means a terminal latch is in place. + AppliedPolicy Policy + + // ServerStatusByte is the raw QWP status byte (e.g. 0x03 for + // SCHEMA_MISMATCH). Set to NoStatusByte for + // CategoryProtocolViolation. 
Stored as int to allow the + // sentinel. + ServerStatusByte int + + // ServerMessage is the human-readable description provided by + // the server (≤1024 UTF-8 bytes for QWP error frames, or the + // WebSocket close reason for protocol violations). Empty if + // the server provided no text. + ServerMessage string + + // MessageSequence is the server's per-frame messageSequence as + // mirrored back in the rejection frame, used for cross-team + // debugging and to correlate against server-side logs. Set to + // NoMessageSequence for CategoryProtocolViolation. + MessageSequence int64 + + // FromFsn is the inclusive lower bound of the FSN span for the + // rejected batch — the correlation key for joining against + // FlushAndGetSequence values on the producer side. + FromFsn int64 + + // ToFsn is the inclusive upper bound of the FSN span for the + // rejected batch. + ToFsn int64 + + // TableName is the rejected table name, when the server + // attributed the error to a single table. Empty string means + // "unknown" or "multi-table batch" — the server does not + // attribute multi-table batch errors today. + TableName string + + // DetectedAt is the wall-clock-independent receipt time on the + // I/O goroutine. Use for ordering and ops timelines, not for + // correlation. + DetectedAt time.Time +} + +// Error implements the error interface. The format is stable enough +// to grep on but is intended for human consumption; programmatic +// callers should switch on Category, ServerStatusByte, etc. +func (e *SenderError) Error() string { + if e == nil { + return "" + } + var sb []byte + sb = append(sb, "qwp: server rejected batch: "...) + sb = append(sb, e.Category.String()...) + if e.ServerStatusByte != NoStatusByte { + sb = append(sb, fmt.Sprintf(" (status=0x%02X %s)", + byte(e.ServerStatusByte), + qwpStatusName(QwpStatusCode(e.ServerStatusByte)))...) + } + sb = append(sb, fmt.Sprintf(" policy=%s fsn=[%d,%d]", + e.AppliedPolicy, e.FromFsn, e.ToFsn)...) 
+ if e.TableName != "" { + sb = append(sb, fmt.Sprintf(" table=%s", e.TableName)...) + } + if e.MessageSequence != NoMessageSequence { + sb = append(sb, fmt.Sprintf(" seq=%d", e.MessageSequence)...) + } + if e.ServerMessage != "" { + sb = append(sb, " — "...) + sb = append(sb, e.ServerMessage...) + } + return string(sb) +} + +// ---------------------------------------------------------------------- +// Deprecated v4.2.0 compatibility shim. Delete this whole block in +// v4.4.0 (one minor after the SenderError replacement landed in +// v4.3.0): the QwpError type, its Error method, and the +// (*SenderError).As bridge below exist only so source written against +// v4.2.0's QwpError keeps compiling across the upgrade. +// ---------------------------------------------------------------------- + +// QwpError was the v4.2.0 QWP server-rejection payload returned from +// Flush and delivered to the async error path. v4.3.0 replaced it with +// SenderError, which additionally carries the [FromFsn, ToFsn] +// correlation span, the applied Policy, table attribution, and a +// release-stable Category. +// +// Deprecated: use SenderError. This shim only keeps v4.2.0 source +// compiling and is scheduled for removal in v4.4.0. The +// (*SenderError).As bridge keeps the historical pattern working: +// +// var qwpErr *questdb.QwpError +// if errors.As(err, &qwpErr) { /* still populated, from *SenderError */ } +// +// A type switch `case *questdb.QwpError:` will NOT match anymore — +// Flush now returns *SenderError — so switch on *SenderError (or its +// Category) instead. Field mapping from the old payload: +// +// QwpError.Status ← SenderError.ServerStatusByte (Category for the name) +// QwpError.Sequence ← SenderError.MessageSequence +// QwpError.Message ← SenderError.ServerMessage +type QwpError struct { + // Status is the raw QWP status byte from the server's ACK + // rejection. 
Zero (the QwpStatusOK byte) when the underlying + // SenderError is a CategoryProtocolViolation, which v4.2.0 never + // surfaced through this type. + // + // Deprecated: read SenderError.ServerStatusByte / .Category. + Status QwpStatusCode + + // Sequence is the server's per-frame message sequence, mirrored + // back in the rejection frame. + // + // Deprecated: read SenderError.MessageSequence. + Sequence int64 + + // Message is the server-supplied error description, or empty if + // the server sent no text. + // + // Deprecated: read SenderError.ServerMessage. + Message string +} + +// Error implements the error interface, preserving the exact v4.2.0 +// message format so adopters that grep their logs see no change. +// +// Deprecated: use SenderError. +func (e *QwpError) Error() string { + name := qwpStatusName(e.Status) + if e.Message != "" { + return fmt.Sprintf("qwp: server error %s (0x%02X): %s", + name, byte(e.Status), e.Message) + } + return fmt.Sprintf("qwp: server error %s (0x%02X)", + name, byte(e.Status)) +} + +// As bridges the deprecated *QwpError shim onto the SenderError +// payload so the historical errors.As(err, &qwpErr) pattern keeps +// working after the v4.3.0 type replacement. errors.As resolves +// **SenderError by assignability before consulting this method, so the +// only target we handle is **QwpError; everything else falls through +// to the standard walk. +// +// Deprecated: exists solely for the QwpError shim; removed with it. 
+func (e *SenderError) As(target any) bool { + qe, ok := target.(**QwpError) + if !ok { + return false + } + status := QwpStatusCode(0) + if e.ServerStatusByte != NoStatusByte { + status = QwpStatusCode(byte(e.ServerStatusByte)) + } + *qe = &QwpError{ + Status: status, + Sequence: e.MessageSequence, + Message: e.ServerMessage, + } + return true +} diff --git a/sender_error_handler.go b/sender_error_handler.go new file mode 100644 index 00000000..b0b16373 --- /dev/null +++ b/sender_error_handler.go @@ -0,0 +1,72 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package questdb + +// SenderErrorHandler is the user-supplied callback invoked when the +// asynchronous SF send loop observes a server-side batch rejection. +// Registered via WithErrorHandler(...) on the LineSender builder. +// +// # Threading +// +// Implementations are invoked on a dedicated dispatcher goroutine, +// never on the I/O goroutine or the producer goroutine. 
Slow handlers +// cannot stall publishing; if the bounded inbox fills up, surplus +// notifications are dropped (visible via +// QwpSender.DroppedErrorNotifications()). +// +// # Panics +// +// Any panic from the handler is recovered and logged by the +// dispatcher. The dispatcher and the sender continue running. +// +// # Calling back into the sender +// +// The handler may call Close() or Flush() on the sender — e.g. to shut +// down on a HALT-category error. The terminal *SenderError is latched +// before the handler is invoked, so a synchronous Flush() returns it +// promptly rather than blocking. Close() called from the handler is +// honored and returns without deadlocking; the dispatcher goroutine +// (this goroutine) finishes unwinding on its own once the handler +// returns, so any error notifications still queued at that moment are +// subject to the dispatcher's short best-effort drain and may be +// dropped (visible via QwpSender.DroppedErrorNotifications()). +// +// Because the handler runs on the dispatcher goroutine — not the +// producer goroutine — these calls deliberately do NOT touch producer- +// buffered state: a handler-invoked Close() or Flush() will not flush +// rows the producer has staged but not yet flushed itself (those are +// owned by the producer goroutine and may be mid-assembly). Close() +// still tears down the wire, drains already-published frames up to +// close_flush_timeout, and releases resources; Flush() still surfaces +// the latched error. To guarantee a specific batch is flushed, flush it +// from the producer goroutine before relying on the handler to close. +// +// # What this callback is for +// +// Dead-lettering rejected data, alerting, metrics. Producer-thread +// retry/abort logic should not live here — that belongs on the +// producer side, where errors.As(err, &senderErr) unpacks the typed +// error after a HALT-policy latch. 
+type SenderErrorHandler func(*SenderError) diff --git a/sender_error_test.go b/sender_error_test.go new file mode 100644 index 00000000..72859d36 --- /dev/null +++ b/sender_error_test.go @@ -0,0 +1,232 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ******************************************************************************/ + +package questdb + +import ( + "encoding/binary" + "errors" + "strings" + "testing" + "time" +) + +func TestSenderErrorImplementsError(t *testing.T) { + se := &SenderError{ + Category: CategoryParseError, + AppliedPolicy: PolicyHalt, + ServerStatusByte: int(QwpStatusParseError), + ServerMessage: "bad column type", + MessageSequence: 42, + FromFsn: 100, + ToFsn: 100, + DetectedAt: time.Now(), + } + var err error = se + s := err.Error() + for _, want := range []string{"PARSE_ERROR", "bad column type", "0x05", "HALT", "fsn=[100,100]", "seq=42"} { + if !strings.Contains(s, want) { + t.Fatalf("error string missing %q: %s", want, s) + } + } +} + +func TestSenderErrorNoMessage(t *testing.T) { + se := &SenderError{ + Category: CategoryWriteError, + AppliedPolicy: PolicyDropAndContinue, + ServerStatusByte: int(QwpStatusWriteError), + MessageSequence: 1, + FromFsn: 5, + ToFsn: 5, + } + s := se.Error() + for _, want := range []string{"WRITE_ERROR", "DROP_AND_CONTINUE", "fsn=[5,5]"} { + if !strings.Contains(s, want) { + t.Fatalf("error string missing %q: %s", want, s) + } + } + if strings.Contains(s, "—") { + t.Fatalf("expected no trailing message separator: %s", s) + } +} + +func TestSenderErrorProtocolViolationNoStatus(t *testing.T) { + se := &SenderError{ + Category: CategoryProtocolViolation, + AppliedPolicy: PolicyHalt, + ServerStatusByte: NoStatusByte, + MessageSequence: NoMessageSequence, + ServerMessage: "ws-close[1002]: bad framing", + FromFsn: 7, + ToFsn: 12, + } + s := se.Error() + for _, want := range []string{"PROTOCOL_VIOLATION", "ws-close[1002]: bad framing", "fsn=[7,12]"} { + if !strings.Contains(s, want) { + t.Fatalf("error string missing %q: %s", want, s) + } + } + for _, unwanted := range []string{"status=", "seq="} { + if strings.Contains(s, unwanted) { + t.Fatalf("error string should omit %q for ProtocolViolation: %s", unwanted, s) + } + } +} + +func 
TestSenderErrorIsErrorsAsTarget(t *testing.T) { + se := &SenderError{Category: CategoryParseError, AppliedPolicy: PolicyHalt} + var err error = se + var got *SenderError + if !errors.As(err, &got) { + t.Fatal("errors.As did not unwrap *SenderError") + } + if got.Category != CategoryParseError { + t.Fatalf("unwrapped Category = %s, want PARSE_ERROR", got.Category) + } +} + +func TestSenderErrorNilSafe(t *testing.T) { + var se *SenderError + if got := se.Error(); got != "" { + t.Fatalf("nil error string = %q", got) + } +} + +func TestCategoryString(t *testing.T) { + tests := []struct { + c Category + want string + }{ + {CategoryUnknown, "UNKNOWN"}, + {CategorySchemaMismatch, "SCHEMA_MISMATCH"}, + {CategoryParseError, "PARSE_ERROR"}, + {CategoryInternalError, "INTERNAL_ERROR"}, + {CategorySecurityError, "SECURITY_ERROR"}, + {CategoryWriteError, "WRITE_ERROR"}, + {CategoryProtocolViolation, "PROTOCOL_VIOLATION"}, + {Category(99), "Category(99)"}, + } + for _, tc := range tests { + if got := tc.c.String(); got != tc.want { + t.Fatalf("Category(%d).String() = %q, want %q", tc.c, got, tc.want) + } + } +} + +func TestPolicyString(t *testing.T) { + tests := []struct { + p Policy + want string + }{ + {PolicyAuto, "AUTO"}, + {PolicyDropAndContinue, "DROP_AND_CONTINUE"}, + {PolicyHalt, "HALT"}, + {Policy(7), "Policy(7)"}, + } + for _, tc := range tests { + if got := tc.p.String(); got != tc.want { + t.Fatalf("Policy(%d).String() = %q, want %q", tc.p, got, tc.want) + } + } +} + +func TestQwpStatusName(t *testing.T) { + tests := []struct { + status QwpStatusCode + want string + }{ + {QwpStatusOK, "OK"}, + {QwpStatusDurableAck, "DURABLE_ACK"}, + {QwpStatusSchemaMismatch, "SCHEMA_MISMATCH"}, + {QwpStatusParseError, "PARSE_ERROR"}, + {QwpStatusInternalError, "INTERNAL_ERROR"}, + {QwpStatusSecurityError, "SECURITY_ERROR"}, + {QwpStatusWriteError, "WRITE_ERROR"}, + {QwpStatusCode(42), "UNKNOWN(42)"}, + } + for _, tc := range tests { + if got := qwpStatusName(tc.status); got != 
tc.want { + t.Fatalf("qwpStatusName(0x%02X) = %q, want %q", + byte(tc.status), got, tc.want) + } + } +} + +func TestParseAckErrorPayload(t *testing.T) { + t.Run("OK", func(t *testing.T) { + data := make([]byte, 11) + data[0] = byte(QwpStatusOK) + status, seq, msg := parseAckErrorPayload(data) + if status != QwpStatusOK || seq != 0 || msg != "" { + t.Fatalf("OK payload: status=%d seq=%d msg=%q", status, seq, msg) + } + }) + + t.Run("DurableAck", func(t *testing.T) { + data := make([]byte, 3) + data[0] = byte(QwpStatusDurableAck) + status, seq, msg := parseAckErrorPayload(data) + if status != QwpStatusDurableAck || seq != 0 || msg != "" { + t.Fatalf("DurableAck payload: status=%d seq=%d msg=%q", status, seq, msg) + } + }) + + t.Run("ParseError", func(t *testing.T) { + errMsg := "invalid column" + data := make([]byte, 11+len(errMsg)) + data[0] = byte(QwpStatusParseError) + binary.LittleEndian.PutUint64(data[1:9], 7) + binary.LittleEndian.PutUint16(data[9:11], uint16(len(errMsg))) + copy(data[11:], errMsg) + + status, seq, msg := parseAckErrorPayload(data) + if status != QwpStatusParseError { + t.Fatalf("status = %d, want PARSE_ERROR", status) + } + if seq != 7 { + t.Fatalf("seq = %d, want 7", seq) + } + if msg != errMsg { + t.Fatalf("msg = %q, want %q", msg, errMsg) + } + }) + + t.Run("WriteErrorNoMessage", func(t *testing.T) { + data := make([]byte, 11) + data[0] = byte(QwpStatusWriteError) + binary.LittleEndian.PutUint64(data[1:9], 99) + + status, seq, msg := parseAckErrorPayload(data) + if status != QwpStatusWriteError { + t.Fatalf("status = %d, want WRITE_ERROR", status) + } + if seq != 99 { + t.Fatalf("seq = %d, want 99", seq) + } + if msg != "" { + t.Fatalf("msg = %q, want empty", msg) + } + }) +} diff --git a/sender_pool.go b/sender_pool.go index 0b6d6836..a2361f52 100644 --- a/sender_pool.go +++ b/sender_pool.go @@ -37,7 +37,7 @@ import ( var ( errAcquireFromClosedPool = errors.New("cannot acquire a LineSender from a closed LineSenderPool") - errHttpOnlySender 
= errors.New("tcp/s not supported for pooled senders, use http/s only") + errHttpOnlySender = errors.New("only http/s schemas are supported for pooled senders (tcp/s, ws/wss, qwpws/qwpwss are not)") errPooledSenderClose = errors.New("error closing one or more LineSenders in the pool") ) @@ -77,7 +77,7 @@ type LineSenderPoolOption func(*LineSenderPool) // The default maximum number of senders is 64, but can be customized by using the // [WithMaxSenders] option. func PoolFromConf(conf string, opts ...LineSenderPoolOption) (*LineSenderPool, error) { - if strings.HasPrefix(conf, "tcp") { + if !strings.HasPrefix(conf, "http::") && !strings.HasPrefix(conf, "https::") { return nil, errHttpOnlySender } @@ -177,7 +177,7 @@ func (p *LineSenderPool) Sender(ctx context.Context) (LineSender, error) { conf := newLineSenderConfig(httpSenderType) for _, opt := range p.opts { opt(conf) - if conf.senderType == tcpSenderType { + if conf.senderType != httpSenderType { return nil, errHttpOnlySender } } diff --git a/sender_pool_test.go b/sender_pool_test.go index 548c9a7e..b982d594 100644 --- a/sender_pool_test.go +++ b/sender_pool_test.go @@ -223,28 +223,39 @@ func TestMultiThreadedPoolWritesOverHttp(t *testing.T) { lines := []string{} - go func() { + assert.Eventually(t, func() bool { for { select { case msg := <-srv.BackCh: lines = append(lines, msg) - case <-srv.closeCh: - return default: - continue + return len(lines) == numThreads } } - }() - - assert.Eventually(t, func() bool { - return len(lines) == numThreads - }, time.Second, 100*time.Millisecond, "expected %d flushed lines but only received %d") + }, time.Second, 100*time.Millisecond, "expected %d flushed lines but only received %d", numThreads, len(lines)) } -func TestTcpNotSupported(t *testing.T) { - _, err := qdb.PoolFromConf("tcp::addr=localhost:9000") - assert.ErrorContains(t, err, "tcp/s not supported for pooled senders") +func TestNonHttpSchemasNotSupported(t *testing.T) { + cases := []string{ + 
"tcp::addr=localhost:9000", + "tcps::addr=localhost:9000", + "ws::addr=localhost:9000", + "wss::addr=localhost:9000", + "qwpws::addr=localhost:9000", + "qwpwss::addr=localhost:9000", + "grpc::addr=localhost:9000", + } + for _, conf := range cases { + t.Run(conf, func(t *testing.T) { + _, err := qdb.PoolFromConf(conf) + assert.ErrorContains(t, err, "only http/s") + }) + } +} - _, err = qdb.PoolFromConf("tcps::addr=localhost:9000") - assert.ErrorContains(t, err, "tcp/s not supported for pooled senders") +func TestPoolFromOptionsRejectsQwp(t *testing.T) { + p, err := qdb.PoolFromOptions(qdb.WithQwp(), qdb.WithAddress("localhost:9000")) + require.NoError(t, err) + _, err = p.Sender(context.Background()) + assert.ErrorContains(t, err, "only http/s") } diff --git a/system_test/enterprise_e2e/conftest.py b/system_test/enterprise_e2e/conftest.py new file mode 100644 index 00000000..99f368a6 --- /dev/null +++ b/system_test/enterprise_e2e/conftest.py @@ -0,0 +1,225 @@ +""" +Pytest root config for go-questdb-client Enterprise e2e tests. + +Registers the Enterprise shared_fixtures plugin (server_factory, +scenario_dir, obj_store, etc.) and adds a ``go_sidecar`` fixture +that launches the pre-built Go sidecar binary. + +The QUESTDB_ENTERPRISE_E2E_DIR environment variable must point at +the ``questdb-ent/e2e`` directory in the Enterprise checkout so the +plugin module is importable. 
+""" + +from __future__ import annotations + +import logging +import os +import signal +import subprocess +import time +from dataclasses import dataclass, field +from pathlib import Path +from threading import Thread +from typing import IO, Iterator, Optional + +import pytest +import sys + +_ent_e2e = os.environ.get("QUESTDB_ENTERPRISE_E2E_DIR") +if _ent_e2e: + sys.path.insert(0, _ent_e2e) + +pytest_plugins = ("lib.shared_fixtures",) + +LOG = logging.getLogger(__name__) + +SIDECAR_DIR = Path(__file__).resolve().parent / "sidecar" +SIDECAR_BIN = SIDECAR_DIR / "go-e2e-sidecar" + + +class GoSidecarError(RuntimeError): + pass + + +@dataclass +class GoSidecarStats: + acked: int + sent: int + acks: int + reconn_attempts: int + reconn_succ: int + server_errors: int + + +@dataclass +class GoSidecar: + log_dir: Path + name: str = "go-sidecar" + + process: Optional[subprocess.Popen] = field(default=None, init=False, repr=False) + _stderr_thread: Optional[Thread] = field(default=None, init=False, repr=False) + + def start(self, *, ready_timeout: float = 30.0) -> None: + if self.process is not None: + raise RuntimeError(f"sidecar {self.name!r} already started") + + binary = SIDECAR_BIN + if not binary.exists(): + raise FileNotFoundError( + f"sidecar binary not found at {binary}; " + f"run 'go build -o go-e2e-sidecar .' 
in {SIDECAR_DIR} first" + ) + + cmd = [str(binary)] + self.log_dir.mkdir(parents=True, exist_ok=True) + stderr_log = open(self.log_dir / f"{self.name}.stderr.log", "w", encoding="utf-8") + + LOG.info("starting Go sidecar %s", self.name) + self.process = subprocess.Popen( + cmd, + env=os.environ.copy(), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + start_new_session=True, + ) + + self._stderr_thread = _drain(self.process.stderr, stderr_log, f"{self.name}-stderr") + + deadline = time.monotonic() + ready_timeout + while True: + if self.process.poll() is not None: + raise RuntimeError( + f"sidecar {self.name!r} exited prematurely " + f"(code {self.process.returncode}); see " + f"{self.log_dir / f'{self.name}.stderr.log'}" + ) + if time.monotonic() > deadline: + raise TimeoutError( + f"sidecar {self.name!r} did not READY within {ready_timeout}s" + ) + line = _readline(self.process.stdout, 0.5) + if line is None: + continue + line = line.strip() + if line == "READY": + break + LOG.warning("sidecar %s pre-READY: %r", self.name, line) + + def stop(self) -> None: + if self.process is None or self.process.poll() is not None: + return + try: + self._send("EXIT") + except (BrokenPipeError, OSError): + pass + try: + self.process.wait(timeout=15) + except subprocess.TimeoutExpired: + LOG.warning("sidecar %s did not exit after EXIT, escalating to SIGKILL", self.name) + self.process.kill() + self.process.wait(timeout=5) + + def kill_9(self) -> None: + if self.process is None or self.process.poll() is not None: + return + LOG.info("kill -9 sidecar %s pid=%d", self.name, self.process.pid) + try: + os.killpg(os.getpgid(self.process.pid), signal.SIGKILL) + except ProcessLookupError: + pass + try: + self.process.wait(timeout=10) + except subprocess.TimeoutExpired: + LOG.error("sidecar %s did not exit after SIGKILL within 10s", self.name) + + # ---- protocol verbs ---- + + def connect(self, connect_string: str) -> None: + self._send(f"CONNECT 
{connect_string}") + self._expect_ok() + + def send(self, table: str, count: int, start_index: int = 0) -> None: + self._send(f"SEND {table} {count} {start_index}") + self._expect_ok() + + def flush(self) -> int: + self._send("FLUSH") + reply = self._expect_ok() + return int(reply[0]) if reply else -1 + + def await_acked(self, fsn: int, timeout_ms: int) -> bool: + self._send(f"AWAIT_ACKED {fsn} {timeout_ms}") + reply = self._expect_ok() + return reply[0] == "true" if reply else False + + def stats(self) -> GoSidecarStats: + self._send("STATS") + reply = self._expect_ok() + kv = dict(p.split("=", 1) for p in reply if "=" in p) + return GoSidecarStats( + acked=int(kv.get("acked", -1)), + sent=int(kv.get("sent", 0)), + acks=int(kv.get("acks", 0)), + reconn_attempts=int(kv.get("reconnAttempts", 0)), + reconn_succ=int(kv.get("reconnSucc", 0)), + server_errors=int(kv.get("serverErrors", 0)), + ) + + def close(self) -> None: + self._send("CLOSE") + self._expect_ok() + + # ---- internals ---- + + def _send(self, line: str) -> None: + if self.process is None or self.process.poll() is not None: + raise RuntimeError(f"sidecar {self.name!r} is not running") + assert self.process.stdin is not None + self.process.stdin.write((line + "\n").encode("utf-8")) + self.process.stdin.flush() + + def _expect_ok(self) -> list[str]: + if self.process is None: + raise RuntimeError("sidecar not running") + line = _readline(self.process.stdout, 60.0) + if line is None: + raise RuntimeError("sidecar produced no reply (timeout or EOF)") + line = line.strip() + if line.startswith("OK"): + return line.split()[1:] + if line.startswith("ERR"): + raise GoSidecarError(line[len("ERR "):]) + raise RuntimeError(f"unexpected sidecar reply: {line!r}") + + +def _readline(stream: IO[bytes], timeout: float) -> Optional[str]: + import select + readable, _, _ = select.select([stream], [], [], timeout) + if not readable: + return None + line = stream.readline() + if not line: + return None + return 
line.decode("utf-8", errors="replace") + + +def _drain(stream: IO[bytes], sink, label: str) -> Thread: + def _run(): + for raw in stream: + sink.write(raw.decode("utf-8", errors="replace")) + sink.close() + + t = Thread(target=_run, name=label, daemon=True) + t.start() + return t + + +@pytest.fixture(scope="function") +def go_sidecar(log_dir: Path) -> Iterator[GoSidecar]: + s = GoSidecar(log_dir=log_dir, name="go-sidecar") + s.start() + try: + yield s + finally: + s.stop() diff --git a/system_test/enterprise_e2e/pyproject.toml b/system_test/enterprise_e2e/pyproject.toml new file mode 100644 index 00000000..997b2d2f --- /dev/null +++ b/system_test/enterprise_e2e/pyproject.toml @@ -0,0 +1,24 @@ +[project] +name = "go-questdb-client-enterprise-e2e" +version = "0.1.0" +description = "Enterprise e2e tests for the Go QuestDB client (QWiP durable-ack)." +requires-python = ">=3.10" +dependencies = [ + "pytest>=8.0", + "pytest-randomly>=3.15", + "psycopg[binary]>=3.1", +] + +[tool.pytest.ini_options] +testpaths = ["tests"] +addopts = [ + "-ra", + "-v", + "--strict-markers", + "--tb=short", +] +markers = [ + "go_client: Go client e2e tests against Enterprise QuestDB", +] +log_cli = true +log_cli_level = "INFO" diff --git a/system_test/enterprise_e2e/sidecar/go.mod b/system_test/enterprise_e2e/sidecar/go.mod new file mode 100644 index 00000000..b1caef7b --- /dev/null +++ b/system_test/enterprise_e2e/sidecar/go.mod @@ -0,0 +1,13 @@ +module github.com/questdb/go-questdb-client/v4/system_test/enterprise_e2e/sidecar + +go 1.23 + +require github.com/questdb/go-questdb-client/v4 v4.0.0 + +require ( + github.com/coder/websocket v1.8.14 // indirect + github.com/klauspost/compress v1.18.4 // indirect + golang.org/x/sys v0.16.0 // indirect +) + +replace github.com/questdb/go-questdb-client/v4 => ../../.. 
diff --git a/system_test/enterprise_e2e/sidecar/go.sum b/system_test/enterprise_e2e/sidecar/go.sum new file mode 100644 index 00000000..a05ef12d --- /dev/null +++ b/system_test/enterprise_e2e/sidecar/go.sum @@ -0,0 +1,94 @@ +dario.cat/mergo v1.0.0 h1:AGCNq9Evsj31mOgNPcLyXc+4PNABt905YmuqPYYpBWk= +dario.cat/mergo v1.0.0/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= +github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0= +github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= +github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow= +github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM= +github.com/Microsoft/hcsshim v0.11.4 h1:68vKo2VN8DE9AdN4tnkWnmdhqdbpUFM8OF3Airm7fz8= +github.com/Microsoft/hcsshim v0.11.4/go.mod h1:smjE4dvqPX9Zldna+t5FG3rnoHhaB7QYxPRqGcpAD9w= +github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqylYbM= +github.com/cenkalti/backoff/v4 v4.2.1/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/coder/websocket v1.8.14 h1:9L0p0iKiNOibykf283eHkKUHHrpG7f65OE3BhhO7v9g= +github.com/coder/websocket v1.8.14/go.mod h1:NX3SzP+inril6yawo5CQXx8+fk145lPDC6pumgx0mVg= +github.com/containerd/containerd v1.7.12 h1:+KQsnv4VnzyxWcfO9mlxxELaoztsDEjOuCMPAuPqgU0= +github.com/containerd/containerd v1.7.12/go.mod h1:/5OMpE1p0ylxtEUGY8kuCYkDRzJm9NO1TFMWjUpdevk= +github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= +github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= +github.com/cpuguy83/dockercfg v0.3.1 h1:/FpZ+JaygUR/lZP2NlFI2DVfrOEMAIKP5wWEJdoYe9E= +github.com/cpuguy83/dockercfg v0.3.1/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHfjj5/jFyUJc= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod 
h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/docker/distribution v2.8.2+incompatible h1:T3de5rq0dB1j30rp0sA2rER+m322EBzniBPB6ZIzuh8= +github.com/docker/distribution v2.8.2+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w= +github.com/docker/docker v24.0.9+incompatible h1:HPGzNmwfLZWdxHqK9/II92pyi1EpYKsAqcl4G0Of9v0= +github.com/docker/docker v24.0.9+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c= +github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= +github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= +github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/go-ole/go-ole v1.3.0 h1:Dt6ye7+vXGIKZ7Xtk4s6/xVdGDQynvom7xCFEdWr6uE= +github.com/go-ole/go-ole v1.3.0/go.mod h1:5LS6F96DhAwUc7C+1HLexzMXY1xGRSryjyPPKW6zv78= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= +github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/klauspost/compress v1.18.4 h1:RPhnKRAQ4Fh8zU2FY/6ZFDwTVTxgJ/EMydqSTzE9a2c= +github.com/klauspost/compress v1.18.4/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4= +github.com/lufia/plan9stats v0.0.0-20230326075908-cb1d2100619a h1:N9zuLhTvBSRt0gWSiJswwQ2HqDmtX/ZCDJURnKUt1Ik= +github.com/lufia/plan9stats v0.0.0-20230326075908-cb1d2100619a/go.mod h1:JKx41uQRwqlTZabZc+kILPrO/3jlKnQ2Z8b7YiVw5cE= +github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= 
+github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= +github.com/moby/patternmatcher v0.6.0 h1:GmP9lR19aU5GqSSFko+5pRqHi+Ohk1O69aFiKkVGiPk= +github.com/moby/patternmatcher v0.6.0/go.mod h1:hDPoyOpDY7OrrMDLaYoY3hf52gNCR/YOUYxkhApJIxc= +github.com/moby/sys/sequential v0.5.0 h1:OPvI35Lzn9K04PBbCLW0g4LcFAJgHsvXsRyewg5lXtc= +github.com/moby/sys/sequential v0.5.0/go.mod h1:tH2cOOs5V9MlPiXcQzRC+eEyab644PWKGRYaaV5ZZlo= +github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= +github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= +github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= +github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc= +github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= +github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= +github.com/opencontainers/image-spec v1.1.0-rc5 h1:Ygwkfw9bpDvs+c9E34SdgGOj41dX/cbdlwvlWt0pnFI= +github.com/opencontainers/image-spec v1.1.0-rc5/go.mod h1:X4pATf0uXsnn3g5aiGIsVnJBR4mxhKzfwmvK/B2NTm8= +github.com/opencontainers/runc v1.1.5 h1:L44KXEpKmfWDcS02aeGm8QNTFXTo2D+8MYGDIJ/GDEs= +github.com/opencontainers/runc v1.1.5/go.mod h1:1J5XiS+vdZ3wCyZybsuxXZWGrgSr8fFJHLXuG2PsnNg= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/power-devops/perfstat v0.0.0-20221212215047-62379fc7944b h1:0LFwY6Q3gMACTjAbMZBjXAqTOzOwFaj2Ld6cjeQ7Rig= +github.com/power-devops/perfstat v0.0.0-20221212215047-62379fc7944b/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= +github.com/shirou/gopsutil/v3 v3.23.12 
h1:z90NtUkp3bMtmICZKpC4+WaknU1eXtp5vtbQ11DgpE4= +github.com/shirou/gopsutil/v3 v3.23.12/go.mod h1:1FrWgea594Jp7qmjHUUPlJDTPgcsb9mGnXDxavtikzM= +github.com/shoenig/go-m1cpu v0.1.6 h1:nxdKQNcEB6vzgA2E2bvzKIYRuNj7XNJ4S/aRSwKzFtM= +github.com/shoenig/go-m1cpu v0.1.6/go.mod h1:1JJMcUBvfNwpq05QDQVAnx3gUHr9IYF7GNg9SUEw2VQ= +github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= +github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/testcontainers/testcontainers-go v0.26.0 h1:uqcYdoOHBy1ca7gKODfBd9uTHVK3a7UL848z09MVZ0c= +github.com/testcontainers/testcontainers-go v0.26.0/go.mod h1:ICriE9bLX5CLxL9OFQ2N+2N+f+803LNJ1utJb1+Inx0= +github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU= +github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI= +github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk= +github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY= +github.com/yusufpapurcu/wmi v1.2.3 h1:E1ctvB7uKFMOJw3fdOW32DwGE9I7t++CRUEMKvFoFiw= +github.com/yusufpapurcu/wmi v1.2.3/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= +golang.org/x/exp v0.0.0-20231005195138-3e424a577f31 h1:9k5exFQKQglLo+RoP+4zMjOFE14P6+vyR0baDAi0Rcs= +golang.org/x/exp v0.0.0-20231005195138-3e424a577f31/go.mod h1:S2oDrQGGwySpoQPVqRShND87VCbxmc6bL1Yd2oYrm6k= +golang.org/x/mod v0.13.0 h1:I/DsJXRlw/8l/0c24sM9yb0T4z9liZTduXvdAWYiysY= +golang.org/x/mod v0.13.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU= +golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/tools v0.14.0 
h1:jvNa2pY0M4r62jkRQ6RwEZZyPcymeL9XZMLBbV7U2nc= +golang.org/x/tools v0.14.0/go.mod h1:uYBEerGOWcJyEORxN+Ek8+TT266gXkNlHdJBwexUsBg= +google.golang.org/genproto/googleapis/rpc v0.0.0-20231002182017-d307bd883b97 h1:6GQBEOdGkX6MMTLT9V+TjtIRZCw9VPD5Z+yHY9wMgS0= +google.golang.org/genproto/googleapis/rpc v0.0.0-20231002182017-d307bd883b97/go.mod h1:v7nGkzlmW8P3n/bKmWBn2WpBjpOEx8Q6gMueudAmKfY= +google.golang.org/grpc v1.58.3 h1:BjnpXut1btbtgN/6sp+brB2Kbm2LjNXnidYujAVbSoQ= +google.golang.org/grpc v1.58.3/go.mod h1:tgX3ZQDlNJGU96V6yHh1T/JeoBQ2TXdr43YbYSsCJk0= +google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= +google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/system_test/enterprise_e2e/sidecar/main.go b/system_test/enterprise_e2e/sidecar/main.go new file mode 100644 index 00000000..13da71a3 --- /dev/null +++ b/system_test/enterprise_e2e/sidecar/main.go @@ -0,0 +1,187 @@ +/*+***************************************************************************** + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2026 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +package main + +import ( + "bufio" + "context" + "fmt" + "os" + "strconv" + "strings" + "time" + + qdb "github.com/questdb/go-questdb-client/v4" +) + +func main() { + fmt.Println("READY") + + var sender qdb.LineSender + scanner := bufio.NewScanner(os.Stdin) + + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line == "" { + continue + } + + parts := strings.Fields(line) + verb := strings.ToUpper(parts[0]) + + switch verb { + case "CONNECT": + connectString := strings.TrimSpace(line[len(parts[0]):]) + if sender != nil { + closeSender(sender) + } + var err error + sender, err = qdb.LineSenderFromConf(context.Background(), connectString) + if err != nil { + reply("ERR " + err.Error()) + continue + } + reply("OK") + + case "SEND": + if sender == nil { + reply("ERR no active sender; call CONNECT first") + continue + } + table := parts[1] + count, _ := strconv.Atoi(parts[2]) + startIndex := 0 + if len(parts) > 3 { + startIndex, _ = strconv.Atoi(parts[3]) + } + var lastErr error + for i := 0; i < count; i++ { + idx := startIndex + i + err := sender. + Table(table). + Symbol("tag", fmt.Sprintf("test_%d", idx)). + Int64Column("v", int64(idx)). 
+ At(context.Background(), time.Now()) + if err != nil { + lastErr = err + break + } + } + if lastErr != nil { + reply("ERR " + lastErr.Error()) + } else { + reply("OK") + } + + case "FLUSH": + if sender == nil { + reply("ERR no active sender; call CONNECT first") + continue + } + if qwp, ok := sender.(qdb.QwpSender); ok { + fsn, err := qwp.FlushAndGetSequence(context.Background()) + if err != nil { + reply("ERR " + err.Error()) + } else { + reply(fmt.Sprintf("OK %d", fsn)) + } + } else { + err := sender.Flush(context.Background()) + if err != nil { + reply("ERR " + err.Error()) + } else { + reply("OK -1") + } + } + + case "AWAIT_ACKED": + if sender == nil { + reply("ERR no active sender; call CONNECT first") + continue + } + fsn, _ := strconv.ParseInt(parts[1], 10, 64) + timeoutMs, _ := strconv.Atoi(parts[2]) + if qwp, ok := sender.(qdb.QwpSender); ok { + ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeoutMs)*time.Millisecond) + err := qwp.AwaitAckedFsn(ctx, fsn) + cancel() + if err != nil { + reply("OK false") + } else { + reply("OK true") + } + } else { + reply("OK true") + } + + case "STATS": + if sender == nil { + reply("ERR no active sender; call CONNECT first") + continue + } + if qwp, ok := sender.(qdb.QwpSender); ok { + reply(fmt.Sprintf("OK acked=%d sent=0 acks=0 reconnAttempts=%d reconnSucc=%d serverErrors=%d", + qwp.AckedFsn(), + qwp.TotalReconnectAttempts(), + qwp.TotalReconnectsSucceeded(), + qwp.TotalServerErrors())) + } else { + reply("OK acked=-1 sent=0 acks=0 reconnAttempts=0 reconnSucc=0 serverErrors=0") + } + + case "CLOSE": + if sender != nil { + closeSender(sender) + sender = nil + } + reply("OK") + + case "EXIT": + if sender != nil { + closeSender(sender) + sender = nil + } + reply("OK") + return + + default: + reply("ERR unknown verb: " + verb) + } + } + + if sender != nil { + closeSender(sender) + } +} + +func closeSender(s qdb.LineSender) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 
+ defer cancel() + _ = s.Close(ctx) +} + +func reply(msg string) { + fmt.Println(msg) +} diff --git a/system_test/enterprise_e2e/tests/__init__.py b/system_test/enterprise_e2e/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/system_test/enterprise_e2e/tests/test_go_client.py b/system_test/enterprise_e2e/tests/test_go_client.py new file mode 100644 index 00000000..7e36669f --- /dev/null +++ b/system_test/enterprise_e2e/tests/test_go_client.py @@ -0,0 +1,202 @@ +""" +Deterministic failover tests for the Go QWiP client against +QuestDB Enterprise. + +Each test mirrors the pattern in questdb-ent/e2e/tests/test_failover.py: +start a primary, send rows via the Go sidecar, kill -9 the primary, +start a successor, and verify no rows were lost. +""" + +from __future__ import annotations + +import logging +import shutil +import time +from pathlib import Path + +import pytest + +from lib.obj_store import ObjStore +from lib.pg_query import wait_for_dense_sequence +from lib.server import wait_port_free + +LOG = logging.getLogger(__name__) + + +def _connect_string(http_port: int, sf_dir: Path, *, + reconnect_max_ms: int = 60_000, + close_flush_timeout_ms: int = 5_000) -> str: + parts = [ + f"ws::addr=127.0.0.1:{http_port}", + "username=admin", + "password=quest", + f"sf_dir={sf_dir}", + f"reconnect_max_duration_millis={reconnect_max_ms}", + f"close_flush_timeout_millis={close_flush_timeout_ms}", + ] + return ";".join(parts) + ";" + + +@pytest.mark.go_client +@pytest.mark.xfail(reason="request_durable_ack=on not yet implemented in Go client") +def test_kill9_primary_failover_no_data_loss(server_factory, go_sidecar, + obj_store: ObjStore, scenario_dir: Path) -> None: + """Kill -9 P1 mid-flight, verify P2 has every row.""" + table = "go_trades_failover" + row_count = 50 + sf_dir = scenario_dir / "sf" + + p1 = server_factory("p1") + p1_ports = p1.start() + + go_sidecar.connect(_connect_string(p1_ports.http, sf_dir)) + go_sidecar.send(table, 
count=row_count, start_index=0) + go_sidecar.flush() + + time.sleep(0.5) + + p1.kill_9() + wait_port_free(p1_ports.http) + wait_port_free(p1_ports.pg) + + if p1.db_root.exists(): + shutil.rmtree(p1.db_root) + obj_store.wipe() + + p2 = server_factory("p2", db_root_name="p2-fresh") + p2.start(http_port=p1_ports.http, pg_port=p1_ports.pg) + + wait_for_dense_sequence(port=p1_ports.pg, table=table, + expected_count=row_count, timeout_s=60.0) + + +@pytest.mark.go_client +@pytest.mark.xfail(reason="request_durable_ack=on not yet implemented in Go client") +def test_failover_during_active_send(server_factory, go_sidecar, + obj_store: ObjStore, scenario_dir: Path) -> None: + """Kill P1 while the sender is still pushing batches.""" + table = "go_trades_inflight" + sf_dir = scenario_dir / "sf" + batches = 5 + rows_per_batch = 20 + expected = batches * rows_per_batch + + p1 = server_factory("p1") + p1_ports = p1.start() + go_sidecar.connect(_connect_string(p1_ports.http, sf_dir)) + + go_sidecar.send(table, count=rows_per_batch, start_index=0) + go_sidecar.flush() + for i in range(1, batches): + go_sidecar.send(table, count=rows_per_batch, start_index=i * rows_per_batch) + + p1.kill_9() + wait_port_free(p1_ports.http) + wait_port_free(p1_ports.pg) + + if p1.db_root.exists(): + shutil.rmtree(p1.db_root) + obj_store.wipe() + + p2 = server_factory("p2", db_root_name="p2-fresh") + p2.start(http_port=p1_ports.http, pg_port=p1_ports.pg) + + go_sidecar.flush() + + wait_for_dense_sequence(port=p1_ports.pg, table=table, + expected_count=expected, timeout_s=60.0) + + +@pytest.mark.go_client +@pytest.mark.xfail(reason="request_durable_ack=on not yet implemented in Go client") +def test_two_failovers_in_one_scenario(server_factory, go_sidecar, + obj_store: ObjStore, scenario_dir: Path) -> None: + """Multiple failovers in a row — no row should be lost.""" + table = "go_trades_two_fail" + sf_dir = scenario_dir / "sf" + rows_per_phase = 25 + expected = rows_per_phase * 3 + + # Phase 1. 
+ p1 = server_factory("p1") + p1_ports = p1.start() + go_sidecar.connect(_connect_string(p1_ports.http, sf_dir)) + go_sidecar.send(table, count=rows_per_phase, start_index=0) + go_sidecar.flush() + time.sleep(0.5) + p1.kill_9() + wait_port_free(p1_ports.http) + wait_port_free(p1_ports.pg) + if p1.db_root.exists(): + shutil.rmtree(p1.db_root) + obj_store.wipe() + + # Phase 2. + p2 = server_factory("p2", db_root_name="p2-fresh") + p2.start(http_port=p1_ports.http, pg_port=p1_ports.pg) + go_sidecar.send(table, count=rows_per_phase, start_index=rows_per_phase) + go_sidecar.flush() + time.sleep(0.5) + p2.kill_9() + wait_port_free(p1_ports.http) + wait_port_free(p1_ports.pg) + if p2.db_root.exists(): + shutil.rmtree(p2.db_root) + obj_store.wipe() + + # Phase 3. + p3 = server_factory("p3", db_root_name="p3-fresh") + p3.start(http_port=p1_ports.http, pg_port=p1_ports.pg) + go_sidecar.send(table, count=rows_per_phase, start_index=rows_per_phase * 2) + go_sidecar.flush() + + wait_for_dense_sequence(port=p1_ports.pg, table=table, + expected_count=expected, timeout_s=90.0) + + +@pytest.mark.go_client +def test_ok_trim_loses_rows_without_durable_ack(server_factory, go_sidecar, + obj_store: ObjStore, scenario_dir: Path) -> None: + """Go client doesn't support durable-ack yet; SF trims on OK. Killing + P1 between OK and WAL upload, then wiping everything, should lose rows. 
+ This is the expected negative case that proves the harness works.""" + table = "go_trades_no_durable" + sf_dir = scenario_dir / "sf" + row_count = 50 + + p1 = server_factory("p1") + p1_ports = p1.start() + + go_sidecar.connect(_connect_string(p1_ports.http, sf_dir)) + go_sidecar.send(table, count=row_count, start_index=0) + fsn = go_sidecar.flush() + go_sidecar.await_acked(fsn, timeout_ms=30_000) + + p1.kill_9() + wait_port_free(p1_ports.http) + wait_port_free(p1_ports.pg) + + if p1.db_root.exists(): + shutil.rmtree(p1.db_root) + obj_store.wipe() + + p2 = server_factory("p2", db_root_name="p2-fresh") + p2_ports = p2.start(http_port=p1_ports.http, pg_port=p1_ports.pg) + + time.sleep(5) + + import psycopg + try: + conn = psycopg.connect( + f"host=127.0.0.1 port={p2_ports.pg} user=admin password=quest dbname=qdb", + autocommit=True, + ) + cur = conn.execute(f"SELECT count() FROM '{table}'") + actual = cur.fetchone()[0] + conn.close() + except Exception: + actual = 0 + + assert actual < row_count, ( + f"Expected data loss without durable-ack but got {actual}/{row_count} rows" + ) diff --git a/tcp_sender_test.go b/tcp_sender_test.go index 8980abb3..f3d99236 100644 --- a/tcp_sender_test.go +++ b/tcp_sender_test.go @@ -27,12 +27,14 @@ package questdb_test import ( "context" "fmt" + "net" "os" "testing" "time" qdb "github.com/questdb/go-questdb-client/v4" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) const ( @@ -263,7 +265,16 @@ func TestErrorOnFlushWhenMessageIsPending(t *testing.T) { func TestErrorOnUnavailableServer(t *testing.T) { ctx := context.Background() - _, err := qdb.NewLineSender(ctx, qdb.WithTcp()) + // Reserve a free port and immediately release it. The default TCP + // address (127.0.0.1:9009) is QuestDB's standard ILP port, so on + // any developer machine running QuestDB locally the dial would + // succeed and this test would falsely fail. 
+ l, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + addr := l.Addr().String() + require.NoError(t, l.Close()) + + _, err = qdb.NewLineSender(ctx, qdb.WithTcp(), qdb.WithAddress(addr)) assert.ErrorContains(t, err, "failed to connect to server") } diff --git a/utils_test.go b/utils_test.go index a0e6cc1f..8662c1cf 100644 --- a/utils_test.go +++ b/utils_test.go @@ -34,13 +34,13 @@ import ( "net" "net/http" "reflect" + "slices" "sync" "sync/atomic" "testing" "time" "github.com/stretchr/testify/assert" - "golang.org/x/exp/slices" ) type serverType int64