From b7969a47e177680fed1c55efd9ecbc5da9c10eea Mon Sep 17 00:00:00 2001 From: Aevum Decessus Date: Wed, 6 May 2026 10:36:43 -0400 Subject: [PATCH 1/9] Send Sparkplug rebirth on first PUBLISH per session Wires the Sparkplug B "Node Control/Rebirth = true" command into MQTTLogger so the thermostat dumps full state automatically on connect. Triggered by the first qualifying PUBLISH (the same condition that already populates liveClients) rather than the raw CONNECT, because the firmware needs the session fully set up before it will act on NCMD traffic. A single CT_BOOL=true on Node Control/Rebirth provokes ~2200 config entries within ~17s, including the full schedule and per-activity setpoints that incremental deltas alone never republish. The rebirth-sent gate is reset on every CONNECT so reconnects re-trigger. --- cmd/anantha/cmd/serve.go | 52 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/cmd/anantha/cmd/serve.go b/cmd/anantha/cmd/serve.go index 78218c9..e40e9b2 100644 --- a/cmd/anantha/cmd/serve.go +++ b/cmd/anantha/cmd/serve.go @@ -50,6 +50,7 @@ type MQTTLogger struct { iotMQTTClient mqtt_paho.Client clientID string thingNameOverride string + cmdTopic string subscribedTopics map[string]struct{} subscribedTopicsLock sync.Mutex @@ -59,6 +60,38 @@ type MQTTLogger struct { loadedValues *LoadedValues liveClients map[string]struct{} + + rebirthSent map[string]struct{} + rebirthSentLock sync.Mutex +} + +// sendRebirth publishes a Sparkplug B "Node Control/Rebirth" = true command on +// the NCMD topic. The Carrier firmware honors this: it dumps full state +// (schedule, activity setpoints, ~2200 entries) within ~17 seconds. 
+func (m *MQTTLogger) sendRebirth() { + msg := &carrier.CarrierInfo{ + TimestampMillis: time.Now().UnixMilli(), + ConfigSettings: []*carrier.ConfigSetting{ + { + Name: "Node Control/Rebirth", + ConfigType: carrier.ConfigType_CT_BOOL, + Value: &carrier.ConfigSetting_BoolValue{ + BoolValue: true, + }, + }, + }, + Uuid: uuid.New().String(), + } + encoded, err := proto.Marshal(msg) + if err != nil { + log.Printf("Failed to encode rebirth proto: %s", err) + return + } + if err := m.server.Publish(m.cmdTopic, encoded, false, 0); err != nil { + log.Printf("Failed to send rebirth: %s", err) + return + } + log.Printf("Sent Node Control/Rebirth to %s", m.cmdTopic) } // ID returns the ID of the hook. @@ -96,6 +129,16 @@ func (m *MQTTLogger) OnPacketRead(cl *mqtt.Client, pk packets.Packet) (packets.P (m.thingNameOverride == "" && strings.HasSuffix(pk.TopicName, m.clientID)) { // Client sent initial PUBLISH - ready to poll it m.liveClients[cl.ID] = struct{}{} + + m.rebirthSentLock.Lock() + _, alreadySent := m.rebirthSent[cl.ID] + if !alreadySent { + m.rebirthSent[cl.ID] = struct{}{} + } + m.rebirthSentLock.Unlock() + if !alreadySent { + m.sendRebirth() + } } protoFilename := fmt.Sprintf("%s-%s.pb", strings.ReplaceAll(string(pk.TopicName), "/", "_"), time.Now().Format(time.RFC3339Nano)) if err := os.WriteFile( @@ -168,6 +211,13 @@ func (m *MQTTLogger) OnPacketRead(cl *mqtt.Client, pk packets.Packet) (packets.P case packets.Connect: // Empty liveClients list on CONNECT. Make sure we get a PUBLISH spBv1.0/WallCtrl/NDATA/ before polling m.liveClients = map[string]struct{}{} + // Reset the rebirth-sent gate so the next qualifying PUBLISH triggers a + // fresh rebirth for this new session. CONNECT is the canonical "new + // session" signal — more reliable than DISCONNECT, which won't fire on + // abrupt drops (power loss, wifi flap with no clean LWT). 
+ m.rebirthSentLock.Lock() + m.rebirthSent = map[string]struct{}{} + m.rebirthSentLock.Unlock() case packets.Pingreq: // Don't log PINGREQ default: @@ -1048,9 +1098,11 @@ func runServe(cmd *cobra.Command, args []string) error { iotMQTTClient: awsIOTMQTTClient, clientID: clientID, thingNameOverride: thingNameOverride, + cmdTopic: cmdTopic, subscribedTopics: make(map[string]struct{}), loadedValues: loadedValues, liveClients: make(map[string]struct{}), + rebirthSent: make(map[string]struct{}), } if err := server.AddHook(mLogger, nil); err != nil { From c4de4a8c48b72d7d28a05a68d0dc0e295e542689 Mon Sep 17 00:00:00 2001 From: Aevum Decessus Date: Wed, 6 May 2026 11:44:10 -0400 Subject: [PATCH 2/9] Delay rebirth 120s after first PUBLISH The thermostat firmware silently drops Sparkplug rebirth requests received during its own NBIRTH window after CONNECT. Firing immediately on the first qualifying PUBLISH (the previous behavior) consistently produced no observable response. Empirically, a rebirth request fired ~30s after the first PUBLISH still gets dropped, while one fired ~90s in succeeds and triggers the full ~2200-entry state dump that this feature is meant to recover. Schedule the rebirth for 120s after the first qualifying PUBLISH per session, with the timer bound to a context.CancelFunc so a CONNECT during the wait can interrupt it cleanly. The schedule call is idempotent per client, so subsequent PUBLISHes during the wait are no-ops, and CONNECT cancels any in-flight timer before resetting the map so reconnects re-schedule. The fire goroutine removes its own map entry before publishing to avoid a cancel-race when CONNECT arrives at the same instant the timer expires. 
--- cmd/anantha/cmd/serve.go | 73 +++++++++++++++++++++++++++++----------- 1 file changed, 53 insertions(+), 20 deletions(-) diff --git a/cmd/anantha/cmd/serve.go b/cmd/anantha/cmd/serve.go index e40e9b2..1c8af32 100644 --- a/cmd/anantha/cmd/serve.go +++ b/cmd/anantha/cmd/serve.go @@ -1,6 +1,7 @@ package cmd import ( + "context" "crypto/tls" "embed" "errors" @@ -61,8 +62,9 @@ type MQTTLogger struct { liveClients map[string]struct{} - rebirthSent map[string]struct{} - rebirthSentLock sync.Mutex + rebirthCancels map[string]context.CancelFunc + rebirthLock sync.Mutex + rebirthDelay time.Duration } // sendRebirth publishes a Sparkplug B "Node Control/Rebirth" = true command on @@ -94,6 +96,42 @@ func (m *MQTTLogger) sendRebirth() { log.Printf("Sent Node Control/Rebirth to %s", m.cmdTopic) } +// scheduleRebirth arranges for sendRebirth to fire after rebirthDelay. The +// firmware silently drops rebirth requests received during its own NBIRTH +// window (~T+30s after CONNECT); 120s clears that with margin. Idempotent per +// client: subsequent calls during the wait are no-ops. Cancellable on CONNECT +// via the stored cancel func. +func (m *MQTTLogger) scheduleRebirth(clientID string) { + m.rebirthLock.Lock() + if _, exists := m.rebirthCancels[clientID]; exists { + m.rebirthLock.Unlock() + return + } + ctx, cancel := context.WithCancel(context.Background()) + m.rebirthCancels[clientID] = cancel + m.rebirthLock.Unlock() + + log.Printf("Scheduled Node Control/Rebirth in %s for %s", m.rebirthDelay, clientID) + + go func() { + select { + case <-ctx.Done(): + log.Printf("Cancelled scheduled rebirth for %s", clientID) + return + case <-time.After(m.rebirthDelay): + } + + // Remove ourselves from the map BEFORE publishing so a CONNECT racing + // with us doesn't try to cancel an already-completed timer. Calling + // cancel() on an already-completed context is harmless either way. 
+ m.rebirthLock.Lock() + delete(m.rebirthCancels, clientID) + m.rebirthLock.Unlock() + + m.sendRebirth() + }() +} + // ID returns the ID of the hook. func (m *MQTTLogger) ID() string { return "logger" @@ -129,16 +167,7 @@ func (m *MQTTLogger) OnPacketRead(cl *mqtt.Client, pk packets.Packet) (packets.P (m.thingNameOverride == "" && strings.HasSuffix(pk.TopicName, m.clientID)) { // Client sent initial PUBLISH - ready to poll it m.liveClients[cl.ID] = struct{}{} - - m.rebirthSentLock.Lock() - _, alreadySent := m.rebirthSent[cl.ID] - if !alreadySent { - m.rebirthSent[cl.ID] = struct{}{} - } - m.rebirthSentLock.Unlock() - if !alreadySent { - m.sendRebirth() - } + m.scheduleRebirth(cl.ID) } protoFilename := fmt.Sprintf("%s-%s.pb", strings.ReplaceAll(string(pk.TopicName), "/", "_"), time.Now().Format(time.RFC3339Nano)) if err := os.WriteFile( @@ -211,13 +240,16 @@ func (m *MQTTLogger) OnPacketRead(cl *mqtt.Client, pk packets.Packet) (packets.P case packets.Connect: // Empty liveClients list on CONNECT. Make sure we get a PUBLISH spBv1.0/WallCtrl/NDATA/ before polling m.liveClients = map[string]struct{}{} - // Reset the rebirth-sent gate so the next qualifying PUBLISH triggers a - // fresh rebirth for this new session. CONNECT is the canonical "new - // session" signal — more reliable than DISCONNECT, which won't fire on - // abrupt drops (power loss, wifi flap with no clean LWT). - m.rebirthSentLock.Lock() - m.rebirthSent = map[string]struct{}{} - m.rebirthSentLock.Unlock() + // Cancel any in-flight rebirth timers from a prior session, then reset + // the map so the next qualifying PUBLISH re-schedules. CONNECT is the + // canonical "new session" signal - more reliable than DISCONNECT, which + // won't fire on abrupt drops (power loss, wifi flap with no clean LWT). 
+ m.rebirthLock.Lock() + for _, cancel := range m.rebirthCancels { + cancel() + } + m.rebirthCancels = map[string]context.CancelFunc{} + m.rebirthLock.Unlock() case packets.Pingreq: // Don't log PINGREQ default: @@ -1102,7 +1134,8 @@ func runServe(cmd *cobra.Command, args []string) error { subscribedTopics: make(map[string]struct{}), loadedValues: loadedValues, liveClients: make(map[string]struct{}), - rebirthSent: make(map[string]struct{}), + rebirthCancels: make(map[string]context.CancelFunc), + rebirthDelay: 120 * time.Second, } if err := server.AddHook(mLogger, nil); err != nil { From 0d872c42f0b4b9269a8a5c7319e370037ce309f0 Mon Sep 17 00:00:00 2001 From: Aevum Decessus Date: Wed, 6 May 2026 11:59:43 -0400 Subject: [PATCH 3/9] Make rebirth gate sticky per session The firmware responds to a successful rebirth by sending a 30 KB NBIRTH publish on the topic suffix that matches our trigger condition (e.g. spBv1.0/WallCtrl/NBIRTH/). With only the in-flight cancel map as a gate, that response was treated as a fresh "first qualifying PUBLISH" and re-scheduled the rebirth, producing an indefinite ~4.5-minute cycle of rebirth -> NBIRTH -> reschedule -> rebirth. Add a separate rebirthFired set that the goroutine populates after the publish completes. scheduleRebirth checks both rebirthFired and rebirthCancels and bails on either. Only CONNECT clears the fired set, matching the "once per session" semantic the original gate intended. 
--- cmd/anantha/cmd/serve.go | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/cmd/anantha/cmd/serve.go b/cmd/anantha/cmd/serve.go index 1c8af32..e6b934b 100644 --- a/cmd/anantha/cmd/serve.go +++ b/cmd/anantha/cmd/serve.go @@ -63,6 +63,7 @@ type MQTTLogger struct { liveClients map[string]struct{} rebirthCancels map[string]context.CancelFunc + rebirthFired map[string]struct{} rebirthLock sync.Mutex rebirthDelay time.Duration } @@ -98,12 +99,18 @@ func (m *MQTTLogger) sendRebirth() { // scheduleRebirth arranges for sendRebirth to fire after rebirthDelay. The // firmware silently drops rebirth requests received during its own NBIRTH -// window (~T+30s after CONNECT); 120s clears that with margin. Idempotent per -// client: subsequent calls during the wait are no-ops. Cancellable on CONNECT -// via the stored cancel func. +// window (~T+30s after CONNECT); 120s clears that with margin. The gate is +// sticky for the session: once fired, subsequent qualifying PUBLISHes (notably +// the firmware's NBIRTH response to our rebirth, which would otherwise re- +// trigger us in a loop) are no-ops until CONNECT clears the state. +// Cancellable mid-wait via the stored cancel func. func (m *MQTTLogger) scheduleRebirth(clientID string) { m.rebirthLock.Lock() - if _, exists := m.rebirthCancels[clientID]; exists { + if _, fired := m.rebirthFired[clientID]; fired { + m.rebirthLock.Unlock() + return + } + if _, scheduled := m.rebirthCancels[clientID]; scheduled { m.rebirthLock.Unlock() return } @@ -121,11 +128,14 @@ func (m *MQTTLogger) scheduleRebirth(clientID string) { case <-time.After(m.rebirthDelay): } - // Remove ourselves from the map BEFORE publishing so a CONNECT racing - // with us doesn't try to cancel an already-completed timer. Calling - // cancel() on an already-completed context is harmless either way. 
+ // Mark fired and remove from the cancel map atomically before publishing, + // so any qualifying PUBLISH that arrives in response to the rebirth (e.g. + // the firmware's NBIRTH) sees rebirthFired[clientID] and skips re-scheduling. + // Calling cancel() on an already-completed context is harmless, so a CONNECT + // racing with us cannot misfire. m.rebirthLock.Lock() delete(m.rebirthCancels, clientID) + m.rebirthFired[clientID] = struct{}{} m.rebirthLock.Unlock() m.sendRebirth() @@ -240,8 +250,8 @@ func (m *MQTTLogger) OnPacketRead(cl *mqtt.Client, pk packets.Packet) (packets.P case packets.Connect: // Empty liveClients list on CONNECT. Make sure we get a PUBLISH spBv1.0/WallCtrl/NDATA/ before polling m.liveClients = map[string]struct{}{} - // Cancel any in-flight rebirth timers from a prior session, then reset - // the map so the next qualifying PUBLISH re-schedules. CONNECT is the + // Cancel any in-flight rebirth timers from a prior session and clear the + // fired set so the next qualifying PUBLISH re-schedules. CONNECT is the // canonical "new session" signal - more reliable than DISCONNECT, which // won't fire on abrupt drops (power loss, wifi flap with no clean LWT). 
m.rebirthLock.Lock() @@ -249,6 +259,7 @@ func (m *MQTTLogger) OnPacketRead(cl *mqtt.Client, pk packets.Packet) (packets.P cancel() } m.rebirthCancels = map[string]context.CancelFunc{} + m.rebirthFired = map[string]struct{}{} m.rebirthLock.Unlock() case packets.Pingreq: // Don't log PINGREQ @@ -1135,6 +1146,7 @@ func runServe(cmd *cobra.Command, args []string) error { loadedValues: loadedValues, liveClients: make(map[string]struct{}), rebirthCancels: make(map[string]context.CancelFunc), + rebirthFired: make(map[string]struct{}), rebirthDelay: 120 * time.Second, } From f51770b4ac18e7ff72b6fd1a6c8c9632997464ea Mon Sep 17 00:00:00 2001 From: Aevum Decessus Date: Wed, 6 May 2026 12:46:38 -0400 Subject: [PATCH 4/9] Document rebirth-on-connect in README and TODO Add a Features bullet for automatic state synchronization, a Technical Details subsection explaining the delta-vs-snapshot problem and the 120s-delayed Sparkplug Node Control/Rebirth workaround, and a TODO checkmark recording the feature as shipped. Also add SYSTXCCITC01-C / v2.00 to the compatibility matrix as a new known-working configuration. --- README.md | 10 ++++++++++ TODO.md | 6 ++++++ 2 files changed, 16 insertions(+) diff --git a/README.md b/README.md index 347dd45..da06b12 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,7 @@ Set your thermostat's DNS Server to Anantha's IP address. 
- HTTP server (ports 80, 443) for firmware updates and requests - Web dashboard (port 26268 - spells "ANANT" on a phone keypad) for debugging - Home Assistant integration with auto-discovery for controlling the thermostat +- Automatic full-state synchronization on (re)connect via a Sparkplug B `Node Control/Rebirth` command - Optional MQTT proxying to AWS IoT (requires additional setup) ## Debugging @@ -86,6 +87,7 @@ Known working devices and firmware versions: | Carrier | SYSTXCCITC01-B | v4.47 | | Carrier | SYSTXCCITC01-B | v4.56 | | Carrier | SYSTXCCITC01-B | v4.74 | +| Carrier | SYSTXCCITC01-C | v2.00 | ## Technical Details @@ -94,3 +96,11 @@ Since firmware v4.17, Carrier thermostats use AWS IoT for MQTT communication. Un For sparse details about the certificate generation process, see the `cmd/cagen` directory. For a barebones proto definition of the communication over MQTT, see protobuf definitions in the `proto` directory. + +### State synchronization + +The thermostat speaks a Sparkplug B-shaped protocol over MQTT and publishes deltas, not full snapshots, after its initial connect. Without intervention, a freshly-started Anantha (no on-disk proto cache) would render an empty `/schedule` and mostly-empty `/profiles` until the user manually edited every field on the thermostat. Wifi cycling on the thermostat does not help. + +Anantha works around this by sending a `Node Control/Rebirth = true` (CT_BOOL) command to `spBv1.0/WallCtrl/NCMD/` 120 seconds after the first qualifying PUBLISH from the thermostat. The firmware honors this rebirth request and republishes its full state (~2200 entries: schedule, activity setpoints, sensor templates, system info), producing a fully-populated dashboard within roughly 3.5 minutes of a cold start. The 120-second delay is necessary because the firmware silently drops rebirth requests received during its own NBIRTH announcement window. 
+ +The rebirth fires once per session and is reset on every CONNECT, so reconnects re-trigger it. Mid-session firings are safe no-ops if state is already in sync. diff --git a/TODO.md b/TODO.md index 2a8e3ad..410e192 100644 --- a/TODO.md +++ b/TODO.md @@ -17,6 +17,12 @@ - [x] Better proto cleanup. We dump protobufs sent by thermostat in a directory which gets garbage collected on process startup. Do this on a schedule or as required. - [x] Weather integration with Open Meteo? Currently, we pretend to be in a California summer all year round. +- [x] Automatic full-state recovery on (re)connect. The thermostat only publishes deltas after CONNECT, so a fresh anantha + install (or one with an empty proto cache) used to render an empty schedule and profiles until the user manually edited + every field on the thermostat. Anantha now sends a Sparkplug B "Node Control/Rebirth" command 120s after the first + qualifying PUBLISH, which causes the firmware to dump its full state (~2200 entries) and produces a complete dashboard + within ~3.5 minutes of cold start. The 120s delay is necessary because the firmware silently drops rebirth requests + received during its own NBIRTH window. - [ ] Perform firmware patching via auto-update mechanism within the thermostat. Could be a way to onboard without needing an SD-card to flash firmware. Kind of dangerous given how some bits in the thermostat cannot be overwritten once set (like AWS IOT thingname, certs etc?). Could cause the thermostat to potentially get "bricked" if you want to use AWS IOT/Carrier API again. From 902c8ce7db6d500ad1ddab711a39cb202eb67680 Mon Sep 17 00:00:00 2001 From: Aevum Decessus Date: Mon, 11 May 2026 13:06:18 -0400 Subject: [PATCH 5/9] Revert auto-rebirth-on-connect timer mechanism In response to maintainer feedback on PR #17, switch from automatic-fire to user-initiated. 
This commit removes the timer infrastructure: the rebirthCancels and rebirthFired maps, the rebirthLock mutex, the rebirthDelay duration, the scheduleRebirth method, the OnPacketRead PUBLISH-branch invocation that scheduled it, and the CONNECT-branch reset that cleared it. The context import is also dropped. The sendRebirth helper and the cmdTopic field are preserved unchanged - they are the actual publish path for Node Control/Rebirth and will be reused by a /refresh-state HTTP handler in a follow-up commit. sendRebirth is currently unused after this commit so a one-off nolint:unused is added; the lint suppression goes away when the handler lands. The maintainer concern was a crash-loop scenario where anantha crashes on the rebirth response and the auto-fire restarts the loop on every recovery. A user-initiated button avoids that entirely while preserving the headline value of the PR. --- cmd/anantha/cmd/serve.go | 74 ++++------------------------------------ 1 file changed, 6 insertions(+), 68 deletions(-) diff --git a/cmd/anantha/cmd/serve.go b/cmd/anantha/cmd/serve.go index e6b934b..b8faf56 100644 --- a/cmd/anantha/cmd/serve.go +++ b/cmd/anantha/cmd/serve.go @@ -1,7 +1,6 @@ package cmd import ( - "context" "crypto/tls" "embed" "errors" @@ -61,16 +60,15 @@ type MQTTLogger struct { loadedValues *LoadedValues liveClients map[string]struct{} - - rebirthCancels map[string]context.CancelFunc - rebirthFired map[string]struct{} - rebirthLock sync.Mutex - rebirthDelay time.Duration } // sendRebirth publishes a Sparkplug B "Node Control/Rebirth" = true command on -// the NCMD topic. The Carrier firmware honors this: it dumps full state -// (schedule, activity setpoints, ~2200 entries) within ~17 seconds. +// the NCMD topic. The Carrier firmware honors this and replies with a full +// NBIRTH/DBIRTH wave (~99 KB across 7 publishes over ~60 seconds), which +// repopulates schedule, activity setpoints, and other config from the +// thermostat. 
+// +//nolint:unused // wired up by the /refresh-state handler in a follow-up commit func (m *MQTTLogger) sendRebirth() { msg := &carrier.CarrierInfo{ TimestampMillis: time.Now().UnixMilli(), @@ -97,51 +95,6 @@ func (m *MQTTLogger) sendRebirth() { log.Printf("Sent Node Control/Rebirth to %s", m.cmdTopic) } -// scheduleRebirth arranges for sendRebirth to fire after rebirthDelay. The -// firmware silently drops rebirth requests received during its own NBIRTH -// window (~T+30s after CONNECT); 120s clears that with margin. The gate is -// sticky for the session: once fired, subsequent qualifying PUBLISHes (notably -// the firmware's NBIRTH response to our rebirth, which would otherwise re- -// trigger us in a loop) are no-ops until CONNECT clears the state. -// Cancellable mid-wait via the stored cancel func. -func (m *MQTTLogger) scheduleRebirth(clientID string) { - m.rebirthLock.Lock() - if _, fired := m.rebirthFired[clientID]; fired { - m.rebirthLock.Unlock() - return - } - if _, scheduled := m.rebirthCancels[clientID]; scheduled { - m.rebirthLock.Unlock() - return - } - ctx, cancel := context.WithCancel(context.Background()) - m.rebirthCancels[clientID] = cancel - m.rebirthLock.Unlock() - - log.Printf("Scheduled Node Control/Rebirth in %s for %s", m.rebirthDelay, clientID) - - go func() { - select { - case <-ctx.Done(): - log.Printf("Cancelled scheduled rebirth for %s", clientID) - return - case <-time.After(m.rebirthDelay): - } - - // Mark fired and remove from the cancel map atomically before publishing, - // so any qualifying PUBLISH that arrives in response to the rebirth (e.g. - // the firmware's NBIRTH) sees rebirthFired[clientID] and skips re-scheduling. - // Calling cancel() on an already-completed context is harmless, so a CONNECT - // racing with us cannot misfire. - m.rebirthLock.Lock() - delete(m.rebirthCancels, clientID) - m.rebirthFired[clientID] = struct{}{} - m.rebirthLock.Unlock() - - m.sendRebirth() - }() -} - // ID returns the ID of the hook. 
func (m *MQTTLogger) ID() string { return "logger" @@ -177,7 +130,6 @@ func (m *MQTTLogger) OnPacketRead(cl *mqtt.Client, pk packets.Packet) (packets.P (m.thingNameOverride == "" && strings.HasSuffix(pk.TopicName, m.clientID)) { // Client sent initial PUBLISH - ready to poll it m.liveClients[cl.ID] = struct{}{} - m.scheduleRebirth(cl.ID) } protoFilename := fmt.Sprintf("%s-%s.pb", strings.ReplaceAll(string(pk.TopicName), "/", "_"), time.Now().Format(time.RFC3339Nano)) if err := os.WriteFile( @@ -250,17 +202,6 @@ func (m *MQTTLogger) OnPacketRead(cl *mqtt.Client, pk packets.Packet) (packets.P case packets.Connect: // Empty liveClients list on CONNECT. Make sure we get a PUBLISH spBv1.0/WallCtrl/NDATA/ before polling m.liveClients = map[string]struct{}{} - // Cancel any in-flight rebirth timers from a prior session and clear the - // fired set so the next qualifying PUBLISH re-schedules. CONNECT is the - // canonical "new session" signal - more reliable than DISCONNECT, which - // won't fire on abrupt drops (power loss, wifi flap with no clean LWT). 
- m.rebirthLock.Lock() - for _, cancel := range m.rebirthCancels { - cancel() - } - m.rebirthCancels = map[string]context.CancelFunc{} - m.rebirthFired = map[string]struct{}{} - m.rebirthLock.Unlock() case packets.Pingreq: // Don't log PINGREQ default: @@ -1145,9 +1086,6 @@ func runServe(cmd *cobra.Command, args []string) error { subscribedTopics: make(map[string]struct{}), loadedValues: loadedValues, liveClients: make(map[string]struct{}), - rebirthCancels: make(map[string]context.CancelFunc), - rebirthFired: make(map[string]struct{}), - rebirthDelay: 120 * time.Second, } if err := server.AddHook(mLogger, nil); err != nil { From 785f4d9329099893ce830d7af62b654a75082d90 Mon Sep 17 00:00:00 2001 From: Aevum Decessus Date: Mon, 11 May 2026 13:31:40 -0400 Subject: [PATCH 6/9] Add /refresh-state HTTP handler with cooldown and NBIRTH-window queue A user-initiated POST /refresh-state endpoint that publishes Node Control/Rebirth = true to the thermostat. Replaces the auto-fire-on-connect mechanism removed in the previous commit. Three states the handler can return: 1. Subcase A - thermostat not connected: liveClients is empty, so the publish would land at the broker with no subscriber. Short-circuit with a clear message. 2. Cooldown - last successful send was within 90 seconds. 90s is padded above the empirically observed ~59s response window from the rebirth experiments. Returns the remaining seconds. Cooldown survives MQTT reconnects so users cannot bypass it by power-cycling the thermostat. 3. Subcase B2 - thermostat connected but still in its NBIRTH window. Per the threshold experiment, the firmware silently drops rebirths received within the first ~120s of a new session. Set pendingRebirth=true; OnPacketRead PUBLISH branch fires it once the window clears. One-shot: a second click during the window replaces (not appends to) the queued state. Cleared on CONNECT (new session means the click was for the prior, gone session). 
Otherwise: fire sendRebirth() immediately, record rebirthLastSent for cooldown. State guarded by a new refreshMu mutex on MQTTLogger. mLogger is forward-declared in runServe so the web mux goroutine (which starts before mLogger is constructed) can capture it. The maintainer feedback on PR #17 (button instead of auto-fire) is addressed by this commit plus the upcoming UI commit. Bullet point #2 of the maintainer concern (crash loop on response) is fully avoided: pendingRebirth is in-memory only, so an anantha crash clears the queued state on restart and won't refire unless the user clicks again. --- cmd/anantha/cmd/serve.go | 98 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 95 insertions(+), 3 deletions(-) diff --git a/cmd/anantha/cmd/serve.go b/cmd/anantha/cmd/serve.go index b8faf56..67d13f3 100644 --- a/cmd/anantha/cmd/serve.go +++ b/cmd/anantha/cmd/serve.go @@ -60,6 +60,13 @@ type MQTTLogger struct { loadedValues *LoadedValues liveClients map[string]struct{} + + // State for the user-initiated /refresh-state button. All four fields are + // guarded by refreshMu. + refreshMu sync.Mutex + firstQualifyingPublishAt time.Time // zero when no qualifying PUBLISH seen this session + pendingRebirth bool // user clicked during the firmware NBIRTH window; fire when ready + rebirthLastSent time.Time // last successful sendRebirth, for cooldown } // sendRebirth publishes a Sparkplug B "Node Control/Rebirth" = true command on @@ -67,8 +74,6 @@ type MQTTLogger struct { // NBIRTH/DBIRTH wave (~99 KB across 7 publishes over ~60 seconds), which // repopulates schedule, activity setpoints, and other config from the // thermostat. 
-// -//nolint:unused // wired up by the /refresh-state handler in a follow-up commit func (m *MQTTLogger) sendRebirth() { msg := &carrier.CarrierInfo{ TimestampMillis: time.Now().UnixMilli(), @@ -130,6 +135,23 @@ func (m *MQTTLogger) OnPacketRead(cl *mqtt.Client, pk packets.Packet) (packets.P (m.thingNameOverride == "" && strings.HasSuffix(pk.TopicName, m.clientID)) { // Client sent initial PUBLISH - ready to poll it m.liveClients[cl.ID] = struct{}{} + + m.refreshMu.Lock() + if m.firstQualifyingPublishAt.IsZero() { + m.firstQualifyingPublishAt = time.Now() + } + // If a /refresh-state click queued a rebirth while we were still in + // the firmware's NBIRTH window, fire it now if enough time has passed. + shouldFirePending := m.pendingRebirth && time.Since(m.firstQualifyingPublishAt) >= 120*time.Second + if shouldFirePending { + m.pendingRebirth = false + m.rebirthLastSent = time.Now() + } + m.refreshMu.Unlock() + if shouldFirePending { + log.Printf("Firing queued Node Control/Rebirth (NBIRTH window cleared)") + m.sendRebirth() + } } protoFilename := fmt.Sprintf("%s-%s.pb", strings.ReplaceAll(string(pk.TopicName), "/", "_"), time.Now().Format(time.RFC3339Nano)) if err := os.WriteFile( @@ -202,6 +224,17 @@ func (m *MQTTLogger) OnPacketRead(cl *mqtt.Client, pk packets.Packet) (packets.P case packets.Connect: // Empty liveClients list on CONNECT. Make sure we get a PUBLISH spBv1.0/WallCtrl/NDATA/ before polling m.liveClients = map[string]struct{}{} + + // New MQTT session - clear refresh-button state from the prior session. + // firstQualifyingPublishAt will be re-set on the next qualifying PUBLISH + // (the first NDATA on the thingname-suffixed topic). pendingRebirth was + // queued by a click against the old session, so it's no longer valid. + // rebirthLastSent intentionally survives the reconnect so the cooldown + // can't be bypassed by power-cycling the thermostat. 
+ m.refreshMu.Lock() + m.firstQualifyingPublishAt = time.Time{} + m.pendingRebirth = false + m.refreshMu.Unlock() case packets.Pingreq: // Don't log PINGREQ default: @@ -478,6 +511,10 @@ func runServe(cmd *cobra.Command, args []string) error { cmdTopic = fmt.Sprintf("spBv1.0/WallCtrl/NCMD/%s", thingNameOverride) } + // Forward-declared so the web mux goroutine (which starts before mLogger + // is constructed below) can capture it for /refresh-state. + var mLogger *MQTTLogger + carrierHTTPMux := http.NewServeMux() carrierHTTPMux.HandleFunc("/Alive", func(w http.ResponseWriter, r *http.Request) { @@ -746,6 +783,61 @@ func runServe(cmd *cobra.Command, args []string) error { } fmt.Fprint(w, indexHTML) }) + webControlMux.HandleFunc("/refresh-state", func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + if mLogger == nil { + // MQTT server hasn't finished initializing yet. Brief window at startup. + fmt.Fprint(w, `Anantha is still starting up. Please try again in a few seconds.`) + return + } + + // Subcase A: thermostat is not connected to anantha. The publish would + // land at the broker with no subscriber and silently disappear, so + // short-circuit with a clear message. + if len(mLogger.liveClients) == 0 { + fmt.Fprint(w, `The thermostat is not connected to anantha yet. Please wait for it to publish before requesting a refresh.`) + return + } + + const cooldown = 90 * time.Second + const nbirthWindow = 120 * time.Second + + mLogger.refreshMu.Lock() + now := time.Now() + + // Cooldown check. + if !mLogger.rebirthLastSent.IsZero() { + if since := now.Sub(mLogger.rebirthLastSent); since < cooldown { + remaining := cooldown - since + mLogger.refreshMu.Unlock() + fmt.Fprintf(w, `A refresh was sent recently. 
Please wait %d seconds before refreshing again.`, int(remaining.Round(time.Second).Seconds())) + return + } + } + + // Subcase B: thermostat is connected but still in its NBIRTH window. + // Queue the click; the OnPacketRead PUBLISH branch will fire it once + // the window has cleared. + if !mLogger.firstQualifyingPublishAt.IsZero() { + if since := now.Sub(mLogger.firstQualifyingPublishAt); since < nbirthWindow { + remaining := nbirthWindow - since + mLogger.pendingRebirth = true + mLogger.refreshMu.Unlock() + log.Printf("Queued user-initiated rebirth; will fire in ~%s once NBIRTH window clears", remaining.Round(time.Second)) + fmt.Fprintf(w, `The thermostat just connected and is initializing. Your refresh will fire automatically in about %d seconds.`, int(remaining.Round(time.Second).Seconds())) + return + } + } + + // Normal path: fire immediately. + mLogger.rebirthLastSent = now + mLogger.refreshMu.Unlock() + mLogger.sendRebirth() + fmt.Fprint(w, `Refresh requested. Full state will arrive over the next ~60 seconds.`) + }) webControlMux.HandleFunc("/schedule", func(w http.ResponseWriter, r *http.Request) { scheduleHTML, err := RenderSchedule(loadedValues) if err != nil { @@ -1076,7 +1168,7 @@ func runServe(cmd *cobra.Command, args []string) error { })) level.Set(slog.LevelInfo) - mLogger := &MQTTLogger{ + mLogger = &MQTTLogger{ server: server, savedProtosDir: savedProtosDir, iotMQTTClient: awsIOTMQTTClient, From e6b2f8c96f9b5aceaabe8fa51c8bd53920c481c2 Mon Sep 17 00:00:00 2001 From: Aevum Decessus Date: Mon, 11 May 2026 14:02:17 -0400 Subject: [PATCH 7/9] Fix queued-rebirth fire mechanism: cooldown gate + AfterFunc timer Smoke test of commit 785f4d9 surfaced two bugs: 1. The pending-rebirth fire path in OnPacketRead's PUBLISH branch did not check the cooldown before firing. 
Live test sequence (logs at https://github.com/anupcshan/anantha/pull/17 review thread): 17:41:59 user click 1 - queued (NBIRTH window had ~76s remaining) 17:42:02 user click 2 - already queued 17:42:07 user click 3 - already queued 17:45:19 explicit click - cooldown ok, fires sendRebirth, sets rebirthLastSent 17:45:26 NBIRTH lands as response - PUBLISH branch fires the queued click again Result: two sendRebirth calls 7 seconds apart (the explicit click and the previously-queued one), bypassing the 90s cooldown. 2. The queue's fire trigger was the next qualifying node-level PUBLISH (NDATA/NBIRTH on a topic ending with the bare clientID). DDATA on sub-device topics doesn't qualify. In practice the cadence between qualifying PUBLISHes can be many minutes, so a queued rebirth would sit unfired far longer than the "will fire in N seconds" message implied. The 17:45 fire happened only by accident because the explicit click triggered an NBIRTH response that itself was a qualifying PUBLISH. This commit: - Adds a pendingRebirthTimer field guarded by refreshMu. The /refresh-state handler sets it via time.AfterFunc when entering the queue path; a re-click stops and replaces it; CONNECT stops it (the timer is for the prior session). - Adds firePendingRebirth as the timer callback. Re-checks pendingRebirth (CONNECT may have cleared it) and rebirthLastSent (an explicit click during the wait may have already fired). Drops the queued send if either guard says it's no longer needed. - Removes the OnPacketRead-driven fire path. The timer triggers on real elapsed time rather than on the firmware deciding to publish. The maintainer's stated concern about timer-shaped constructs (PR #17 review) is addressed: this timer is set in direct response to a user click, fires once per click, replaced by re-click, stopped by CONNECT, gated by cooldown. No automatic, recurring, or unconditional behavior - the click is the trigger. 
--- cmd/anantha/cmd/serve.go | 69 +++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 18 deletions(-) diff --git a/cmd/anantha/cmd/serve.go b/cmd/anantha/cmd/serve.go index 67d13f3..919aec9 100644 --- a/cmd/anantha/cmd/serve.go +++ b/cmd/anantha/cmd/serve.go @@ -61,12 +61,43 @@ type MQTTLogger struct { liveClients map[string]struct{} - // State for the user-initiated /refresh-state button. All four fields are + // State for the user-initiated /refresh-state button. All five fields are // guarded by refreshMu. refreshMu sync.Mutex - firstQualifyingPublishAt time.Time // zero when no qualifying PUBLISH seen this session - pendingRebirth bool // user clicked during the firmware NBIRTH window; fire when ready - rebirthLastSent time.Time // last successful sendRebirth, for cooldown + firstQualifyingPublishAt time.Time // zero when no qualifying PUBLISH seen this session + pendingRebirth bool // user clicked during the firmware NBIRTH window; fire when ready + pendingRebirthTimer *time.Timer // wakes up firePendingRebirth when the NBIRTH window closes + rebirthLastSent time.Time // last successful sendRebirth, for cooldown +} + +// firePendingRebirth is invoked by pendingRebirthTimer when the NBIRTH window +// has closed. Re-checks state under the lock because CONNECT may have cleared +// pendingRebirth, and an explicit /refresh-state click may have updated +// rebirthLastSent during the wait. Does not log on no-op paths because they're +// expected operating conditions, not errors. +func (m *MQTTLogger) firePendingRebirth() { + const cooldown = 90 * time.Second + + m.refreshMu.Lock() + if !m.pendingRebirth { + // CONNECT cleared it (or another path already fired). + m.refreshMu.Unlock() + return + } + if !m.rebirthLastSent.IsZero() && time.Since(m.rebirthLastSent) < cooldown { + // An explicit click during our wait already fired a rebirth. The + // queued one would be redundant - drop it silently. 
+ m.pendingRebirth = false + m.refreshMu.Unlock() + return + } + + m.pendingRebirth = false + m.rebirthLastSent = time.Now() + m.refreshMu.Unlock() + + log.Printf("Firing queued Node Control/Rebirth (NBIRTH window cleared)") + m.sendRebirth() } // sendRebirth publishes a Sparkplug B "Node Control/Rebirth" = true command on @@ -140,18 +171,7 @@ func (m *MQTTLogger) OnPacketRead(cl *mqtt.Client, pk packets.Packet) (packets.P if m.firstQualifyingPublishAt.IsZero() { m.firstQualifyingPublishAt = time.Now() } - // If a /refresh-state click queued a rebirth while we were still in - // the firmware's NBIRTH window, fire it now if enough time has passed. - shouldFirePending := m.pendingRebirth && time.Since(m.firstQualifyingPublishAt) >= 120*time.Second - if shouldFirePending { - m.pendingRebirth = false - m.rebirthLastSent = time.Now() - } m.refreshMu.Unlock() - if shouldFirePending { - log.Printf("Firing queued Node Control/Rebirth (NBIRTH window cleared)") - m.sendRebirth() - } } protoFilename := fmt.Sprintf("%s-%s.pb", strings.ReplaceAll(string(pk.TopicName), "/", "_"), time.Now().Format(time.RFC3339Nano)) if err := os.WriteFile( @@ -228,12 +248,17 @@ func (m *MQTTLogger) OnPacketRead(cl *mqtt.Client, pk packets.Packet) (packets.P // New MQTT session - clear refresh-button state from the prior session. // firstQualifyingPublishAt will be re-set on the next qualifying PUBLISH // (the first NDATA on the thingname-suffixed topic). pendingRebirth was - // queued by a click against the old session, so it's no longer valid. + // queued by a click against the old session, so it's no longer valid; + // stop its timer to avoid firing into the new session at the wrong time. // rebirthLastSent intentionally survives the reconnect so the cooldown // can't be bypassed by power-cycling the thermostat. 
m.refreshMu.Lock() m.firstQualifyingPublishAt = time.Time{} m.pendingRebirth = false + if m.pendingRebirthTimer != nil { + m.pendingRebirthTimer.Stop() + m.pendingRebirthTimer = nil + } m.refreshMu.Unlock() case packets.Pingreq: // Don't log PINGREQ @@ -819,12 +844,20 @@ func runServe(cmd *cobra.Command, args []string) error { } // Subcase B: thermostat is connected but still in its NBIRTH window. - // Queue the click; the OnPacketRead PUBLISH branch will fire it once - // the window has cleared. + // Queue the click and set a one-shot timer that wakes up when the + // window closes. A re-click during the wait stops the previous + // timer and starts a new one (still bounded to one fire). CONNECT + // stops the timer entirely since the click was for the prior session. if !mLogger.firstQualifyingPublishAt.IsZero() { if since := now.Sub(mLogger.firstQualifyingPublishAt); since < nbirthWindow { remaining := nbirthWindow - since mLogger.pendingRebirth = true + if mLogger.pendingRebirthTimer != nil { + mLogger.pendingRebirthTimer.Stop() + } + // Wait a small extra beat past the window so we're clearly + // past it (and so jitter doesn't put us right at the edge). + mLogger.pendingRebirthTimer = time.AfterFunc(remaining+2*time.Second, mLogger.firePendingRebirth) mLogger.refreshMu.Unlock() log.Printf("Queued user-initiated rebirth; will fire in ~%s once NBIRTH window clears", remaining.Round(time.Second)) fmt.Fprintf(w, `The thermostat just connected and is initializing. Your refresh will fire automatically in about %d seconds.`, int(remaining.Round(time.Second).Seconds())) From c141d54ef390465d26490f8f3a2e541bc9923813 Mon Sep 17 00:00:00 2001 From: Aevum Decessus Date: Mon, 11 May 2026 14:30:26 -0400 Subject: [PATCH 8/9] Add refresh-state button to dashboard The /refresh-state endpoint added in 785f4d9 had no UI. 
This wires it into the dashboard with a button placed under the "Last updated" row, and adds a small completeness heuristic that decides which informational text to render above it. stateLooksComplete checks node-level metrics (system mode/oat, wall control rt/rh, profile model/firmware/brand/serial) plus a per-active-zone schedule and activity check. Active zones are detected via live state metrics (rt/htsp/clsp), not the /enabled flag, because zone 1 on a single-zone install has no enabled field at all - the zone-shape findings are documented in dbirth-decoded/. The schedule check requires at least one fully-formed period per day rather than all five, so a thermostat with fewer configured periods isn't falsely flagged. The button is always clickable regardless of completeness; only the pre-text changes. Per maintainer feedback on PR #17, the goal is to inform the user about the state, not to gate the action. --- cmd/anantha/cmd/serve.go | 2 +- cmd/anantha/cmd/state_complete.go | 83 ++++++++++++++++++++++++++++ cmd/anantha/cmd/templates.go | 12 +++- cmd/anantha/cmd/templates/index.html | 39 +++++++++++++ 4 files changed, 133 insertions(+), 3 deletions(-) create mode 100644 cmd/anantha/cmd/state_complete.go diff --git a/cmd/anantha/cmd/serve.go b/cmd/anantha/cmd/serve.go index 919aec9..1009b28 100644 --- a/cmd/anantha/cmd/serve.go +++ b/cmd/anantha/cmd/serve.go @@ -801,7 +801,7 @@ func runServe(cmd *cobra.Command, args []string) error { webControlMux.Handle("/metrics", MetricsHandler(loadedValues)) webControlMux.Handle("/assets/", http.FileServer(http.FS(assets))) webControlMux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { - indexHTML, err := RenderIndex() + indexHTML, err := RenderIndex(loadedValues) if err != nil { http.Error(w, fmt.Sprintf("Error rendering index: %v", err), http.StatusInternalServerError) return diff --git a/cmd/anantha/cmd/state_complete.go b/cmd/anantha/cmd/state_complete.go new file mode 100644 index 0000000..100e7da --- 
/dev/null +++ b/cmd/anantha/cmd/state_complete.go @@ -0,0 +1,83 @@ +package cmd + +import "fmt" + +// stateLooksComplete returns true when LoadedValues looks like it has +// roughly the data a fully-populated install would carry: node-level +// metadata, plus per-active-zone schedule and activity coverage. Used by +// the index renderer to decide whether to suggest a refresh in the button +// pre-text. +// +// "Active zone" is detected via live state metrics (rt/htsp/clsp), not the +// /enabled flag. Zone 1 has no /enabled field at all on a single-zone +// install, but does have live state. Disabled zones (2-8 on a single-zone +// install) have schedule/activity definitions but no live state, so we skip +// them here. +// +// The heuristic is deliberately permissive: requiring "at least one period +// per day" rather than "all 5" so that a thermostat configured with fewer +// than 5 periods doesn't get falsely flagged as incomplete. +func stateLooksComplete(lv *LoadedValues) bool { + snap := lv.Snapshot() + + has := func(key string) bool { + _, ok := snap[key] + return ok + } + + // Node-level metrics that always exist on a fully-populated install. + nodeKeys := []string{ + "system/mode", "system/oat", + "sensor/wallControl/rt", "sensor/wallControl/rh", + "profile/model", "profile/firmware", "profile/brand", "profile/serial", + } + for _, k := range nodeKeys { + if !has(k) { + return false + } + } + + days := []string{"Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"} + activities := []string{"home", "away", "sleep", "wake", "manual"} + + foundActiveZone := false + for n := 1; n <= 8; n++ { + zone := fmt.Sprintf("%d", n) + + // Active-zone signal: live state metrics present. + if !has(zone+"/rt") || !has(zone+"/htsp") || !has(zone+"/clsp") { + continue + } + foundActiveZone = true + + // Each day must have at least one fully-formed period. 
+ for _, day := range days { + anyPeriod := false + for p := 1; p <= 5; p++ { + base := fmt.Sprintf("%s/program/%s/period %d", zone, day, p) + if has(base+"/time") && has(base+"/activity") && has(base+"/enabled") { + anyPeriod = true + break + } + } + if !anyPeriod { + return false + } + } + + // At least 4 of 5 activities must have htsp set. The 4-of-5 threshold + // allows for a user-disabled activity slot. + actCount := 0 + for _, a := range activities { + if has(fmt.Sprintf("%s/activities/%s/htsp", zone, a)) { + actCount++ + } + } + if actCount < 4 { + return false + } + } + + // Sanity: at least one zone must be active. + return foundActiveZone +} diff --git a/cmd/anantha/cmd/templates.go b/cmd/anantha/cmd/templates.go index 2e99ca2..766a44d 100644 --- a/cmd/anantha/cmd/templates.go +++ b/cmd/anantha/cmd/templates.go @@ -42,10 +42,18 @@ func init() { } } +// IndexData holds data for the index/dashboard template +type IndexData struct { + StateComplete bool +} + // RenderIndex renders the index/dashboard template -func RenderIndex() (string, error) { +func RenderIndex(loadedValues *LoadedValues) (string, error) { + data := IndexData{ + StateComplete: stateLooksComplete(loadedValues), + } var buf bytes.Buffer - if err := indexTemplate.Execute(&buf, nil); err != nil { + if err := indexTemplate.Execute(&buf, data); err != nil { return "", fmt.Errorf("failed to execute index template: %w", err) } return buf.String(), nil diff --git a/cmd/anantha/cmd/templates/index.html b/cmd/anantha/cmd/templates/index.html index 5041269..daaf691 100644 --- a/cmd/anantha/cmd/templates/index.html +++ b/cmd/anantha/cmd/templates/index.html @@ -216,6 +216,36 @@ .notifications-container > div:empty { display: none; } + + .refresh-state-row { + display: flex; + align-items: center; + gap: 12px; + flex-wrap: wrap; + margin: 10px 0 20px 0; + font-size: 0.9rem; + color: var(--secondary); + } + + .refresh-state-row button { + background-color: var(--primary); + color: white; + border: 
none; + padding: 8px 14px; + border-radius: var(--border-radius); + font-size: 0.9rem; + font-weight: 500; + cursor: pointer; + transition: var(--transition); + } + + .refresh-state-row button:hover { + background-color: var(--primary-dark); + } + + .refresh-state-row .error { + color: var(--danger); + } @@ -260,6 +290,15 @@

Anantha

Last updated: Never Connected +
+ {{if .StateComplete}} + Schedule and profile data are already loaded. You can refresh anyway: + {{else}} + Schedule and profile data look incomplete. Click to request a full state refresh from the thermostat. + {{end}} + + +
System Overview
From 5ded594bc46c02fdd810254498db030d59bcd913 Mon Sep 17 00:00:00 2001 From: Aevum Decessus Date: Mon, 11 May 2026 14:45:42 -0400 Subject: [PATCH 9/9] Document refresh-state button in README and TODO The previous wording described an automatic-on-connect rebirth, which is no longer how the feature works. Rewrite the feature bullet, the "State synchronization" subsection, and the matching TODO line to describe the button-based flow, the 90-second cooldown, and the NBIRTH-window queue path. --- README.md | 10 +++++++--- TODO.md | 11 ++++++----- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index da06b12..68da31e 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ Set your thermostat's DNS Server to Anantha's IP address. - HTTP server (ports 80, 443) for firmware updates and requests - Web dashboard (port 26268 - spells "ANANT" on a phone keypad) for debugging - Home Assistant integration with auto-discovery for controlling the thermostat -- Automatic full-state synchronization on (re)connect via a Sparkplug B `Node Control/Rebirth` command +- User-initiated full-state refresh button on the dashboard, which sends a Sparkplug B `Node Control/Rebirth` command to the thermostat - Optional MQTT proxying to AWS IoT (requires additional setup) ## Debugging @@ -101,6 +101,10 @@ For a barebones proto definition of the communication over MQTT, see protobuf de The thermostat speaks a Sparkplug B-shaped protocol over MQTT and publishes deltas, not full snapshots, after its initial connect. Without intervention, a freshly-started Anantha (no on-disk proto cache) would render an empty `/schedule` and mostly-empty `/profiles` until the user manually edited every field on the thermostat. Wifi cycling on the thermostat does not help. -Anantha works around this by sending a `Node Control/Rebirth = true` (CT_BOOL) command to `spBv1.0/WallCtrl/NCMD/` 120 seconds after the first qualifying PUBLISH from the thermostat. 
The firmware honors this rebirth request and republishes its full state (~2200 entries: schedule, activity setpoints, sensor templates, system info), producing a fully-populated dashboard within roughly 3.5 minutes of a cold start. The 120-second delay is necessary because the firmware silently drops rebirth requests received during its own NBIRTH announcement window. +The dashboard exposes a "Refresh thermostat state" button that sends a `Node Control/Rebirth = true` (CT_BOOL) command to `spBv1.0/WallCtrl/NCMD/`. The firmware honors this rebirth request and republishes its full state (~2200 entries: schedule, activity setpoints, sensor templates, system info), populating the dashboard within roughly a minute. The button's pre-text changes based on a per-page-load completeness check so the user can see whether a refresh is likely useful, but it remains clickable in either case. -The rebirth fires once per session and is reset on every CONNECT, so reconnects re-trigger it. Mid-session firings are safe no-ops if state is already in sync. +A few protections keep the firmware from being hammered: + +- Server-side cooldown of 90 seconds between successful sends (the response itself takes ~60 seconds to drain, so this leaves comfortable headroom). +- If the click arrives during the firmware's NBIRTH announcement window (the first 120 seconds after the thermostat's first qualifying PUBLISH following a CONNECT, during which it silently drops rebirth requests), the click is queued and a one-shot timer fires the rebirth as soon as the window clears. +- If the thermostat has not yet published anything to Anantha, the click short-circuits with a clear message instead of sending into the void. diff --git a/TODO.md b/TODO.md index 410e192..b164654 100644 --- a/TODO.md +++ b/TODO.md @@ -17,12 +17,13 @@ - [x] Better proto cleanup. We dump protobufs sent by thermostat in a directory which gets garbage collected on process startup. Do this on a schedule or as required. - [x] Weather integration with Open Meteo?
Currently, we pretend to be in a California summer all year round. -- [x] Automatic full-state recovery on (re)connect. The thermostat only publishes deltas after CONNECT, so a fresh anantha +- [x] Full-state recovery via dashboard button. The thermostat only publishes deltas after CONNECT, so a fresh anantha install (or one with an empty proto cache) used to render an empty schedule and profiles until the user manually edited - every field on the thermostat. Anantha now sends a Sparkplug B "Node Control/Rebirth" command 120s after the first - qualifying PUBLISH, which causes the firmware to dump its full state (~2200 entries) and produces a complete dashboard - within ~3.5 minutes of cold start. The 120s delay is necessary because the firmware silently drops rebirth requests - received during its own NBIRTH window. + every field on the thermostat. The dashboard now exposes a "Refresh thermostat state" button that sends a Sparkplug B + "Node Control/Rebirth" command, causing the firmware to dump its full state (~2200 entries) and producing a complete + dashboard within ~60 seconds of the click. A 90-second server-side cooldown prevents firmware spam, and clicks during + the firmware's 120-second NBIRTH window (when rebirth requests are silently dropped) are queued and fired automatically + once the window clears. - [ ] Perform firmware patching via auto-update mechanism within the thermostat. Could be a way to onboard without needing an SD-card to flash firmware. Kind of dangerous given how some bits in the thermostat cannot be overwritten once set (like AWS IOT thingname, certs etc?). Could cause the thermostat to potentially get "bricked" if you want to use AWS IOT/Carrier API again.