diff --git a/README.md b/README.md index 347dd45..68da31e 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,7 @@ Set your thermostat's DNS Server to Anantha's IP address. - HTTP server (ports 80, 443) for firmware updates and requests - Web dashboard (port 26268 - spells "ANANT" on a phone keypad) for debugging - Home Assistant integration with auto-discovery for controlling the thermostat +- User-initiated full-state refresh button on the dashboard, which sends a Sparkplug B `Node Control/Rebirth` command to the thermostat - Optional MQTT proxying to AWS IoT (requires additional setup) ## Debugging @@ -86,6 +87,7 @@ Known working devices and firmware versions: | Carrier | SYSTXCCITC01-B | v4.47 | | Carrier | SYSTXCCITC01-B | v4.56 | | Carrier | SYSTXCCITC01-B | v4.74 | +| Carrier | SYSTXCCITC01-C | v2.00 | ## Technical Details @@ -94,3 +96,15 @@ Since firmware v4.17, Carrier thermostats use AWS IoT for MQTT communication. Un For sparse details about the certificate generation process, see the `cmd/cagen` directory. For a barebones proto definition of the communication over MQTT, see protobuf definitions in the `proto` directory. + +### State synchronization + +The thermostat speaks a Sparkplug B-shaped protocol over MQTT and publishes deltas, not full snapshots, after its initial connect. Without intervention, a freshly-started Anantha (no on-disk proto cache) would render an empty `/schedule` and mostly-empty `/profiles` until the user manually edited every field on the thermostat. Wi-Fi cycling on the thermostat does not help. + +The dashboard exposes a "Refresh thermostat state" button that sends a `Node Control/Rebirth = true` (CT_BOOL) command to `spBv1.0/WallCtrl/NCMD/<thingname>`. The firmware honors this rebirth request and republishes its full state (~2200 entries: schedule, activity setpoints, sensor templates, system info), populating the dashboard within roughly a minute. 
The button's pre-text changes based on a per-page-load completeness check so the user can see whether a refresh is likely useful, but it remains clickable in either case. + +A few protections keep the firmware from being hammered: + +- Server-side cooldown of 90 seconds between successful sends (the response itself takes ~60 seconds to drain, so this leaves comfortable headroom). +- If the click arrives during the firmware's NBIRTH announcement window (the first 120 seconds after CONNECT, during which it silently drops rebirth requests), the click is queued and a one-shot timer fires the rebirth as soon as the window clears. +- If the thermostat has not yet published anything to Anantha, the click short-circuits with a clear message instead of sending into the void. diff --git a/TODO.md b/TODO.md index 2a8e3ad..b164654 100644 --- a/TODO.md +++ b/TODO.md @@ -17,6 +17,13 @@ - [x] Better proto cleanup. We dump protobufs sent by thermostat in a directory which gets garbage collected on process startup. Do this on a schedule or as required. - [x] Weather integration with Open Meteo? Currently, we pretend to be in a California summer all year round. +- [x] Full-state recovery via dashboard button. The thermostat only publishes deltas after CONNECT, so a fresh anantha + install (or one with an empty proto cache) used to render an empty schedule and profiles until the user manually edited + every field on the thermostat. The dashboard now exposes a "Refresh thermostat state" button that sends a Sparkplug B + "Node Control/Rebirth" command, causing the firmware to dump its full state (~2200 entries) and producing a complete + dashboard within ~60 seconds of the click. A 90-second server-side cooldown prevents firmware spam, and clicks during + the firmware's 120-second NBIRTH window (when rebirth requests are silently dropped) are queued and fired automatically + once the window clears. - [ ] Perform firmware patching via auto-update mechanism within the thermostat. 
Could be a way to onboard without needing an SD-card to flash firmware. Kind of dangerous given how some bits in the thermostat cannot be overwritten once set (like AWS IOT thingname, certs etc?). Could cause the thermostat to potentially get "bricked" if you want to use AWS IOT/Carrier API again. diff --git a/cmd/anantha/cmd/serve.go b/cmd/anantha/cmd/serve.go index 78218c9..1009b28 100644 --- a/cmd/anantha/cmd/serve.go +++ b/cmd/anantha/cmd/serve.go @@ -50,6 +50,7 @@ type MQTTLogger struct { iotMQTTClient mqtt_paho.Client clientID string thingNameOverride string + cmdTopic string subscribedTopics map[string]struct{} subscribedTopicsLock sync.Mutex @@ -59,6 +60,75 @@ type MQTTLogger struct { loadedValues *LoadedValues liveClients map[string]struct{} + + // State for the user-initiated /refresh-state button. All five fields are + // guarded by refreshMu. + refreshMu sync.Mutex + firstQualifyingPublishAt time.Time // zero when no qualifying PUBLISH seen this session + pendingRebirth bool // user clicked during the firmware NBIRTH window; fire when ready + pendingRebirthTimer *time.Timer // wakes up firePendingRebirth when the NBIRTH window closes + rebirthLastSent time.Time // last successful sendRebirth, for cooldown +} + +// firePendingRebirth is invoked by pendingRebirthTimer when the NBIRTH window +// has closed. Re-checks state under the lock because CONNECT may have cleared +// pendingRebirth, and an explicit /refresh-state click may have updated +// rebirthLastSent during the wait. Does not log on no-op paths because they're +// expected operating conditions, not errors. +func (m *MQTTLogger) firePendingRebirth() { + const cooldown = 90 * time.Second + + m.refreshMu.Lock() + if !m.pendingRebirth { + // CONNECT cleared it (or another path already fired). + m.refreshMu.Unlock() + return + } + if !m.rebirthLastSent.IsZero() && time.Since(m.rebirthLastSent) < cooldown { + // An explicit click during our wait already fired a rebirth. 
The + // queued one would be redundant - drop it silently. + m.pendingRebirth = false + m.refreshMu.Unlock() + return + } + + m.pendingRebirth = false + m.rebirthLastSent = time.Now() + m.refreshMu.Unlock() + + log.Printf("Firing queued Node Control/Rebirth (NBIRTH window cleared)") + m.sendRebirth() +} + +// sendRebirth publishes a Sparkplug B "Node Control/Rebirth" = true command on +// the NCMD topic. The Carrier firmware honors this and replies with a full +// NBIRTH/DBIRTH wave (~99 KB across 7 publishes over ~60 seconds), which +// repopulates schedule, activity setpoints, and other config from the +// thermostat. +func (m *MQTTLogger) sendRebirth() { + msg := &carrier.CarrierInfo{ + TimestampMillis: time.Now().UnixMilli(), + ConfigSettings: []*carrier.ConfigSetting{ + { + Name: "Node Control/Rebirth", + ConfigType: carrier.ConfigType_CT_BOOL, + Value: &carrier.ConfigSetting_BoolValue{ + BoolValue: true, + }, + }, + }, + Uuid: uuid.New().String(), + } + encoded, err := proto.Marshal(msg) + if err != nil { + log.Printf("Failed to encode rebirth proto: %s", err) + return + } + if err := m.server.Publish(m.cmdTopic, encoded, false, 0); err != nil { + log.Printf("Failed to send rebirth: %s", err) + return + } + log.Printf("Sent Node Control/Rebirth to %s", m.cmdTopic) } // ID returns the ID of the hook. 
@@ -96,6 +166,12 @@ func (m *MQTTLogger) OnPacketRead(cl *mqtt.Client, pk packets.Packet) (packets.P (m.thingNameOverride == "" && strings.HasSuffix(pk.TopicName, m.clientID)) { // Client sent initial PUBLISH - ready to poll it m.liveClients[cl.ID] = struct{}{} + + m.refreshMu.Lock() + if m.firstQualifyingPublishAt.IsZero() { + m.firstQualifyingPublishAt = time.Now() + } + m.refreshMu.Unlock() } protoFilename := fmt.Sprintf("%s-%s.pb", strings.ReplaceAll(string(pk.TopicName), "/", "_"), time.Now().Format(time.RFC3339Nano)) if err := os.WriteFile( @@ -168,6 +244,22 @@ func (m *MQTTLogger) OnPacketRead(cl *mqtt.Client, pk packets.Packet) (packets.P case packets.Connect: // Empty liveClients list on CONNECT. Make sure we get a PUBLISH spBv1.0/WallCtrl/NDATA/ before polling m.liveClients = map[string]struct{}{} + + // New MQTT session - clear refresh-button state from the prior session. + // firstQualifyingPublishAt will be re-set on the next qualifying PUBLISH + // (the first NDATA on the thingname-suffixed topic). pendingRebirth was + // queued by a click against the old session, so it's no longer valid; + // stop its timer to avoid firing into the new session at the wrong time. + // rebirthLastSent intentionally survives the reconnect so the cooldown + // can't be bypassed by power-cycling the thermostat. + m.refreshMu.Lock() + m.firstQualifyingPublishAt = time.Time{} + m.pendingRebirth = false + if m.pendingRebirthTimer != nil { + m.pendingRebirthTimer.Stop() + m.pendingRebirthTimer = nil + } + m.refreshMu.Unlock() case packets.Pingreq: // Don't log PINGREQ default: @@ -444,6 +536,10 @@ func runServe(cmd *cobra.Command, args []string) error { cmdTopic = fmt.Sprintf("spBv1.0/WallCtrl/NCMD/%s", thingNameOverride) } + // Forward-declared so the web mux goroutine (which starts before mLogger + // is constructed below) can capture it for /refresh-state. 
+ var mLogger *MQTTLogger + carrierHTTPMux := http.NewServeMux() carrierHTTPMux.HandleFunc("/Alive", func(w http.ResponseWriter, r *http.Request) { @@ -705,13 +801,76 @@ func runServe(cmd *cobra.Command, args []string) error { webControlMux.Handle("/metrics", MetricsHandler(loadedValues)) webControlMux.Handle("/assets/", http.FileServer(http.FS(assets))) webControlMux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { - indexHTML, err := RenderIndex() + indexHTML, err := RenderIndex(loadedValues) if err != nil { http.Error(w, fmt.Sprintf("Error rendering index: %v", err), http.StatusInternalServerError) return } fmt.Fprint(w, indexHTML) }) + webControlMux.HandleFunc("/refresh-state", func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + if mLogger == nil { + // MQTT server hasn't finished initializing yet. Brief window at startup. + fmt.Fprint(w, `Anantha is still starting up. Please try again in a few seconds.`) + return + } + + // Subcase A: thermostat is not connected to anantha. The publish would + // land at the broker with no subscriber and silently disappear, so + // short-circuit with a clear message. + if len(mLogger.liveClients) == 0 { + fmt.Fprint(w, `The thermostat is not connected to anantha yet. Please wait for it to publish before requesting a refresh.`) + return + } + + const cooldown = 90 * time.Second + const nbirthWindow = 120 * time.Second + + mLogger.refreshMu.Lock() + now := time.Now() + + // Cooldown check. + if !mLogger.rebirthLastSent.IsZero() { + if since := now.Sub(mLogger.rebirthLastSent); since < cooldown { + remaining := cooldown - since + mLogger.refreshMu.Unlock() + fmt.Fprintf(w, `A refresh was sent recently. Please wait %d seconds before refreshing again.`, int(remaining.Round(time.Second).Seconds())) + return + } + } + + // Subcase B: thermostat is connected but still in its NBIRTH window. 
+ // Queue the click and set a one-shot timer that wakes up when the + // window closes. A re-click during the wait stops the previous + // timer and starts a new one (still bounded to one fire). CONNECT + // stops the timer entirely since the click was for the prior session. + if !mLogger.firstQualifyingPublishAt.IsZero() { + if since := now.Sub(mLogger.firstQualifyingPublishAt); since < nbirthWindow { + remaining := nbirthWindow - since + mLogger.pendingRebirth = true + if mLogger.pendingRebirthTimer != nil { + mLogger.pendingRebirthTimer.Stop() + } + // Wait a small extra beat past the window so we're clearly + // past it (and so jitter doesn't put us right at the edge). + mLogger.pendingRebirthTimer = time.AfterFunc(remaining+2*time.Second, mLogger.firePendingRebirth) + mLogger.refreshMu.Unlock() + log.Printf("Queued user-initiated rebirth; will fire in ~%s once NBIRTH window clears", remaining.Round(time.Second)) + fmt.Fprintf(w, `The thermostat just connected and is initializing. Your refresh will fire automatically in about %d seconds.`, int(remaining.Round(time.Second).Seconds())) + return + } + } + + // Normal path: fire immediately. + mLogger.rebirthLastSent = now + mLogger.refreshMu.Unlock() + mLogger.sendRebirth() + fmt.Fprint(w, `Refresh requested. 
Full state will arrive over the next ~60 seconds.`) + }) webControlMux.HandleFunc("/schedule", func(w http.ResponseWriter, r *http.Request) { scheduleHTML, err := RenderSchedule(loadedValues) if err != nil { @@ -1042,12 +1201,13 @@ func runServe(cmd *cobra.Command, args []string) error { })) level.Set(slog.LevelInfo) - mLogger := &MQTTLogger{ + mLogger = &MQTTLogger{ server: server, savedProtosDir: savedProtosDir, iotMQTTClient: awsIOTMQTTClient, clientID: clientID, thingNameOverride: thingNameOverride, + cmdTopic: cmdTopic, subscribedTopics: make(map[string]struct{}), loadedValues: loadedValues, liveClients: make(map[string]struct{}), diff --git a/cmd/anantha/cmd/state_complete.go b/cmd/anantha/cmd/state_complete.go new file mode 100644 index 0000000..100e7da --- /dev/null +++ b/cmd/anantha/cmd/state_complete.go @@ -0,0 +1,83 @@ +package cmd + +import "fmt" + +// stateLooksComplete returns true when LoadedValues looks like it has +// roughly the data a fully-populated install would carry: node-level +// metadata, plus per-active-zone schedule and activity coverage. Used by +// the index renderer to decide whether to suggest a refresh in the button +// pre-text. +// +// "Active zone" is detected via live state metrics (rt/htsp/clsp), not the +// /enabled flag. Zone 1 has no /enabled field at all on a single-zone +// install, but does have live state. Disabled zones (2-8 on a single-zone +// install) have schedule/activity definitions but no live state, so we skip +// them here. +// +// The heuristic is deliberately permissive: requiring "at least one period +// per day" rather than "all 5" so that a thermostat configured with fewer +// than 5 periods doesn't get falsely flagged as incomplete. +func stateLooksComplete(lv *LoadedValues) bool { + snap := lv.Snapshot() + + has := func(key string) bool { + _, ok := snap[key] + return ok + } + + // Node-level metrics that always exist on a fully-populated install. 
+ nodeKeys := []string{ + "system/mode", "system/oat", + "sensor/wallControl/rt", "sensor/wallControl/rh", + "profile/model", "profile/firmware", "profile/brand", "profile/serial", + } + for _, k := range nodeKeys { + if !has(k) { + return false + } + } + + days := []string{"Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"} + activities := []string{"home", "away", "sleep", "wake", "manual"} + + foundActiveZone := false + for n := 1; n <= 8; n++ { + zone := fmt.Sprintf("%d", n) + + // Active-zone signal: live state metrics present. + if !has(zone+"/rt") || !has(zone+"/htsp") || !has(zone+"/clsp") { + continue + } + foundActiveZone = true + + // Each day must have at least one fully-formed period. + for _, day := range days { + anyPeriod := false + for p := 1; p <= 5; p++ { + base := fmt.Sprintf("%s/program/%s/period %d", zone, day, p) + if has(base+"/time") && has(base+"/activity") && has(base+"/enabled") { + anyPeriod = true + break + } + } + if !anyPeriod { + return false + } + } + + // At least 4 of 5 activities must have htsp set. The 4-of-5 threshold + // allows for a user-disabled activity slot. + actCount := 0 + for _, a := range activities { + if has(fmt.Sprintf("%s/activities/%s/htsp", zone, a)) { + actCount++ + } + } + if actCount < 4 { + return false + } + } + + // Sanity: at least one zone must be active. 
+ return foundActiveZone +} diff --git a/cmd/anantha/cmd/templates.go b/cmd/anantha/cmd/templates.go index 2e99ca2..766a44d 100644 --- a/cmd/anantha/cmd/templates.go +++ b/cmd/anantha/cmd/templates.go @@ -42,10 +42,18 @@ func init() { } } +// IndexData holds data for the index/dashboard template +type IndexData struct { + StateComplete bool +} + // RenderIndex renders the index/dashboard template -func RenderIndex() (string, error) { +func RenderIndex(loadedValues *LoadedValues) (string, error) { + data := IndexData{ + StateComplete: stateLooksComplete(loadedValues), + } var buf bytes.Buffer - if err := indexTemplate.Execute(&buf, nil); err != nil { + if err := indexTemplate.Execute(&buf, data); err != nil { return "", fmt.Errorf("failed to execute index template: %w", err) } return buf.String(), nil diff --git a/cmd/anantha/cmd/templates/index.html b/cmd/anantha/cmd/templates/index.html index 5041269..daaf691 100644 --- a/cmd/anantha/cmd/templates/index.html +++ b/cmd/anantha/cmd/templates/index.html @@ -216,6 +216,36 @@ .notifications-container > div:empty { display: none; } + + .refresh-state-row { + display: flex; + align-items: center; + gap: 12px; + flex-wrap: wrap; + margin: 10px 0 20px 0; + font-size: 0.9rem; + color: var(--secondary); + } + + .refresh-state-row button { + background-color: var(--primary); + color: white; + border: none; + padding: 8px 14px; + border-radius: var(--border-radius); + font-size: 0.9rem; + font-weight: 500; + cursor: pointer; + transition: var(--transition); + } + + .refresh-state-row button:hover { + background-color: var(--primary-dark); + } + + .refresh-state-row .error { + color: var(--danger); + } @@ -260,6 +290,15 @@

Anantha

Last updated: Never Connected +
+ {{if .StateComplete}} + Schedule and profile data are already loaded. You can refresh anyway: + {{else}} + Schedule and profile data look incomplete. Click to request a full state refresh from the thermostat. + {{end}} + + +
System Overview