diff --git a/docs/metrics.md b/docs/metrics.md index 5b7ef9c..4a60924 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -1,5 +1,7 @@ # Metrics reference +For example PromQL queries and alert recipes, see [promql-recipes.md](promql-recipes.md). + ## System (`topsrv_cpu_*`, `topsrv_memory_*`, `topsrv_load_*`, `topsrv_swap_*`) | Metric | Type | Labels | Description | @@ -57,30 +59,6 @@ | `topsrv_netstat_udp_sndbuf_errors_total` | counter | — | UDP send buffer errors | | `topsrv_netstat_ip_unknown_protos_total` | counter | — | IP datagrams with unknown protocol | -Useful queries: - -```promql -# Inventory of publicly-reachable listeners across the fleet -topsrv_netstat_listening_ports{scope="public"} - -# Same, UDP only — DNS / mDNS / WireGuard / unexpected UDP services -topsrv_netstat_listening_ports{proto="udp", scope="public"} - -# Alert: a new public-bound port appeared in the last 10 minutes -# (compare current set against the set seen 10 minutes ago — non-zero rows are new exposures) -(topsrv_netstat_listening_ports{scope="public"} == 1) - unless on (instance, proto, port, family) (topsrv_netstat_listening_ports{scope="public"} offset 10m == 1) - -# Count of distinct public-bound ports per host (capacity / drift watch) -count by (instance) (topsrv_netstat_listening_ports{scope="public"} == 1) - -# Established TCP connections from the public internet — scan / unexpected exposure signal -topsrv_netstat_tcp_connections{state="ESTABLISHED", direction="inbound", remote_scope="public"} - -# Outbound TCP to the public internet — exfil / unexpected egress watch -sum by (instance) (topsrv_netstat_tcp_connections{direction="outbound", remote_scope="public"}) -``` - ## Process (`topsrv_process_*`) | Metric | Type | Labels | Description | @@ -292,6 +270,17 @@ Metrics from Angie JSON API (`/status/`). Requires `api /status/;` directive in |--------|------|--------|-------------| | `topsrv_angie_slab_pages` | gauge | zone, state | Slab pages (used/free) | +### ACME clients + +Polled from `/status/http/acme_clients/?date=epoch` on angie 1.5+ when the `acme_client` directive is configured. Endpoint absent (404) silently — no metrics emitted on hosts without ACME. Per [angie http_acme docs](https://en.angie.software/angie/docs/configuration/modules/http/http_acme/) `state` ∈ {`ready`, `requesting`, `disabled`, `failed`}; `certificate` ∈ {`valid`, `expired`, `missing`, `mismatch`, `error`}. + +These metrics expose the ACME state machine. **Certificate expiry (NotAfter) for ACME-managed certs is covered by the same `topsrv_ssl_certificate_expiry_seconds` metric as static certs** — discovery scans `/var/lib/angie/acme//certificate.pem` for every `acme_client` directive and includes the file in the SSL collector. ACME and pinned certs end up indistinguishable in that metric (operator-controllable filter via `path` label). + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `topsrv_angie_acme_state` | gauge | name, state, certificate | `value=1` per active acme_client tuple. `name` is the directive's client name. `state` is the operator-facing state-machine value (`ready`, `renewing`, `error`, …); `certificate` is the cert validity (`valid`, `invalid`, `pending`, …). Exact enum surface evolves with angie releases — label values pass through verbatim. Alert on `certificate!="valid"` or `state="error"` | +| `topsrv_angie_acme_next_run_seconds` | gauge | name | Unix timestamp of the next scheduled action (renewal attempt / state-machine tick) for this client | + ## Bot-logs (`topsrv_botlog_*`) Opt-in. Emitted when `[BotLogs].Enabled = true`. The agent matches every parsed nginx access-log line against a built-in UA fingerprint table (38 families) and ships matched events as gzipped ndjson to the topsrv.io `/v1/bot-logs` endpoint, with disk-backed WAL spool for retry on transient send failures. diff --git a/docs/promql-recipes.md b/docs/promql-recipes.md new file mode 100644 index 0000000..1d10d2e --- /dev/null +++ b/docs/promql-recipes.md @@ -0,0 +1,44 @@ +# PromQL recipes + +Common queries, alerts, and dashboard expressions for topsrv metrics. Indexed by collector. See [metrics.md](metrics.md) for the underlying metric definitions. + +## Netstat — listening ports + +```promql +# Inventory of publicly-reachable listeners across the fleet +topsrv_netstat_listening_ports{scope="public"} + +# Same, UDP only — DNS / mDNS / WireGuard / unexpected UDP services +topsrv_netstat_listening_ports{proto="udp", scope="public"} + +# Alert: a new public-bound port appeared in the last 10 minutes +# (compare current set against the set seen 10 minutes ago — non-zero rows are new exposures) +(topsrv_netstat_listening_ports{scope="public"} == 1) + unless on (instance, proto, port, family) (topsrv_netstat_listening_ports{scope="public"} offset 10m == 1) + +# Count of distinct public-bound ports per host (capacity / drift watch) +count by (instance) (topsrv_netstat_listening_ports{scope="public"} == 1) +``` + +## Netstat — TCP connection scope + +```promql +# Established TCP connections from the public internet — scan / unexpected exposure signal +topsrv_netstat_tcp_connections{state="ESTABLISHED", direction="inbound", remote_scope="public"} + +# Outbound TCP to the public internet — exfil / unexpected egress watch +sum by (instance) (topsrv_netstat_tcp_connections{direction="outbound", remote_scope="public"}) +``` + +## Angie ACME clients + +```promql +# Inventory of ACME clients per host and their current state +topsrv_angie_acme_state + +# Alert: any client not in cert=valid state +topsrv_angie_acme_state{certificate!="valid"} + +# Time until next ACME action (seconds; negative = overdue / stuck) +topsrv_angie_acme_next_run_seconds - time() +``` diff --git a/internal/app/app.go b/internal/app/app.go index 5322522..d81d726 100644 --- a/internal/app/app.go +++ b/internal/app/app.go @@ -727,6 +727,17 @@ func (a *App) registerAngie(ctx context.Context, services []topsrv.Service) { a.addCollector(nginx.NewSSLCollector(a.Logger, discovered.SSLCertificates)) a.Print(ctx, "angie: monitoring SSL certificates", "count", len(discovered.SSLCertificates)) } + // Operator-facing diagnostic: acme_client directives configured but + // no certs at the default state-path most likely means a custom + // acme_client_path build option we don't auto-discover. Expiry + // metric will be missing for those certs until pinned via static + // ssl_certificate /full/path. + if discovered.ACMEDirectives > discovered.ACMECertsFound { + a.Print(ctx, "angie: ACME certs missing at default state path", + "directives", discovered.ACMEDirectives, + "found_on_disk", discovered.ACMECertsFound, + "default_path", "/var/lib/angie/acme//certificate.pem") + } a.Print(ctx, "angie: auto-discovered access logs", "count", len(angieCfg.AccessLogs), "config", svc.ConfigPath) } else { @@ -740,6 +751,14 @@ func (a *App) registerAngie(ctx context.Context, services []topsrv.Service) { // Register API collector (preferred) or stub_status fallback. if angieCfg.StatusURL != "" { a.addCollector(angie.NewAPICollector(a.Logger, angieCfg.StatusURL)) + // ACME collector polls /status/http/acme_clients/?date=epoch on the + // same angie process; it's a no-op (404) when acme_client isn't + // configured, so it's safe to register unconditionally. + if acme, err := angie.NewACMECollector(a.Logger, angieCfg.StatusURL); err == nil { + a.addCollector(acme) + } else { + a.Error(ctx, "angie: ACME collector init failed", "error", err) + } } else if angieCfg.StubStatusURL != "" { a.addCollector(nginx.NewStubCollector(a.Logger, angieCfg.StubStatusURL)) } diff --git a/internal/topsrv/angie/acme.go b/internal/topsrv/angie/acme.go new file mode 100644 index 0000000..ac8d3e5 --- /dev/null +++ b/internal/topsrv/angie/acme.go @@ -0,0 +1,124 @@ +package angie + +import ( + "context" + "encoding/json" + "net/http" + "net/url" + "strings" + "time" + + "github.com/vmkteam/topsrv/internal/topsrv" + + "github.com/prometheus/client_golang/prometheus" + "github.com/vmkteam/embedlog" +) + +var _ topsrv.Collector = (*ACMECollector)(nil) + +// ACMEClient mirrors one entry of /status/http/acme_clients/ (angie 1.5+). +// state/certificate are operator-facing enums whose set evolves across +// releases — we surface them as Prometheus labels instead of mapping to +// numbers. "details" is a free-form sentence intended for humans; not +// useful as a label (high cardinality, churns with translations) so we +// don't decode it. next_run is int64 because we ask for date=epoch. +type ACMEClient struct { + State string `json:"state"` + Certificate string `json:"certificate"` + NextRun int64 `json:"next_run"` +} + +// ACMECollector polls angie's per-client ACME status so dashboards can see +// when a cert is renewing / failed / due for next attempt without reading +// the PEM off disk (which only tells you NotAfter, not "renewal blocked"). +type ACMECollector struct { + embedlog.Logger + + url string + client *http.Client + + state *prometheus.Desc + nextRun *prometheus.Desc +} + +// NewACMECollector extends APICollector's statusURL with /http/acme_clients/?date=epoch. +// date=epoch turns next_run into a plain Unix int so we skip ISO 8601 parsing. +func NewACMECollector(logger embedlog.Logger, statusURL string) (*ACMECollector, error) { + u, err := url.Parse(statusURL) + if err != nil { + return nil, err + } + const suffix = "/http/acme_clients/" + base := strings.TrimSuffix(u.Path, "/") + if !strings.HasSuffix(base, strings.TrimSuffix(suffix, "/")) { + base += suffix + } else { + base += "/" + } + u.Path = base + u.RawQuery = "date=epoch" + return &ACMECollector{ + Logger: logger, + url: u.String(), + client: &http.Client{Timeout: 5 * time.Second}, + state: prometheus.NewDesc( + "topsrv_angie_acme_state", + "ACME client state (value=1 per name/state/certificate tuple).", + []string{"name", "state", "certificate"}, nil, + ), + nextRun: prometheus.NewDesc( + "topsrv_angie_acme_next_run_seconds", + "Unix timestamp of next ACME action per client.", + []string{"name"}, nil, + ), + }, nil +} + +func (c *ACMECollector) Name() string { return "angie-acme" } + +func (c *ACMECollector) Describe(ch chan<- *prometheus.Desc) { + ch <- c.state + ch <- c.nextRun +} + +func (c *ACMECollector) Collect(ch chan<- prometheus.Metric) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.url, nil) + if err != nil { + return + } + resp, err := c.client.Do(req) + if err != nil { + c.Error(ctx, "angie-acme: request failed", "error", err) + return + } + defer resp.Body.Close() + + // 404 means acme_client isn't configured on this host — silently skip + // so we don't spam logs every scrape. Other non-200 status are real + // problems worth surfacing. + if resp.StatusCode == http.StatusNotFound { + return + } + if resp.StatusCode != http.StatusOK { + c.Error(ctx, "angie-acme: API returned error status", "status", resp.StatusCode) + return + } + + var clients map[string]ACMEClient + if err := json.NewDecoder(resp.Body).Decode(&clients); err != nil { + c.Error(ctx, "angie-acme: failed to decode response", "error", err) + return + } + + for name, cl := range clients { + ch <- prometheus.MustNewConstMetric(c.state, prometheus.GaugeValue, 1, name, cl.State, cl.Certificate) + // next_run is omitted by angie when state ∈ {disabled, requesting} — + // it decodes as 0; skip emitting so dashboards see "no scheduled action". + if cl.NextRun > 0 { + ch <- prometheus.MustNewConstMetric(c.nextRun, prometheus.GaugeValue, float64(cl.NextRun), name) + } + } +} diff --git a/internal/topsrv/angie/acme_test.go b/internal/topsrv/angie/acme_test.go new file mode 100644 index 0000000..722cd32 --- /dev/null +++ b/internal/topsrv/angie/acme_test.go @@ -0,0 +1,177 @@ +package angie + +import ( + "fmt" + "net/http" + "net/http/httptest" + "net/url" + "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/vmkteam/embedlog" +) + +// acmeSampleJSON mirrors the real angie 1.x /status/http/acme_clients/ +// response with date=epoch — map keyed by client name, each entry has +// state / certificate / details / next_run (Unix seconds). +const acmeSampleJSON = `{ + "client_alpha": { + "state": "ready", + "certificate": "valid", + "details": "The client is ready to request a certificate.", + "next_run": 1781333625 + }, + "client_beta": { + "state": "ready", + "certificate": "valid", + "details": "The client is ready to request a certificate.", + "next_run": 1781333641 + } +}` + +// newACMETestServer returns an httptest server that answers the angie +// status API ACME endpoint with handler(body). Other paths get 404. +func newACMETestServer(t *testing.T, handler http.HandlerFunc) *httptest.Server { + t.Helper() + mux := http.NewServeMux() + mux.Handle("/status/http/acme_clients/", handler) + return httptest.NewServer(mux) +} + +func TestACMECollectorHappyPath(t *testing.T) { + srv := newACMETestServer(t, func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, "epoch", r.URL.Query().Get("date")) + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(acmeSampleJSON)) + }) + defer srv.Close() + + c, err := NewACMECollector(embedlog.Logger{}, srv.URL+"/status/") + require.NoError(t, err) + + reg := prometheus.NewRegistry() + reg.MustRegister(c) + + mfs, err := reg.Gather() + require.NoError(t, err) + + got := map[string]map[string]float64{} + for _, mf := range mfs { + for _, m := range mf.GetMetric() { + labels := map[string]string{} + for _, l := range m.GetLabel() { + labels[l.GetName()] = l.GetValue() + } + name := mf.GetName() + value := m.GetGauge().GetValue() + if got[name] == nil { + got[name] = map[string]float64{} + } + got[name][fmt.Sprintf("%v", labels)] = value + } + } + + // state marker for both clients + assert.InDelta(t, 1.0, got["topsrv_angie_acme_state"]["map[certificate:valid name:client_alpha state:ready]"], 1e-9) + assert.InDelta(t, 1.0, got["topsrv_angie_acme_state"]["map[certificate:valid name:client_beta state:ready]"], 1e-9) + + // next_run pulled verbatim from epoch field (no string parsing) + assert.InDelta(t, 1_781_333_625, got["topsrv_angie_acme_next_run_seconds"]["map[name:client_alpha]"], 1e-6) + assert.InDelta(t, 1_781_333_641, got["topsrv_angie_acme_next_run_seconds"]["map[name:client_beta]"], 1e-6) +} + +// TestACMECollectorOmitsZeroNextRun — angie omits next_run when state is +// disabled or requesting; it decodes to 0. The collector must not emit a +// next_run metric for those clients (otherwise dashboards see "1970" as +// the next renewal — confusing). +func TestACMECollectorOmitsZeroNextRun(t *testing.T) { + body := `{"only_state":{"state":"requesting","certificate":"missing","details":"in flight"}}` + srv := newACMETestServer(t, func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte(body)) + }) + defer srv.Close() + + c, err := NewACMECollector(embedlog.Logger{}, srv.URL+"/status/") + require.NoError(t, err) + + reg := prometheus.NewRegistry() + reg.MustRegister(c) + mfs, err := reg.Gather() + require.NoError(t, err) + + for _, mf := range mfs { + if mf.GetName() != "topsrv_angie_acme_next_run_seconds" { + continue + } + assert.Empty(t, mf.GetMetric(), "next_run must not be emitted when angie omitted it") + } +} + +// TestACMECollector404Silent — when angie doesn't have acme_client configured, +// the endpoint 404s. Collector must stay silent (no logs, no metrics) so a +// host without ACME doesn't generate noise on every scrape. +func TestACMECollector404Silent(t *testing.T) { + srv := newACMETestServer(t, func(w http.ResponseWriter, r *http.Request) { + http.NotFound(w, r) + }) + defer srv.Close() + + c, err := NewACMECollector(embedlog.Logger{}, srv.URL+"/status/") + require.NoError(t, err) + + reg := prometheus.NewRegistry() + reg.MustRegister(c) + mfs, err := reg.Gather() + require.NoError(t, err) + + for _, mf := range mfs { + if mf.GetName() == "topsrv_angie_acme_state" || mf.GetName() == "topsrv_angie_acme_next_run_seconds" { + assert.Empty(t, mf.GetMetric(), "404 must yield no ACME series") + } + } +} + +// TestACMECollectorURLDerivation — collector takes a statusURL (the same +// one APICollector consumes), extends path with /http/acme_clients/, and +// attaches date=epoch. Host/port/scheme must survive untouched; the base +// path (e.g. /status/) must be preserved so we ride into the right +// sub-tree. The double-suffix case guards against a misconfigured +// statusURL that already points at the ACME endpoint — we shouldn't +// build /status/http/acme_clients/http/acme_clients/ on top. +func TestACMECollectorURLDerivation(t *testing.T) { + cases := []struct { + name string + statusURL string + wantScheme string + wantHost string + wantPath string + }{ + {"trailing slash", "http://127.0.0.1:8080/status/", "http", "127.0.0.1:8080", "/status/http/acme_clients/"}, + {"no trailing slash", "http://127.0.0.1:8080/status", "http", "127.0.0.1:8080", "/status/http/acme_clients/"}, + {"https ipv6", "https://[::1]:443/status/", "https", "[::1]:443", "/status/http/acme_clients/"}, + {"named host nondefault port", "http://angie.test:81/status/", "http", "angie.test:81", "/status/http/acme_clients/"}, + {"already at acme endpoint", "http://angie.test:81/status/http/acme_clients/", "http", "angie.test:81", "/status/http/acme_clients/"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + c, err := NewACMECollector(embedlog.Logger{}, tc.statusURL) + require.NoError(t, err) + + u, err := url.Parse(c.url) + require.NoError(t, err) + assert.Equal(t, tc.wantScheme, u.Scheme, "scheme must be preserved") + assert.Equal(t, tc.wantHost, u.Host, "host:port must be preserved") + assert.Equal(t, tc.wantPath, u.Path) + assert.Equal(t, "date=epoch", u.RawQuery) + }) + } +} + +// TestACMECollectorMalformedURL — bad URL surfaces as a constructor error, +// not a runtime panic during Collect. +func TestACMECollectorMalformedURL(t *testing.T) { + _, err := NewACMECollector(embedlog.Logger{}, "://not-a-url") + assert.Error(t, err) +} diff --git a/internal/topsrv/angie/integration_test.go b/internal/topsrv/angie/integration_test.go index 46774c3..ac3c20e 100644 --- a/internal/topsrv/angie/integration_test.go +++ b/internal/topsrv/angie/integration_test.go @@ -71,3 +71,37 @@ func TestIntegrationAngieAPI(t *testing.T) { t.Logf("angie integration: %d metric families", len(mfs)) } + +// TestIntegrationACMECollectorSurvivesNoACME — angie:minimal in docker-compose +// has no acme_client configured, so /status/http/acme_clients/ returns 404. +// The collector must stay silent: no panic, no ERROR-level logs, no series +// emitted. This is the realistic "host with angie but no ACME" path. +func TestIntegrationACMECollectorSurvivesNoACME(t *testing.T) { + url := angieAPIURL() + + resp, err := http.Get(url) + require.NoError(t, err, "angie API not reachable at %s", url) + resp.Body.Close() + require.Equal(t, 200, resp.StatusCode) + + c, err := NewACMECollector(embedlog.Logger{}, url) + require.NoError(t, err) + + reg := prometheus.NewRegistry() + reg.MustRegister(c) + + require.NotPanics(t, func() { + _, err := reg.Gather() + require.NoError(t, err) + }, "ACME collector must not panic on a 404'ing endpoint") + + mfs, err := reg.Gather() + require.NoError(t, err) + for _, mf := range mfs { + switch mf.GetName() { + case "topsrv_angie_acme_state", "topsrv_angie_acme_next_run_seconds": + assert.Empty(t, mf.GetMetric(), + "%s must yield zero series when acme_client isn't configured", mf.GetName()) + } + } +} diff --git a/internal/topsrv/nginx/discover.go b/internal/topsrv/nginx/discover.go index 53e7186..0ec81ef 100644 --- a/internal/topsrv/nginx/discover.go +++ b/internal/topsrv/nginx/discover.go @@ -21,6 +21,13 @@ type DiscoverResult struct { APIStatusPath string // e.g. "/status/", empty if not found (Angie api directive) APIStatusPort int // listen port of the server with api directive APIStatusHost string // listen host of the server with api directive (empty = 127.0.0.1) + + // ACMEDirectives counts acme_client directives we saw; ACMECertsFound + // counts how many of their certificate.pem files were on disk at the + // default state-path. Caller logs a warning when the gap suggests a + // custom acme_client_path the parser doesn't know about. + ACMEDirectives int + ACMECertsFound int } // AccessLogEntry represents a single access_log directive from nginx config. @@ -38,8 +45,18 @@ var ( locationRe = regexp.MustCompile(`location\s+(?:=\s+)?(\S+)\s*\{`) listenRe = regexp.MustCompile(`listen\s+(?:(\S+):)?(\d+)`) sslCertRe = regexp.MustCompile(`ssl_certificate\s+(?:"([^"]+)"|(\S+))\s*;`) + acmeClientRe = regexp.MustCompile(`^\s*acme_client\s+(\S+)\s+`) ) +// Angie's package-install default for ACME state: each acme_client name gets +// a subdir containing certificate.pem. Overridable per-client via the +// acme_client_path directive, which we don't parse — operators with custom +// paths can pin a static ssl_certificate /full/path directive instead. +// Exposed as var (not const) for test isolation. +var defaultACMEStatePath = "/var/lib/angie/acme" + +const acmeCertFileName = "certificate.pem" + // DiscoverConfig parses nginx.conf and all includes, extracting log_format and access_log directives. func DiscoverConfig(configPath string) (*DiscoverResult, error) { result := &DiscoverResult{ @@ -57,6 +74,7 @@ func DiscoverConfig(configPath string) (*DiscoverResult, error) { extractStubStatus(content, result) extractAPIStatus(content, result) extractSSLCertificates(content, dir, result) + result.ACMEDirectives, result.ACMECertsFound = extractACMECertificates(content, result) // Extract access_log directives. for _, m := range accessLogRe.FindAllStringSubmatch(content, -1) { @@ -253,6 +271,59 @@ func extractSSLCertificates(content, baseDir string, result *DiscoverResult) { } } +// extractACMECertificates finds acme_client directives and adds the +// resulting //certificate.pem path to +// SSLCertificates so the SSL collector picks up ACME-managed certs the +// same way it picks up static ones — angie writes the PEM here as part +// of normal renewal. ssl_certificate $acme_cert_ uses a runtime +// variable that the static-path regex skips, so without this step the +// cert is invisible to file-based discovery. +// +// Missing files are silently skipped: angie may not have issued a cert +// yet at discovery time. The SSLCollector itself re-reads every 5 +// minutes, so once the cert lands the expiry metric appears at next +// agent restart. +// +// Returns the number of acme_client directives seen and the number of +// matching cert files found on disk so the caller can warn when the +// state_path doesn't match angie's default (build-time override). +func extractACMECertificates(content string, result *DiscoverResult) (found, picked int) { + seen := make(map[string]bool, len(result.SSLCertificates)) + for _, p := range result.SSLCertificates { + seen[p] = true + } + for _, line := range strings.Split(content, "\n") { + l := strings.TrimSpace(line) + if strings.HasPrefix(l, "#") { + continue + } + m := acmeClientRe.FindStringSubmatch(l) + if m == nil { + continue + } + name := m[1] + // Reject path-traversal: acme_client name is forwarded into a + // filepath.Join, so a malicious config could otherwise escape + // the state path. Names with slashes or .. components are not + // valid angie identifiers anyway. + if name == "" || name == "." || name == ".." || strings.ContainsAny(name, `/\`) { + continue + } + found++ + path := filepath.Join(defaultACMEStatePath, name, acmeCertFileName) + if seen[path] { + continue + } + if _, err := os.Stat(path); err != nil { + continue + } + seen[path] = true + result.SSLCertificates = append(result.SSLCertificates, path) + picked++ + } + return found, picked +} + // extractQuotedParts extracts content from single-quoted strings in a line. func extractQuotedParts(line string) []string { var parts []string diff --git a/internal/topsrv/nginx/ssl_test.go b/internal/topsrv/nginx/ssl_test.go index d078f6a..74629cb 100644 --- a/internal/topsrv/nginx/ssl_test.go +++ b/internal/topsrv/nginx/ssl_test.go @@ -346,3 +346,125 @@ func TestDiscoverSSLRelativePath(t *testing.T) { require.Len(t, result.SSLCertificates, 1) assert.Equal(t, filepath.Join(dir, "ssl/myshows.me.fullchain.cer"), result.SSLCertificates[0]) } + +// withTestACMEStatePath redirects defaultACMEStatePath to a t.TempDir with cleanup. +func withTestACMEStatePath(t *testing.T) string { + t.Helper() + tmp := t.TempDir() + orig := defaultACMEStatePath + defaultACMEStatePath = tmp + t.Cleanup(func() { defaultACMEStatePath = orig }) + return tmp +} + +// writeACMEClientCert plants a self-signed certificate.pem in the angie +// state-path layout: //certificate.pem. +func writeACMEClientCert(t *testing.T, statePath, client, cn string) { + t.Helper() + dir := filepath.Join(statePath, client) + require.NoError(t, os.MkdirAll(dir, 0o755)) + certFile := filepath.Join(dir, "certificate.pem") + + src := generateTestCert(t, cn, time.Now().Add(60*24*time.Hour)) + data, err := os.ReadFile(src) + require.NoError(t, err) + require.NoError(t, os.WriteFile(certFile, data, 0o600)) +} + +// TestExtractACMECertificatesPicksUpRealLayout — regression: `ssl_certificate +// $var` defeats the static-path regex, so we must fall through to the +// default state-path layout to find ACME-managed certs. +func TestExtractACMECertificatesPicksUpRealLayout(t *testing.T) { + tmp := withTestACMEStatePath(t) + + // Plant two ACME client certs the way angie actually lays them out + // (verified on a real angie 1.x install). + writeACMEClientCert(t, tmp, "client_alpha", "alpha.example") + writeACMEClientCert(t, tmp, "client_beta", "beta.example") + + // Minimal angie.conf with two acme_client directives + the variable- + // based ssl_certificate that defeats static-path discovery. + content := strings.Join([]string{ + "acme_client client_alpha https://acme-v02.api.letsencrypt.org/directory;", + "acme_client client_beta https://acme-v02.api.letsencrypt.org/directory;", + "ssl_certificate $acme_cert_client_alpha;", + }, "\n") + + result := &DiscoverResult{} + extractACMECertificates(content, result) + + assert.ElementsMatch(t, []string{ + filepath.Join(tmp, "client_alpha", "certificate.pem"), + filepath.Join(tmp, "client_beta", "certificate.pem"), + }, result.SSLCertificates) +} + +// TestExtractACMECertificatesSkipsMissingFiles — acme_client exists in the +// config but the cert isn't on disk yet (e.g. angie hasn't issued it). +// Discovery must silently skip rather than emitting a bad path. +func TestExtractACMECertificatesSkipsMissingFiles(t *testing.T) { + withTestACMEStatePath(t) // empty tmp — no certs on disk + + content := "acme_client absent_client https://acme-v02.api.letsencrypt.org/directory;" + result := &DiscoverResult{} + extractACMECertificates(content, result) + + assert.Empty(t, result.SSLCertificates) +} + +// TestExtractACMECertificatesDedupes — if the same cert path is already +// present (e.g. operator pinned it via a literal ssl_certificate too), +// don't add it twice. +func TestExtractACMECertificatesDedupes(t *testing.T) { + tmp := withTestACMEStatePath(t) + writeACMEClientCert(t, tmp, "shared", "shared.example") + pinnedPath := filepath.Join(tmp, "shared", "certificate.pem") + + content := "acme_client shared https://acme-v02.api.letsencrypt.org/directory;" + result := &DiscoverResult{SSLCertificates: []string{pinnedPath}} + extractACMECertificates(content, result) + + assert.Equal(t, []string{pinnedPath}, result.SSLCertificates) +} + +// TestExtractACMECertificatesSkipsCommented — operators frequently leave +// commented-out `# acme_client old_name ...;` lines after migration; the +// regex must not match them, otherwise a stale cert directory left on +// disk would resurrect as a monitored series. +func TestExtractACMECertificatesSkipsCommented(t *testing.T) { + tmp := withTestACMEStatePath(t) + writeACMEClientCert(t, tmp, "old_client", "old.example") + + content := strings.Join([]string{ + "# acme_client old_client https://acme-v02.api.letsencrypt.org/directory;", + " #acme_client also_commented https://acme-v02.api.letsencrypt.org/directory;", + }, "\n") + + result := &DiscoverResult{} + found, picked := extractACMECertificates(content, result) + + assert.Empty(t, result.SSLCertificates) + assert.Zero(t, found, "commented directives must not count") + assert.Zero(t, picked) +} + +// TestExtractACMECertificatesRejectsTraversal — defense in depth: a name +// containing path separators or .. would let a misconfigured acme_client +// escape defaultACMEStatePath when filepath.Join cleans the path. Skip +// such names rather than serving up arbitrary paths. +func TestExtractACMECertificatesRejectsTraversal(t *testing.T) { + withTestACMEStatePath(t) + + content := strings.Join([]string{ + "acme_client ../escape https://acme-v02.api.letsencrypt.org/directory;", + "acme_client a/b https://acme-v02.api.letsencrypt.org/directory;", + `acme_client a\b https://acme-v02.api.letsencrypt.org/directory;`, + "acme_client . https://acme-v02.api.letsencrypt.org/directory;", + }, "\n") + + result := &DiscoverResult{} + found, _ := extractACMECertificates(content, result) + + assert.Empty(t, result.SSLCertificates) + assert.Zero(t, found, "traversal-shaped names must be rejected before counting") +}