diff --git a/agent-schema.json b/agent-schema.json index 758f5a361..c546e6e12 100644 --- a/agent-schema.json +++ b/agent-schema.json @@ -1402,24 +1402,26 @@ }, "allowed_domains": { "type": "array", - "description": "Allow-list of domains the fetch tool is permitted to fetch (only valid for type 'fetch'). A pattern matches the host exactly (case-insensitive) and any of its subdomains; e.g. 'example.com' matches 'example.com' and 'docs.example.com' but not 'badexample.com'. A leading dot ('.example.com') restricts the match to strict subdomains. Mutually exclusive with 'blocked_domains'.", + "description": "Allow-list of domains the fetch tool is permitted to fetch (only valid for type 'fetch'). Patterns are case-insensitive. A bare host ('example.com') matches the apex AND any subdomain. A leading dot ('.example.com') or wildcard ('*.example.com') matches strict subdomains only. CIDR ranges ('10.0.0.0/8', '::1/128') match when the URL host parses as an IP inside the network. Mutually exclusive with 'blocked_domains'.", "items": { "type": "string" }, "examples": [ ["docker.com", "docs.docker.com"], - ["github.com", "raw.githubusercontent.com"] + ["github.com", "raw.githubusercontent.com"], + ["*.example.com"] ] }, "blocked_domains": { "type": "array", - "description": "Deny-list of domains the fetch tool is forbidden to fetch (only valid for type 'fetch'). Uses the same matching rules as 'allowed_domains'. Mutually exclusive with 'allowed_domains'.", + "description": "Deny-list of domains the fetch tool is forbidden to fetch (only valid for type 'fetch'). Uses the same matching rules as 'allowed_domains' (bare host, leading-dot or '*.' subdomain wildcard, and CIDR ranges for IP hosts). Mutually exclusive with 'allowed_domains'.", "items": { "type": "string" }, "examples": [ ["internal.example.com"], - ["169.254.169.254"] + ["169.254.169.254"], + ["169.254.0.0/16", "10.0.0.0/8"] ] }, "url": { diff --git a/docs/tools/fetch/index.md b/docs/tools/fetch/index.md index ad64ffba2..d465ccd13 100644 --- a/docs/tools/fetch/index.md +++ b/docs/tools/fetch/index.md @@ -40,7 +40,9 @@ Domain patterns in `allowed_domains` and `blocked_domains` use the following rul - **Bare domain** — `example.com` matches the host `example.com` _and_ any subdomain such as `docs.example.com`. It does **not** match unrelated hosts that share a suffix (e.g. `badexample.com`). - **Leading dot** — `.example.com` matches **only** strict subdomains (`docs.example.com`, `a.b.example.com`), not the apex `example.com`. +- **Wildcard glob** — `*.example.com` is an alias for the leading-dot form; the apex is excluded. The `*` is only valid as a leading `*.` token (entries like `foo.*`, `*.*.example.com`, or a bare `*` are rejected at config-load time). - **IP literal** — IP addresses are matched exactly (`169.254.169.254`). +- **CIDR range** — `169.254.0.0/16`, `10.0.0.0/8`, `::1/128`, `fc00::/7`. Matches when the URL's host parses as an IP inside the network. Hostname hosts never match a CIDR pattern. Malformed CIDRs are rejected at config-load time. - **Trailing dots** in FQDN-form URLs (`http://example.com./`) are stripped before matching, so they cannot bypass a deny-list entry. The lists are mutually exclusive: a single fetch toolset may set either `allowed_domains` or `blocked_domains`, but not both. @@ -50,7 +52,7 @@ When a list is configured, every redirect target is re-checked against the same
⚠️ Limitations
-

Matching is purely string-based on the URL host. It does not perform DNS resolution and does not normalise alternative IP encodings (decimal 2852039166, hex 0xa9.0xfe.0xa9.0xfe, octal, IPv4-mapped IPv6, etc.). If you need to deny access to a specific IP, also list its alternative encodings, or block at the network layer.

+

Matching is purely string-based on the URL host. It does not perform DNS resolution and does not normalise alternative IP encodings (decimal 2852039166, hex 0xa9.0xfe.0xa9.0xfe, octal, etc. IPv4-mapped IPv6 addresses ARE normalized to their IPv4 form). If you need to deny access to a specific IP, also list its alternative encodings, or block at the network layer.

### Custom Timeout @@ -78,8 +80,11 @@ toolsets: toolsets: - type: fetch blocked_domains: - - 169.254.169.254 # cloud metadata endpoint - - internal.example.com # internal corporate hostnames + - 169.254.169.254 # cloud metadata endpoint (literal IP) + - 169.254.0.0/16 # entire link-local range (CIDR) + - 10.0.0.0/8 # RFC1918 private range + - "*.internal.example.com" # any subdomain (wildcard) + - internal.example.com # internal corporate hostname ``` ## Tool Interface diff --git a/examples/fetch_domain_filtering.yaml b/examples/fetch_domain_filtering.yaml index cb1c8c3fa..9da8d11a0 100644 --- a/examples/fetch_domain_filtering.yaml +++ b/examples/fetch_domain_filtering.yaml @@ -36,6 +36,9 @@ agents: toolsets: - type: fetch blocked_domains: - - 169.254.169.254 # cloud instance metadata - - 100.100.100.200 # alibaba/oracle metadata + - 169.254.169.254 # cloud instance metadata (literal IP) + - 169.254.0.0/16 # whole link-local range (CIDR) + - 10.0.0.0/8 # RFC1918 private network + - 100.100.100.200 # alibaba/oracle metadata - metadata.google.internal + - "*.internal.example.com" # any subdomain (wildcard glob) diff --git a/pkg/config/latest/validate.go b/pkg/config/latest/validate.go index 42787f87b..859fd16c1 100644 --- a/pkg/config/latest/validate.go +++ b/pkg/config/latest/validate.go @@ -215,14 +215,46 @@ func (t *Toolset) validate() error { return nil } -// validateDomainPatterns rejects empty / whitespace-only entries in a fetch -// allow- or block-list, since they silently match nothing and turn the list -// into a foot-gun (e.g. allowed_domains: [""] would reject every URL). +// validateDomainPatterns rejects empty / whitespace-only entries and +// malformed wildcard or CIDR patterns in a fetch allow- or block-list. +// +// Catching these at config-load time turns silent foot-guns (e.g. +// `allowed_domains: [""]` rejecting every URL, `*.foo.*` matching nothing) +// into actionable errors. Plain hostnames and the leading-dot subdomain form +// are intentionally not validated for syntax — the matcher is purely +// string-based and any non-conforming entry simply never matches. func validateDomainPatterns(field string, patterns []string) error { for i, p := range patterns { - if strings.TrimSpace(p) == "" { + trimmed := strings.TrimSpace(p) + if trimmed == "" { return fmt.Errorf("%s[%d] must not be empty", field, i) } + if err := validateDomainPattern(trimmed); err != nil { + return fmt.Errorf("%s[%d] %q is invalid: %w", field, i, p, err) + } + } + return nil +} + +// validateDomainPattern checks a single (already trimmed, non-empty) entry. +func validateDomainPattern(p string) error { + // CIDR notation: must parse cleanly. We deliberately accept any /-bearing + // string as "intended to be a CIDR" so a typo like "10.0.0.0/33" is + // reported instead of being silently treated as a hostname. + if strings.Contains(p, "/") { + if _, _, err := net.ParseCIDR(p); err != nil { + return fmt.Errorf("not a valid CIDR: %w", err) + } + return nil + } + // Wildcards: only the leading "*." form is supported. Anything else + // ("foo.*", "*foo*", "**.example.com") would silently match nothing + // under the current matcher, which is almost never what the user wants. + if strings.Contains(p, "*") { + rest, ok := strings.CutPrefix(p, "*.") + if !ok || rest == "" || strings.Contains(rest, "*") { + return errors.New("'*' is only allowed as a leading '*.' wildcard, e.g. '*.example.com'") + } } return nil } diff --git a/pkg/config/latest/validate_test.go b/pkg/config/latest/validate_test.go index 278aa3d70..e5945e252 100644 --- a/pkg/config/latest/validate_test.go +++ b/pkg/config/latest/validate_test.go @@ -327,6 +327,106 @@ agents: `, wantErr: "blocked_domains[0] must not be empty", }, + { + name: "fetch with wildcard subdomain pattern", + config: ` +version: "8" +agents: + root: + model: "openai/gpt-4" + toolsets: + - type: fetch + allowed_domains: + - "*.example.com" +`, + wantErr: "", + }, + { + name: "fetch with ipv4 CIDR pattern", + config: ` +version: "8" +agents: + root: + model: "openai/gpt-4" + toolsets: + - type: fetch + blocked_domains: + - 169.254.0.0/16 + - 10.0.0.0/8 +`, + wantErr: "", + }, + { + name: "fetch with ipv6 CIDR pattern", + config: ` +version: "8" +agents: + root: + model: "openai/gpt-4" + toolsets: + - type: fetch + blocked_domains: + - "fc00::/7" + - "::1/128" +`, + wantErr: "", + }, + { + name: "malformed CIDR is rejected", + config: ` +version: "8" +agents: + root: + model: "openai/gpt-4" + toolsets: + - type: fetch + blocked_domains: + - 10.0.0.0/33 +`, + wantErr: "not a valid CIDR", + }, + { + name: "interior wildcard is rejected", + config: ` +version: "8" +agents: + root: + model: "openai/gpt-4" + toolsets: + - type: fetch + allowed_domains: + - "foo.*" +`, + wantErr: "'*' is only allowed as a leading '*.' wildcard", + }, + { + name: "double wildcard is rejected", + config: ` +version: "8" +agents: + root: + model: "openai/gpt-4" + toolsets: + - type: fetch + allowed_domains: + - "*.*.example.com" +`, + wantErr: "'*' is only allowed as a leading '*.' wildcard", + }, + { + name: "bare star is rejected", + config: ` +version: "8" +agents: + root: + model: "openai/gpt-4" + toolsets: + - type: fetch + allowed_domains: + - "*" +`, + wantErr: "'*' is only allowed as a leading '*.' wildcard", + }, } for _, tt := range tests { diff --git a/pkg/tools/builtin/fetch.go b/pkg/tools/builtin/fetch.go index af85c994f..235608e70 100644 --- a/pkg/tools/builtin/fetch.go +++ b/pkg/tools/builtin/fetch.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "io" + "net" "net/http" "net/url" "slices" @@ -296,18 +297,85 @@ func (h *fetchHandler) checkDomainAllowed(u *url.URL) error { // matchesDomain reports whether host matches pattern (case-insensitive). // -// A bare pattern ("example.com") matches the host exactly or any subdomain -// ("docs.example.com"); it does NOT match unrelated hosts that share a suffix -// ("badexample.com"). A pattern with a leading dot (".example.com") matches -// strict subdomains only — the apex "example.com" is excluded. +// Supported pattern shapes: +// +// - **Bare domain** ("example.com") matches the host exactly _or_ any +// subdomain ("docs.example.com"); it does NOT match unrelated hosts that +// share a suffix ("badexample.com"). +// - **Leading dot** (".example.com") matches strict subdomains only — the +// apex "example.com" is excluded. +// - **Wildcard glob** ("*.example.com") is an alias for the leading-dot +// form: it matches strict subdomains only. No other use of "*" is +// supported (e.g. "foo.*", "*foo*" are rejected by validation and would +// never match here). +// - **CIDR** ("10.0.0.0/8", "169.254.0.0/16", "::1/128", "fc00::/7") +// matches when the host parses as an IP address inside the network. +// Hostname hosts never match a CIDR pattern. // // Trailing dots used in FQDN form ("example.com.") are stripped from both // host and pattern before matching, so a URL like http://example.com./ cannot // be used to bypass a deny-list entry for example.com. func matchesDomain(host, pattern string) bool { - host = strings.TrimSuffix(strings.ToLower(strings.TrimSpace(host)), ".") - pattern = strings.TrimSuffix(strings.ToLower(strings.TrimSpace(pattern)), ".") - if host == "" || pattern == "" || pattern == "." { + host = strings.TrimSpace(host) + pattern = strings.TrimSpace(pattern) + if host == "" || pattern == "" { + return false + } + + // CIDR pattern: the host must parse as an IP address inside the network. + // CIDRs always contain '/', so we can detect them cheaply before any other + // normalisation. Hostname-style hosts never match a CIDR pattern. + if strings.Contains(pattern, "/") { + if _, ipNet, err := net.ParseCIDR(pattern); err == nil { + // url.Hostname() already strips IPv6 brackets, but be defensive. + ipStr := strings.TrimSuffix(strings.Trim(host, "[]"), ".") + if ip := net.ParseIP(ipStr); ip != nil { + // Normalize IPv4-mapped IPv6 addresses (::ffff:a.b.c.d) to their + // IPv4 form before checking CIDR membership. Without this, an + // attacker can bypass an IPv4 deny-list like "169.254.0.0/16" by + // using the IPv6-mapped form "::ffff:169.254.169.254". + // + // net.IP.To4() returns nil for "true" IPv6 addresses and the + // 4-byte IPv4 form for IPv4 or IPv4-mapped-IPv6. + if ipv4 := ip.To4(); ipv4 != nil { + return ipNet.Contains(ipv4) + } + return ipNet.Contains(ip) + } + return false + } + // Malformed CIDRs are rejected at config-load time; if one slips + // through (e.g. via the programmatic API), fall through to the + // string matcher below, which will never match a host. + } + + // Normalize IPv4-mapped IPv6 addresses to their IPv4 form for string + // comparison. This ensures that "::ffff:169.254.169.254" matches a + // literal pattern "169.254.169.254" (and vice versa). + if ip := net.ParseIP(strings.Trim(host, "[]")); ip != nil { + if ipv4 := ip.To4(); ipv4 != nil { + host = ipv4.String() + } else { + host = ip.String() + } + } + if ip := net.ParseIP(strings.Trim(pattern, "[]")); ip != nil { + if ipv4 := ip.To4(); ipv4 != nil { + pattern = ipv4.String() + } else { + pattern = ip.String() + } + } + + host = strings.TrimSuffix(strings.ToLower(host), ".") + pattern = strings.TrimSuffix(strings.ToLower(pattern), ".") + + // Wildcard glob "*.example.com" is an alias for ".example.com". + if rest, ok := strings.CutPrefix(pattern, "*."); ok { + pattern = "." + rest + } + + if pattern == "" || pattern == "." { return false } if strings.HasPrefix(pattern, ".") { diff --git a/pkg/tools/builtin/fetch_test.go b/pkg/tools/builtin/fetch_test.go index 2f71dd080..3bcc8e564 100644 --- a/pkg/tools/builtin/fetch_test.go +++ b/pkg/tools/builtin/fetch_test.go @@ -439,6 +439,33 @@ func TestMatchesDomain(t *testing.T) { {"trailing dot host matches subdomain pattern", "docs.example.com.", "example.com", true}, {"trailing dot pattern matches apex host", "example.com", "example.com.", true}, {"trailing dot host matches strict-subdomain pattern", "docs.example.com.", ".example.com", true}, + + // Wildcard glob form: alias for the leading-dot strict-subdomain match. + {"wildcard matches subdomain", "docs.example.com", "*.example.com", true}, + {"wildcard matches deep subdomain", "a.b.example.com", "*.example.com", true}, + {"wildcard does NOT match apex", "example.com", "*.example.com", false}, + {"wildcard does NOT match unrelated suffix", "badexample.com", "*.example.com", false}, + {"wildcard with trailing dot host", "docs.example.com.", "*.example.com", true}, + {"interior wildcard never matches (defense in depth)", "foo.example.com", "foo.*", false}, + + // CIDR form. + {"ipv4 inside /16", "169.254.169.254", "169.254.0.0/16", true}, + {"ipv4 outside /16", "10.0.0.1", "169.254.0.0/16", false}, + {"ipv4 /32 exact", "169.254.169.254", "169.254.169.254/32", true}, + {"ipv4 /32 mismatch", "169.254.169.255", "169.254.169.254/32", false}, + {"private /8", "10.1.2.3", "10.0.0.0/8", true}, + {"hostname does not match cidr", "example.com", "10.0.0.0/8", false}, + {"ipv6 loopback", "::1", "::1/128", true}, + {"ipv6 ula", "fc00::1234", "fc00::/7", true}, + {"ipv6 outside ula", "2001:db8::1", "fc00::/7", false}, + {"malformed cidr never matches", "169.254.169.254", "10.0.0.0/33", false}, + + // IPv4-mapped IPv6 bypass prevention (SSRF defense). + {"ipv4-mapped ipv6 matches ipv4 cidr", "::ffff:169.254.169.254", "169.254.0.0/16", true}, + {"ipv4-mapped ipv6 matches ipv4 /32", "::ffff:10.0.0.1", "10.0.0.1/32", true}, + {"ipv4-mapped ipv6 outside ipv4 cidr", "::ffff:10.0.0.1", "169.254.0.0/16", false}, + {"ipv4-mapped ipv6 matches ipv4 literal", "::ffff:169.254.169.254", "169.254.169.254", true}, + {"ipv4 literal matches ipv4-mapped ipv6 cidr (edge case)", "169.254.169.254", "::ffff:169.254.0.0/112", true}, } for _, tc := range tests { @@ -585,3 +612,35 @@ func TestFetch_BlockedDomains_RejectsRedirectToBlockedHost(t *testing.T) { assert.Contains(t, result.Output, "is blocked by blocked_domains") assert.Contains(t, result.Output, "169.254.169.254") } + +// Additional edge case tests for security review +func TestMatchesDomain_EdgeCases(t *testing.T) { + tests := []struct { + name string + host string + pattern string + want bool + }{ + // Port handling (should be stripped by url.Hostname() before matchesDomain is called) + {"host with port should not match", "example.com:8080", "example.com", false}, + + // IPv6 with zone ID (should be stripped by url.Hostname()) + {"ipv6 with zone id should not match", "fe80::1%eth0", "fe80::1", false}, + + // Empty CIDR prefix + {"empty after wildcard", "example.com", "*.", false}, + + // Case sensitivity in IPv6 + {"ipv6 case insensitive", "2001:DB8::1", "2001:db8::1", true}, + + // Brackets in pattern (defensive) + {"brackets in pattern", "::1", "[::1]", true}, + {"brackets in host", "[::1]", "::1", true}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.want, matchesDomain(tc.host, tc.pattern)) + }) + } +}