ethos-link · fidalgo · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026
diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml
@@ -15,7 +15,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        ruby-version: ["3.2", "3.4"]
+        ruby-version: ["3.3", "3.4", "4.0.3"]
 
     steps:
       - uses: actions/checkout@v4

diff --git a/README.md b/README.md
@@ -32,6 +32,8 @@ The default rule set includes:
 
 ## Installation
 
+Crawlscope requires Ruby 3.3 or newer.
+
 Add this line to your application's Gemfile:
 
 ```ruby
@@ -166,6 +168,7 @@ Available environment overrides:
 - `TIMEOUT=30`
 - `NETWORK_IDLE_TIMEOUT=10`
 - `CONCURRENCY=5`
+- `FETCH_EXECUTOR=threaded` or `FETCH_EXECUTOR=async`
 
 Available tasks:
 
@@ -196,6 +199,12 @@ bundle exec rake 'crawlscope:validate:ldjson[https://example.com/article]'
 Plain `rake` does not pass `--url` style flags to tasks. Use `URL=...` or the
 task-argument form above instead.
 
+`FETCH_EXECUTOR=async` is the default for HTTP crawling. It uses Ruby's fiber
+scheduler and Async::HTTP through Faraday, preserving the same `CONCURRENCY`
+bound. Use `FETCH_EXECUTOR=threaded` or `--fetch-executor threaded` for the
+thread-pool executor. Browser rendering uses the threaded executor by default
+because async fetch execution is only supported with HTTP rendering.
+
 `crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap. Without `URL`, it checks the configured base URL, falling back to `http://localhost:3000`.
 
 ### Structured Data URL Audit

diff --git a/Rakefile b/Rakefile
@@ -183,7 +183,7 @@ namespace :release do
 end
 
 namespace :crawlscope do
-  desc "Validate URLs with all default Crawlscope rules. Args: [url,sitemap,rules]. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
+  desc "Validate URLs with all default Crawlscope rules. Args: [url,sitemap,rules]. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY, FETCH_EXECUTOR"
   task :validate, [:url, :sitemap, :rules] do |_task, args|
     Crawlscope::RakeTasks.validate(url: args[:url], sitemap_path: args[:sitemap], rule_names: args[:rules])
   end

diff --git a/UPGRADE.md b/UPGRADE.md
@@ -4,4 +4,19 @@ Use this file for host-app migration notes when a release changes public
 contracts, required setup, component locals, generated assets, or runtime
 behavior.
 
-No special upgrade notes have been published yet.
+## Next Release
+
+### Ruby 3.3 is now required
+
+Crawlscope now depends on the current Async runtime for production async HTTP
+fetching. Host applications must run Ruby 3.3 or newer before upgrading.
+
+Recommended migration:
+
+1. Upgrade the host application runtime to Ruby 3.3 or newer.
+2. Run `bundle update crawlscope async async-http async-http-faraday`.
+3. Crawlscope now uses `FETCH_EXECUTOR=async` by default for HTTP crawling.
+4. Set `FETCH_EXECUTOR=threaded` or pass `--fetch-executor threaded` for a
+   conservative rollout or for explicit thread-pool execution.
+5. Browser rendering continues to use threaded execution by default because
+   async fetch execution is only supported with HTTP rendering.
diff --git a/async-performance-assessment.md b/async-performance-assessment.md
@@ -0,0 +1,167 @@
+# Async Performance Assessment
+
+Date: 2026-06-01
+
+## Conclusion
+
+Async HTTP does not deliver a 2x speedup over Crawlscope's existing threaded
+fetch baseline at the same concurrency.
+
+It is much faster than sequential fetching, but Crawlscope was already parallel
+before this work. The meaningful comparison is therefore:
+
+- `fetch_executor: :threaded`, `CONCURRENCY=N`
+- `fetch_executor: :async`, `CONCURRENCY=N`
+
+On that comparison, async is roughly 1.01x to 1.11x faster in most local
+delayed-response scenarios, with one Ruby 3.4.8 short-delay/high-concurrency
+case reaching 1.35x. It does not reach 2x.
+
+The additional production work did produce a 2x+ improvement in a different
+place: child sitemap expansion now uses the bounded fetch executor instead of
+walking sitemap indexes serially. In the local delayed sitemap benchmark,
+bounded expansion is about 7.4x to 7.6x faster than sequential expansion on
+Ruby 3.4.8 and Ruby 4.0.3.
+
+## Benchmark Setup
+
+Benchmarks use a local delayed HTTP server and full `Crawlscope::Crawl` runs,
+not isolated HTTP calls.
+
+Measured scenarios:
+
+- direct sitemap pages,
+- uncrawled internal link targets resolved by the links rule,
+- 48 delayed pages or targets,
+- 20ms and 80ms response delays,
+- concurrency 8 and 16,
+- median of 3 runs per executor.
+
+Scripts:
+
+- `test/performance/async_fetch_benchmark.rb`
+- `test/performance/fetch_executor_matrix.rb`
+- `test/performance/sitemap_expansion_benchmark.rb`
+
+## Results
+
+### Ruby 4.0.3
+
+| Scenario | Threaded | Async | Async vs Threaded |
+| --- | ---: | ---: | ---: |
+| direct pages, 20ms, c8 | 0.159s | 0.147s | 1.08x |
+| direct pages, 20ms, c16 | 0.092s | 0.088s | 1.04x |
+| direct pages, 80ms, c8 | 0.528s | 0.521s | 1.01x |
+| direct pages, 80ms, c16 | 0.280s | 0.275s | 1.02x |
+| link targets, 20ms, c8 | 0.179s | 0.170s | 1.06x |
+| link targets, 80ms, c8 | 0.616s | 0.601s | 1.02x |
+
+### Ruby 3.4.8
+
+| Scenario | Threaded | Async | Async vs Threaded |
+| --- | ---: | ---: | ---: |
+| direct pages, 20ms, c8 | 0.163s | 0.159s | 1.03x |
+| direct pages, 20ms, c16 | 0.110s | 0.082s | 1.35x |
+| direct pages, 80ms, c8 | 0.524s | 0.518s | 1.01x |
+| direct pages, 80ms, c16 | 0.278s | 0.271s | 1.02x |
+| link targets, 20ms, c8 | 0.190s | 0.170s | 1.11x |
+| link targets, 80ms, c8 | 0.610s | 0.607s | 1.01x |
+
+The earlier simple benchmark showed no async advantage either; async c8 was slightly slower than threaded c8:
+
+| Runtime | Sequential threaded | Threaded c8 | Async c8 |
+| --- | ---: | ---: | ---: |
+| Ruby 4.0.3 | 2.141s | 0.284s | 0.305s |
+| Ruby 3.4.8 | 2.133s | 0.272s | 0.328s |
+
+### Child Sitemap Expansion
+
+This benchmark measures a sitemap index with eight delayed child sitemaps.
+The Sequential column forces a sequential executor by setting concurrency to 1.
+The bounded columns use the same crawl path with concurrency 8.
+
+| Runtime | Sequential | Threaded c8 | Async c8 | Best Speedup |
+| --- | ---: | ---: | ---: | ---: |
+| Ruby 4.0.3 | 2.123s | 0.280s | 0.285s | 7.58x |
+| Ruby 3.4.8 | 2.100s | 0.284s | 0.276s | 7.61x |
+
+## Why Async Is Not 2x Faster
+
+The current threaded implementation is already near the latency lower bound.
+
+For 48 pages at 80ms delay and concurrency 16, the theoretical network floor is
+roughly:
+
+```text
+ceil(48 / 16) * 0.08s = 0.24s
+```
+
+Measured results:
+
+- threaded: 0.280s on Ruby 4.0.3
+- async: 0.275s on Ruby 4.0.3
+
+That leaves only about 40ms of threaded overhead for async to eliminate.
+Async cannot produce a 2x improvement when the threaded baseline is already
+within about 17% of the ideal network floor.
+
+The main reasons:
+
+1. Crawlscope already had bounded parallel fetching through
+   `Concurrent::FixedThreadPool`.
+2. The benchmark is I/O latency dominated, not CPU or thread scheduling
+   dominated.
+3. Faraday still adds request/response abstraction overhead in both modes.
+4. Local HTTP/1.1 requests do not create a major multiplexing advantage for
+   async.
+5. The async executor improves resource usage (lower thread pressure) more
+   than wall-clock time when the thread pool is already sized correctly.
+
+## What Did Improve
+
+This work still improves the architecture and specific crawl paths:
+
+- async HTTP is now a real transport through `async-http-faraday`,
+- fetch executor selection is explicit and documented,
+- browser rendering is guarded from async misuse,
+- output ordering remains stable,
+- uncrawled link targets are resolved as a bounded batch instead of repeatedly
+  through the single-target path,
+- child sitemap indexes are expanded through the same bounded executor,
+- canonical target resolution reuses the link-target cache,
+- per-page link extraction and uniqueness summaries can run in bounded parallel
+  when the selected executor supports it.
+
+The biggest speed improvement is not async versus threads. It is batching extra
+network-bound and per-page rule work through the same bounded executor. In slow
+child-sitemap or link-target cases, the batch shape turns what would otherwise
+be a serial second wave into roughly `ceil(items / concurrency) * latency`.
+
+## Recommendation
+
+Keep the async implementation, but do not claim a 2x wall-clock speedup over
+the current threaded executor.
+
+Position it as:
+
+- production-ready for HTTP crawling,
+- useful for fiber-scheduler deployments,
+- potentially better at higher concurrency with lower thread pressure,
+- equivalent-to-slightly-faster than threaded for the measured local workloads.
+
+If the product goal is a reliable 2x improvement over the previous threaded
+baseline, the next performance work should not be "more async." It should
+target:
+
+1. persistent fetch/result caching across rule phases,
+2. optional higher concurrency with per-host rate limits,
+3. streaming per-page analysis so CPU work overlaps network waits,
+4. reducing retained response bodies when rules only need parsed document state.
+
+## Decision
+
+Async is production-ready, but it is not a 2x speed feature against the existing
+threaded baseline. The production claim should be about scalability and
+executor choice. The measured 2x+ speedup comes from applying bounded
+parallelism to previously serial crawl phases, especially child sitemap
+expansion.
diff --git a/crawlscope.gemspec b/crawlscope.gemspec
@@ -12,7 +12,7 @@ Gem::Specification.new do |spec|
   spec.description = "A small Ruby gem for sitemap-driven SEO validation with structured issues, configurable rules and schema registries, optional browser rendering, and Rails rake task integration."
   spec.homepage = "https://www.ethos-link.com/opensource/crawlscope"
   spec.license = "MIT"
-  spec.required_ruby_version = ">= 3.2.0"
+  spec.required_ruby_version = ">= 3.3.0"
 
   repo = "https://github.com/ethos-link/crawlscope"
   branch = "main"
@@ -46,6 +46,8 @@ Gem::Specification.new do |spec|
   spec.require_paths = ["lib"]
 
   spec.add_dependency "concurrent-ruby", ">= 1.3"
+  spec.add_dependency "async", ">= 2.0"
+  spec.add_dependency "async-http-faraday", ">= 0.22"
   spec.add_dependency "faraday", ">= 2.0"
   spec.add_dependency "faraday-follow_redirects", ">= 0.3"
   spec.add_dependency "json-schema", ">= 5.0"

diff --git a/lib/crawlscope/cli.rb b/lib/crawlscope/cli.rb
@@ -134,6 +134,8 @@ def run_validate
 
       configure_renderer(resolved_renderer)
       @configuration.concurrency = resolved_concurrency
+      fetch_executor_configured = !normalized_string(ENV["FETCH_EXECUTOR"]).nil?
+      @configuration.fetch_executor = resolved_fetch_executor
       @configuration.network_idle_timeout_seconds = resolved_integer("NETWORK_IDLE_TIMEOUT", default: @configuration.network_idle_timeout_seconds, minimum: 1)
       @configuration.timeout_seconds = resolved_integer("TIMEOUT", default: @configuration.timeout_seconds, minimum: 1)
 
@@ -167,9 +169,15 @@ def run_validate
         opts.on("--concurrency COUNT", Integer, "Set crawl concurrency") do |value|
           @configuration.concurrency = integer_option(value, minimum: 1, name: "concurrency")
         end
+
+        opts.on("--fetch-executor NAME", "Use threaded or async fetch execution") do |value|
+          fetch_executor_configured = true
+          @configuration.fetch_executor = value
+        end
       end
 
       parser.parse!(@argv)
+      @configuration.fetch_executor = :threaded if @configuration.renderer == :browser && !fetch_executor_configured
 
       result = task.validate(
         base_url: options[:url],
@@ -221,6 +229,14 @@ def resolved_concurrency
       end
     end
 
+    def resolved_fetch_executor
+      configured_executor = normalized_string(ENV["FETCH_EXECUTOR"])
+      return configured_executor if configured_executor
+      return :threaded if @configuration.renderer == :browser
+
+      @configuration.fetch_executor
+    end
+
     def resolved_integer(name, default:, minimum:)
       raw_value = normalized_string(ENV[name])
       return default if raw_value.nil?

diff --git a/lib/crawlscope/configuration.rb b/lib/crawlscope/configuration.rb
@@ -7,10 +7,11 @@ class Configuration
     DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS = 5
     DEFAULT_BROWSER_SCROLL_PAGE = true
     DEFAULT_CONCURRENCY = 10
+    DEFAULT_FETCH_EXECUTOR = :async
     RENDERERS = %i[http browser].freeze
     DEFAULT_TIMEOUT_SECONDS = 20
 
-    attr_writer :allowed_statuses, :base_url, :browser_factory, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :rule_registry, :schema_registry, :scroll_page, :site_name, :sitemap_path, :timeout_seconds
+    attr_writer :allowed_statuses, :base_url, :browser_factory, :concurrency, :fetch_executor, :network_idle_timeout_seconds, :output, :renderer, :rule_registry, :schema_registry, :scroll_page, :site_name, :sitemap_path, :timeout_seconds
 
     def allowed_statuses
       value = resolve(@allowed_statuses)
@@ -30,6 +31,13 @@ def concurrency
       positive_integer(value, default: DEFAULT_CONCURRENCY, name: "concurrency")
     end
 
+    def fetch_executor
+      value = resolve(@fetch_executor)
+      default = (renderer == :browser) ? :threaded : DEFAULT_FETCH_EXECUTOR
+
+      FetchExecutor.normalize(value.nil? ? default : value)
+    end
+
     def browser_concurrency
       value = concurrency
       default_value = DEFAULT_BROWSER_CONCURRENCY
@@ -83,6 +91,7 @@ def audit(base_url: self.base_url, sitemap_path: self.sitemap_path, rule_names:
         sitemap_path: sitemap_path,
         browser_factory: browser_factory,
         concurrency: concurrency,
+        fetch_executor: fetch_executor,
         network_idle_timeout_seconds: network_idle_timeout_seconds,
         renderer: renderer,
         timeout_seconds: timeout_seconds,

diff --git a/lib/crawlscope/context.rb b/lib/crawlscope/context.rb
@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 
 module Crawlscope
-  Context = Data.define(:allowed_statuses, :base_url, :resolve_target, :schema_registry) do
+  Context = Data.define(:allowed_statuses, :base_url, :concurrency, :fetch_executor, :resolve_target, :resolve_targets, :schema_registry) do
     def fetch(name)
       public_send(name)
     end