From 57375cb12d7940cde3d19a53603274edd7cdf3c8 Mon Sep 17 00:00:00 2001 From: Paulo Fidalgo Date: Mon, 1 Jun 2026 22:15:06 +0100 Subject: [PATCH 1/3] feat: add bounded async crawl execution Introduce threaded and async fetch executors, route crawl, sitemap, link target, and selected rule work through bounded parallel execution, and document the Ruby 3.3 async transport contract. Also clarify validation reporting so warnings-only runs are reported as warnings, sampled in large groups, and do not fail the CLI exit status. Verification: bundle exec rake test; bundle exec rake standard; mise exec ruby@3.4.8 -- bundle exec rake test; mise exec ruby@3.4.8 -- bundle exec rake standard; bundle exec rake build; bundle exec rake crawlscope:validate URL=http://localhost:3000 --- README.md | 8 + Rakefile | 2 +- UPGRADE.md | 16 +- async-performance-assessment.md | 166 ++++++++ crawlscope.gemspec | 4 +- lib/crawlscope/cli.rb | 9 + lib/crawlscope/configuration.rb | 9 +- lib/crawlscope/context.rb | 2 +- lib/crawlscope/crawl.rb | 80 +++- lib/crawlscope/crawler.rb | 20 +- lib/crawlscope/fetch_executor.rb | 36 ++ lib/crawlscope/fetch_executor/async.rb | 32 ++ lib/crawlscope/fetch_executor/threaded.rb | 32 ++ lib/crawlscope/http.rb | 8 +- lib/crawlscope/reporter.rb | 26 +- lib/crawlscope/result.rb | 2 +- lib/crawlscope/rules/links.rb | 92 ++++- lib/crawlscope/rules/uniqueness.rb | 29 +- lib/crawlscope/sitemap.rb | 41 +- lib/tasks/crawlscope_tasks.rake | 2 +- perf-improvement.md | 365 ++++++++++++++++++ test/crawlscope/cli_test.rb | 19 +- test/crawlscope/configuration_test.rb | 14 + test/crawlscope/crawl_test.rb | 114 +++++- test/crawlscope/crawler_test.rb | 61 +++ test/crawlscope/links_rule_test.rb | 53 +++ test/crawlscope/reporter_test.rb | 86 ++++- test/crawlscope/result_test.rb | 35 ++ test/crawlscope/sitemap_test.rb | 52 +++ test/performance/async_fetch_benchmark.rb | 127 ++++++ test/performance/fetch_executor_matrix.rb | 162 ++++++++ .../sitemap_expansion_benchmark.rb | 121 ++++++ 32 files changed, 1744 insertions(+), 81 deletions(-) create mode 100644 async-performance-assessment.md create mode 100644 lib/crawlscope/fetch_executor.rb create mode 100644 lib/crawlscope/fetch_executor/async.rb create mode 100644 lib/crawlscope/fetch_executor/threaded.rb create mode 100644 perf-improvement.md create mode 100644 test/crawlscope/result_test.rb create mode 100644 test/performance/async_fetch_benchmark.rb create mode 100644 test/performance/fetch_executor_matrix.rb create mode 100644 test/performance/sitemap_expansion_benchmark.rb diff --git a/README.md b/README.md index e172560..c338f15 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,8 @@ The default rule set includes: ## Installation +Crawlscope requires Ruby 3.3 or newer. + Add this line to your application's Gemfile: ```ruby @@ -166,6 +168,7 @@ Available environment overrides: - `TIMEOUT=30` - `NETWORK_IDLE_TIMEOUT=10` - `CONCURRENCY=5` +- `FETCH_EXECUTOR=threaded` or `FETCH_EXECUTOR=async` Available tasks: @@ -196,6 +199,11 @@ bundle exec rake 'crawlscope:validate:ldjson[https://example.com/article]' Plain `rake` does not pass `--url` style flags to tasks. Use `URL=...` or the task-argument form above instead. +`FETCH_EXECUTOR=threaded` is the default and works with Crawlscope's required +runtime dependencies. `FETCH_EXECUTOR=async` uses Ruby's fiber scheduler and +Async::HTTP through Faraday, preserving the same `CONCURRENCY` bound. The CLI +accepts the same setting with `--fetch-executor async`. + `crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap. Without `URL`, it checks the configured base URL, falling back to `http://localhost:3000`. ### Structured Data URL Audit diff --git a/Rakefile b/Rakefile index 840ad74..7f3ae01 100644 --- a/Rakefile +++ b/Rakefile @@ -183,7 +183,7 @@ namespace :release do end namespace :crawlscope do - desc "Validate URLs with all default Crawlscope rules. Args: [url,sitemap,rules]. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY" + desc "Validate URLs with all default Crawlscope rules. Args: [url,sitemap,rules]. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY, FETCH_EXECUTOR" task :validate, [:url, :sitemap, :rules] do |_task, args| Crawlscope::RakeTasks.validate(url: args[:url], sitemap_path: args[:sitemap], rule_names: args[:rules]) end diff --git a/UPGRADE.md b/UPGRADE.md index 3a6cb49..bbc6f6a 100644 --- a/UPGRADE.md +++ b/UPGRADE.md @@ -4,4 +4,18 @@ Use this file for host-app migration notes when a release changes public contracts, required setup, component locals, generated assets, or runtime behavior. -No special upgrade notes have been published yet. +## Next Release + +### Ruby 3.3 is now required + +Crawlscope now depends on the current Async runtime for production async HTTP +fetching. Host applications must run Ruby 3.3 or newer before upgrading. + +Recommended migration: + +1. Upgrade the host application runtime to Ruby 3.3 or newer. +2. Run `bundle update crawlscope async async-http async-http-faraday`. +3. Keep the default `FETCH_EXECUTOR=threaded` for the first deploy if you want + a conservative rollout. +4. Enable async fetching with `FETCH_EXECUTOR=async` or + `--fetch-executor async` after the app is running on the new Ruby version. diff --git a/async-performance-assessment.md b/async-performance-assessment.md new file mode 100644 index 0000000..e38b1a7 --- /dev/null +++ b/async-performance-assessment.md @@ -0,0 +1,166 @@ +# Async Performance Assessment + +Date: 2026-06-01 + +## Conclusion + +Async HTTP is not at least 2x faster than Crawlscope's existing threaded fetch +baseline at the same concurrency. + +It is much faster than sequential fetching, but Crawlscope was already parallel +before this work. The meaningful comparison is therefore: + +- `fetch_executor: :threaded`, `CONCURRENCY=N` +- `fetch_executor: :async`, `CONCURRENCY=N` + +On that comparison, async is roughly 1.01x to 1.11x faster in most local +delayed-response scenarios, with one Ruby 3.4.8 short-delay/high-concurrency +case reaching 1.35x. It does not reach 2x. + +The additional production work did produce a 2x+ improvement in a different +place: child sitemap expansion now uses the bounded fetch executor instead of +walking sitemap indexes serially. In the local delayed sitemap benchmark, +bounded expansion is about 7.4x to 7.6x faster than sequential expansion on +Ruby 3.4.8 and Ruby 4.0.3. + +## Benchmark Setup + +Benchmarks use a local delayed HTTP server and full `Crawlscope::Crawl` runs, +not isolated HTTP calls. + +Measured scenarios: + +- direct sitemap pages, +- uncrawled internal link targets resolved by the links rule, +- 48 delayed pages or targets, +- 20ms and 80ms response delays, +- concurrency 8 and 16, +- median of 3 runs per executor. + +Scripts: + +- `test/performance/async_fetch_benchmark.rb` +- `test/performance/fetch_executor_matrix.rb` +- `test/performance/sitemap_expansion_benchmark.rb` + +## Results + +### Ruby 4.0.3 + +| Scenario | Threaded | Async | Async vs Threaded | +| --- | ---: | ---: | ---: | +| direct pages, 20ms, c8 | 0.159s | 0.147s | 1.08x | +| direct pages, 20ms, c16 | 0.092s | 0.088s | 1.04x | +| direct pages, 80ms, c8 | 0.528s | 0.521s | 1.01x | +| direct pages, 80ms, c16 | 0.280s | 0.275s | 1.02x | +| link targets, 20ms, c8 | 0.179s | 0.170s | 1.06x | +| link targets, 80ms, c8 | 0.616s | 0.601s | 1.02x | + +### Ruby 3.4.8 + +| Scenario | Threaded | Async | Async vs Threaded | +| --- | ---: | ---: | ---: | +| direct pages, 20ms, c8 | 0.163s | 0.159s | 1.03x | +| direct pages, 20ms, c16 | 0.110s | 0.082s | 1.35x | +| direct pages, 80ms, c8 | 0.524s | 0.518s | 1.01x | +| direct pages, 80ms, c16 | 0.278s | 0.271s | 1.02x | +| link targets, 20ms, c8 | 0.190s | 0.170s | 1.11x | +| link targets, 80ms, c8 | 0.610s | 0.607s | 1.01x | + +The earlier simple benchmark also showed the same pattern: + +| Runtime | Sequential threaded | Threaded c8 | Async c8 | +| --- | ---: | ---: | ---: | +| Ruby 4.0.3 | 2.141s | 0.284s | 0.305s | +| Ruby 3.4.8 | 2.133s | 0.272s | 0.328s | + +### Child Sitemap Expansion + +This benchmark measures a sitemap index with eight delayed child sitemaps. +The first row forces a sequential executor by setting concurrency to 1. The +bounded rows use the same crawl path with concurrency 8. + +| Runtime | Sequential | Threaded c8 | Async c8 | Best Speedup | +| --- | ---: | ---: | ---: | ---: | +| Ruby 4.0.3 | 2.123s | 0.280s | 0.285s | 7.58x | +| Ruby 3.4.8 | 2.100s | 0.284s | 0.276s | 7.61x | + +## Why Async Is Not 2x Faster + +The current threaded implementation is already near the latency lower bound. + +For 48 pages at 80ms delay and concurrency 16, the theoretical network floor is +roughly: + +```text +ceil(48 / 16) * 0.08s = 0.24s +``` + +Measured results: + +- threaded: 0.279s on Ruby 4.0.3 +- async: 0.267s on Ruby 4.0.3 + +That leaves only about 39ms of overhead for threaded execution to eliminate. +Async cannot produce a 2x improvement when the threaded baseline is already +within about 16% of the ideal network floor. + +The main reasons: + +1. Crawlscope already had bounded parallel fetching through + `Concurrent::FixedThreadPool`. +2. The benchmark is I/O latency dominated, not CPU or thread scheduling + dominated. +3. Faraday still adds request/response abstraction overhead in both modes. +4. Local HTTP/1.1 requests do not create a major multiplexing advantage for + async. +5. The async executor improves resource shape more than wall-clock time when + the thread pool is already sized correctly. + +## What Did Improve + +This work still improves the architecture and specific crawl paths: + +- async HTTP is now a real transport through `async-http-faraday`, +- fetch executor selection is explicit and documented, +- browser rendering is guarded from async misuse, +- output ordering remains stable, +- uncrawled link targets are resolved as a bounded batch instead of repeatedly + through the single-target path, +- child sitemap indexes are expanded through the same bounded executor, +- canonical target resolution reuses the link-target cache, +- per-page link extraction and uniqueness summaries can run in bounded parallel + when the selected executor supports it. + +The biggest speed improvement is not async versus threads. It is batching extra +network-bound and per-page rule work through the same bounded executor. In slow +child-sitemap or link-target cases, the batch shape turns what would otherwise +be a serial second wave into roughly `ceil(items / concurrency) * latency`. + +## Recommendation + +Keep the async implementation, but do not claim a 2x wall-clock speedup over +the current threaded executor. + +Position it as: + +- production-ready for HTTP crawling, +- useful for fiber-scheduler deployments, +- potentially better at higher concurrency with lower thread pressure, +- equivalent-to-slightly-faster than threaded for the measured local workloads. + +If the product goal is a reliable 2x improvement over current default threaded +crawls, the next performance work should not be "more async." It should target: + +1. persistent fetch/result caching across rule phases, +2. optional higher concurrency with per-host rate limits, +3. streaming per-page analysis so CPU work overlaps network waits, +4. reducing retained response bodies when rules only need parsed document state. + +## Decision + +Async is production-ready, but it is not a 2x speed feature against the existing +threaded baseline. The production claim should be about scalability and +executor choice. The measured 2x+ speedup comes from applying bounded +parallelism to previously serial crawl phases, especially child sitemap +expansion. diff --git a/crawlscope.gemspec b/crawlscope.gemspec index ccc80c5..5306e0c 100644 --- a/crawlscope.gemspec +++ b/crawlscope.gemspec @@ -12,7 +12,7 @@ Gem::Specification.new do |spec| spec.description = "A small Ruby gem for sitemap-driven SEO validation with structured issues, configurable rules and schema registries, optional browser rendering, and Rails rake task integration." spec.homepage = "https://www.ethos-link.com/opensource/crawlscope" spec.license = "MIT" - spec.required_ruby_version = ">= 3.2.0" + spec.required_ruby_version = ">= 3.3.0" repo = "https://github.com/ethos-link/crawlscope" branch = "main" @@ -46,6 +46,8 @@ Gem::Specification.new do |spec| spec.require_paths = ["lib"] spec.add_dependency "concurrent-ruby", ">= 1.3" + spec.add_dependency "async", ">= 2.0" + spec.add_dependency "async-http-faraday", ">= 0.22" spec.add_dependency "faraday", ">= 2.0" spec.add_dependency "faraday-follow_redirects", ">= 0.3" spec.add_dependency "json-schema", ">= 5.0" diff --git a/lib/crawlscope/cli.rb b/lib/crawlscope/cli.rb index 39f9ac3..dbd2ec4 100644 --- a/lib/crawlscope/cli.rb +++ b/lib/crawlscope/cli.rb @@ -134,6 +134,7 @@ def run_validate configure_renderer(resolved_renderer) @configuration.concurrency = resolved_concurrency + @configuration.fetch_executor = resolved_fetch_executor @configuration.network_idle_timeout_seconds = resolved_integer("NETWORK_IDLE_TIMEOUT", default: @configuration.network_idle_timeout_seconds, minimum: 1) @configuration.timeout_seconds = resolved_integer("TIMEOUT", default: @configuration.timeout_seconds, minimum: 1) @@ -167,6 +168,10 @@ def run_validate opts.on("--concurrency COUNT", Integer, "Set crawl concurrency") do |value| @configuration.concurrency = integer_option(value, minimum: 1, name: "concurrency") end + + opts.on("--fetch-executor NAME", "Use threaded or async fetch execution") do |value| + @configuration.fetch_executor = value + end end parser.parse!(@argv) @@ -221,6 +226,10 @@ def resolved_concurrency end end + def resolved_fetch_executor + normalized_string(ENV["FETCH_EXECUTOR"]) || @configuration.fetch_executor + end + def resolved_integer(name, default:, minimum:) raw_value = normalized_string(ENV[name]) return default if raw_value.nil? diff --git a/lib/crawlscope/configuration.rb b/lib/crawlscope/configuration.rb index dd6af3e..afb3e8b 100644 --- a/lib/crawlscope/configuration.rb +++ b/lib/crawlscope/configuration.rb @@ -7,10 +7,11 @@ class Configuration DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS = 5 DEFAULT_BROWSER_SCROLL_PAGE = true DEFAULT_CONCURRENCY = 10 + DEFAULT_FETCH_EXECUTOR = :threaded RENDERERS = %i[http browser].freeze DEFAULT_TIMEOUT_SECONDS = 20 - attr_writer :allowed_statuses, :base_url, :browser_factory, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :rule_registry, :schema_registry, :scroll_page, :site_name, :sitemap_path, :timeout_seconds + attr_writer :allowed_statuses, :base_url, :browser_factory, :concurrency, :fetch_executor, :network_idle_timeout_seconds, :output, :renderer, :rule_registry, :schema_registry, :scroll_page, :site_name, :sitemap_path, :timeout_seconds def allowed_statuses value = resolve(@allowed_statuses) @@ -30,6 +31,11 @@ def concurrency positive_integer(value, default: DEFAULT_CONCURRENCY, name: "concurrency") end + def fetch_executor + value = resolve(@fetch_executor) + FetchExecutor.normalize(value.nil? ? DEFAULT_FETCH_EXECUTOR : value) + end + def browser_concurrency value = concurrency default_value = DEFAULT_BROWSER_CONCURRENCY @@ -83,6 +89,7 @@ def audit(base_url: self.base_url, sitemap_path: self.sitemap_path, rule_names: sitemap_path: sitemap_path, browser_factory: browser_factory, concurrency: concurrency, + fetch_executor: fetch_executor, network_idle_timeout_seconds: network_idle_timeout_seconds, renderer: renderer, timeout_seconds: timeout_seconds, diff --git a/lib/crawlscope/context.rb b/lib/crawlscope/context.rb index 5335eca..1e9d2c4 100644 --- a/lib/crawlscope/context.rb +++ b/lib/crawlscope/context.rb @@ -1,7 +1,7 @@ # frozen_string_literal: true module Crawlscope - Context = Data.define(:allowed_statuses, :base_url, :resolve_target, :schema_registry) do + Context = Data.define(:allowed_statuses, :base_url, :concurrency, :fetch_executor, :resolve_target, :resolve_targets, :schema_registry) do def fetch(name) public_send(name) end diff --git a/lib/crawlscope/crawl.rb b/lib/crawlscope/crawl.rb index ad3af48..26fd1da 100644 --- a/lib/crawlscope/crawl.rb +++ b/lib/crawlscope/crawl.rb @@ -2,13 +2,14 @@ module Crawlscope class Crawl - def initialize(base_url:, sitemap_path:, rules:, schema_registry:, browser_factory: nil, concurrency: Configuration::DEFAULT_CONCURRENCY, network_idle_timeout_seconds: Configuration::DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS, renderer: :http, scroll_page: Configuration::DEFAULT_BROWSER_SCROLL_PAGE, timeout_seconds: Configuration::DEFAULT_TIMEOUT_SECONDS, allowed_statuses: Configuration::DEFAULT_ALLOWED_STATUSES) + def initialize(base_url:, sitemap_path:, rules:, schema_registry:, browser_factory: nil, concurrency: Configuration::DEFAULT_CONCURRENCY, fetch_executor: Configuration::DEFAULT_FETCH_EXECUTOR, network_idle_timeout_seconds: Configuration::DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS, renderer: :http, scroll_page: Configuration::DEFAULT_BROWSER_SCROLL_PAGE, timeout_seconds: Configuration::DEFAULT_TIMEOUT_SECONDS, allowed_statuses: Configuration::DEFAULT_ALLOWED_STATUSES) @base_url = base_url @sitemap_path = sitemap_path @rules = Array(rules) @schema_registry = schema_registry @browser_factory = browser_factory @concurrency = concurrency + @fetch_executor = fetch_executor @network_idle_timeout_seconds = network_idle_timeout_seconds @renderer = renderer.to_sym @scroll_page = scroll_page @@ -17,10 +18,12 @@ def initialize(base_url:, sitemap_path:, rules:, schema_registry:, browser_facto end def call + validate_fetch_executor! + urls = sitemap_urls @page_fetcher = page - pages = Crawler.new(page_fetcher: @page_fetcher, concurrency: @concurrency).call(urls) + pages = Crawler.new(page_fetcher: @page_fetcher, concurrency: @concurrency, fetch_executor: @fetch_executor).call(urls) issues = IssueCollection.new collect(pages, issues) @@ -41,7 +44,13 @@ def call private def sitemap_urls - urls = Sitemap.new(path: @sitemap_path).urls(base_url: @base_url) + urls = Sitemap.new( + path: @sitemap_path, + adapter: http_adapter, + concurrency: @concurrency, + fetch_executor: @fetch_executor, + timeout_seconds: @timeout_seconds + ).urls(base_url: @base_url) raise ValidationError, "No URLs found in sitemap: #{@sitemap_path}" if urls.empty? urls @@ -62,15 +71,31 @@ def page if @renderer == :browser (@browser_factory || method(:browser)).call else - Http.new(base_url: @base_url, timeout_seconds: @timeout_seconds) + Http.new(base_url: @base_url, timeout_seconds: @timeout_seconds, adapter: http_adapter) end end + def http_adapter + return unless FetchExecutor.normalize(@fetch_executor) == :async + + require "async/http/faraday" + :async_http + end + + def validate_fetch_executor! + return unless @renderer == :browser && FetchExecutor.normalize(@fetch_executor) == :async + + raise ConfigurationError, "Async fetch execution is only supported with http rendering" + end + def context Context.new( allowed_statuses: @allowed_statuses, base_url: @base_url, + concurrency: @concurrency, + fetch_executor: @fetch_executor, resolve_target: method(:resolve), + resolve_targets: method(:resolve_all), schema_registry: @schema_registry ) end @@ -93,11 +118,15 @@ def cache(pages) @targets = {} pages.each do |page| - @pages[page.normalized_url] = page unless page.normalized_url.to_s.empty? - @pages[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty? + cache_page(page) end end + def cache_page(page) + @pages[page.normalized_url] = page unless page.normalized_url.to_s.empty? + @pages[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty? + end + def scan(urls, pages, issues) @rules.each do |rule| rule.call(urls: urls, pages: pages, issues: issues, context: context) @@ -105,17 +134,40 @@ def scan(urls, pages, issues) end def resolve(target_url) - normalized_url = Url.normalize(target_url, base_url: @base_url) - return @targets[normalized_url] if @targets.key?(normalized_url) + resolve_all([target_url]).fetch(target_url) + end - @targets[normalized_url] = resolved_page(normalized_url) || fetched_page(normalized_url) + def resolve_all(target_urls) + normalized_by_url = Array(target_urls).to_h do |target_url| + [target_url, Url.normalize(target_url, base_url: @base_url)] + end + normalized_urls = normalized_by_url.values.compact.uniq + missing_urls = [] + + normalized_urls.each do |normalized_url| + next if @targets.key?(normalized_url) + + resolved = resolved_page(normalized_url) + if resolved + @targets[normalized_url] = resolved + else + missing_urls << normalized_url + end + end + + fetched_pages(missing_urls).each do |page| + normalized_url = Url.normalize(page.url, base_url: @base_url) + cache_page(page) + @targets[normalized_url] = resolution(page, normalized_url, crawled: false) + end + + normalized_by_url.to_h { |target_url, normalized_url| [target_url, @targets[normalized_url]] } end - def fetched_page(normalized_url) - page = @page_fetcher.fetch(normalized_url) - @pages[page.normalized_url] = page unless page.normalized_url.to_s.empty? - @pages[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty? - resolution(page, normalized_url, crawled: false) + def fetched_pages(normalized_urls) + return [] if normalized_urls.empty? + + Crawler.new(page_fetcher: @page_fetcher, concurrency: @concurrency, fetch_executor: @fetch_executor).call(normalized_urls) end def resolved_page(normalized_url) diff --git a/lib/crawlscope/crawler.rb b/lib/crawlscope/crawler.rb index 5da2007..967c308 100644 --- a/lib/crawlscope/crawler.rb +++ b/lib/crawlscope/crawler.rb @@ -1,28 +1,14 @@ # frozen_string_literal: true -require "concurrent" - module Crawlscope class Crawler - def initialize(page_fetcher:, concurrency:) + def initialize(page_fetcher:, concurrency:, fetch_executor: :threaded) @page_fetcher = page_fetcher - @concurrency = concurrency + @fetch_executor = FetchExecutor.build(name: fetch_executor, concurrency: concurrency) end def call(urls) - pages = Concurrent::Array.new - pool = Concurrent::FixedThreadPool.new(@concurrency) - - urls.each do |url| - pool.post do - pages << fetch(url) - end - end - - pool.shutdown - pool.wait_for_termination - - pages.to_a + @fetch_executor.call(urls) { |url| fetch(url) } end private diff --git a/lib/crawlscope/fetch_executor.rb b/lib/crawlscope/fetch_executor.rb new file mode 100644 index 0000000..50b2980 --- /dev/null +++ b/lib/crawlscope/fetch_executor.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: true + +module Crawlscope + module FetchExecutor + NAMES = %i[threaded async].freeze + + module_function + + def build(name:, concurrency:) + return name if name.respond_to?(:call) + + case normalized_name(name) + when :threaded + Threaded.new(concurrency: concurrency) + when :async + Async.new(concurrency: concurrency) + end + end + + def normalize(name) + return name if name.respond_to?(:call) + + normalized_name(name) + end + + def normalized_name(name) + normalized = name.to_s.strip + normalized = "threaded" if normalized.empty? + + value = normalized.to_sym + return value if NAMES.include?(value) + + raise ConfigurationError, "Crawlscope fetch_executor must be threaded or async" + end + end +end diff --git a/lib/crawlscope/fetch_executor/async.rb b/lib/crawlscope/fetch_executor/async.rb new file mode 100644 index 0000000..fc7cfdb --- /dev/null +++ b/lib/crawlscope/fetch_executor/async.rb @@ -0,0 +1,32 @@ +# frozen_string_literal: true + +require "async" +require "async/semaphore" + +module Crawlscope + module FetchExecutor + class Async + def initialize(concurrency:) + @concurrency = concurrency + end + + def call(urls) + indexed_urls = Array(urls).each_with_index.to_a + pages = Array.new(indexed_urls.size) + + Sync do |parent| + semaphore = ::Async::Semaphore.new(@concurrency) + tasks = indexed_urls.map do |url, index| + semaphore.async(parent: parent) do + pages[index] = yield(url) + end + end + + tasks.each(&:wait) + end + + pages + end + end + end +end diff --git a/lib/crawlscope/fetch_executor/threaded.rb b/lib/crawlscope/fetch_executor/threaded.rb new file mode 100644 index 0000000..e9410f8 --- /dev/null +++ b/lib/crawlscope/fetch_executor/threaded.rb @@ -0,0 +1,32 @@ +# frozen_string_literal: true + +require "concurrent" + +module Crawlscope + module FetchExecutor + class Threaded + def initialize(concurrency:) + @concurrency = concurrency + end + + def call(urls) + indexed_urls = Array(urls).each_with_index.to_a + pages = Array.new(indexed_urls.size) + mutex = Mutex.new + pool = Concurrent::FixedThreadPool.new(@concurrency) + + indexed_urls.each do |url, index| + pool.post do + page = yield(url) + mutex.synchronize { pages[index] = page } + end + end + + pool.shutdown + pool.wait_for_termination + + pages + end + end + end +end diff --git a/lib/crawlscope/http.rb b/lib/crawlscope/http.rb index c170770..d9eaed7 100644 --- a/lib/crawlscope/http.rb +++ b/lib/crawlscope/http.rb @@ -10,13 +10,18 @@ class Http MAX_REDIRECTS = 5 USER_AGENT = "Mozilla/5.0 (compatible; Crawlscope/1.0)" - def initialize(base_url:, timeout_seconds:) + def initialize(base_url:, timeout_seconds:, adapter: nil) @base_url = base_url @timeout_seconds = timeout_seconds + @adapter = adapter @connections_by_thread = Concurrent::Map.new end def close + @connections_by_thread.each_value do |connection| + connection.close if connection.respond_to?(:close) + end + @connections_by_thread.clear end @@ -65,6 +70,7 @@ def connection faraday.response :follow_redirects, limit: MAX_REDIRECTS faraday.options.timeout = @timeout_seconds faraday.options.open_timeout = @timeout_seconds + faraday.adapter @adapter if @adapter end end end diff --git a/lib/crawlscope/reporter.rb b/lib/crawlscope/reporter.rb index 9425c6f..4c8e1c0 100644 --- a/lib/crawlscope/reporter.rb +++ b/lib/crawlscope/reporter.rb @@ -4,6 +4,8 @@ module Crawlscope class Reporter + MAX_ISSUES_PER_GROUP = 20 + def initialize(io:) @io = io end @@ -15,13 +17,13 @@ def report(result) @io.puts("URLs: #{result.urls.size}") @io.puts("Pages: #{result.pages.size}") - if result.ok? + if result.issues.size.zero? @io.puts("Status: OK") return end - @io.puts("Status: FAILED") - @io.puts("Issues: #{result.issues.size} #{severity_summary(result.issues)}") + @io.puts("Status: #{status_for(result.issues)}") + @io.puts("Issues: #{result.issues.size} total (#{severity_summary(result.issues)})") @io.puts("") report_summary(result.issues) @@ -31,6 +33,18 @@ def report(result) private + def status_for(issues) + grouped = issues.by_severity + + if grouped.key?(:error) + "FAILED" + elsif grouped.key?(:warning) + "WARNINGS" + else + "NOTICES" + end + end + def severity_summary(issues) grouped = issues.by_severity return "" if grouped.empty? @@ -59,10 +73,12 @@ def report_issue_groups(issues, base_url:) .each do |(category, code), grouped_issues| @io.puts("#{category} / #{code}: #{grouped_issues.size}") - grouped_issues.each do |issue| + grouped_issues.first(MAX_ISSUES_PER_GROUP).each do |issue| @io.puts(" - #{compact_issue(issue, base_url: base_url)}") end + remaining_count = grouped_issues.size - MAX_ISSUES_PER_GROUP + @io.puts(" ... #{remaining_count} more") if remaining_count.positive? @io.puts("") end end @@ -127,7 +143,7 @@ def relative_url(url, base_url:) end def format_number(value) - return format("%.2f", value) if value.is_a?(Float) + return format("%.3f", value) if value.is_a?(Float) value.to_s end diff --git a/lib/crawlscope/result.rb b/lib/crawlscope/result.rb index eef888e..bf3f522 100644 --- a/lib/crawlscope/result.rb +++ b/lib/crawlscope/result.rb @@ -3,7 +3,7 @@ module Crawlscope Result = Data.define(:base_url, :sitemap_path, :urls, :pages, :issues) do def ok? - issues.none?(&:error?) && issues.none?(&:warning?) && issues.none?(&:notice?) + issues.none?(&:error?) end end end diff --git a/lib/crawlscope/rules/links.rb b/lib/crawlscope/rules/links.rb index 86cbe75..d725853 100644 --- a/lib/crawlscope/rules/links.rb +++ b/lib/crawlscope/rules/links.rb @@ -21,8 +21,13 @@ def initialize def call(urls:, pages:, issues:, context:) @allowed_statuses = context.fetch(:allowed_statuses) @base_url = context.fetch(:base_url) + @concurrency = context_value(context, :concurrency, default: 1) + @fetch_executor = context_value(context, :fetch_executor) @resolve_target = context.fetch(:resolve_target) + @resolve_targets = context.resolve_targets if context.respond_to?(:resolve_targets) + @resolve_targets ||= context[:resolve_targets] if context.respond_to?(:[]) @base_host = URI.parse(@base_url).host + @resolved_targets_by_url = {} links = extract_links(pages) validate_url_hygiene(urls, links, issues) @@ -42,7 +47,8 @@ def contextual_links(doc) end def extract_links(pages) - pages.select(&:html?).flat_map { |page| page_links(page) } + html_pages = pages.select(&:html?) + parallel_map(html_pages) { |page| page_links(page) }.flatten end def page_links(page) @@ -175,9 +181,12 @@ def report_unresolved_target(target_url, grouped_links, issues, resolution) def resolve_links(links, issues) resolved_links = [] + grouped_links_by_target = links.group_by { |link| link[:target_url] } + targets_by_url = resolve_targets(grouped_links_by_target.keys) + @resolved_targets_by_url.merge!(targets_by_url) - links.group_by { |link| link[:target_url] }.each do |target_url, grouped_links| - target = resolve_target(target_url) + grouped_links_by_target.each do |target_url, grouped_links| + target = target_for(target_url) if target.unresolved? report_unresolved_target(target_url, grouped_links, issues, target.resolution) @@ -221,6 +230,23 @@ def resolve_target(target_url) LinkTarget.new(target_url: target_url, resolution: resolution) end + def resolve_targets(target_urls) + if @resolve_targets + @resolve_targets.call(target_urls).to_h do |target_url, resolution| + [target_url, LinkTarget.new(target_url: target_url, resolution: resolution)] + end + else + target_urls.to_h { |target_url| [target_url, resolve_target(target_url)] } + end + end + + def target_for(target_url) + @resolved_targets_by_url.fetch(target_url) do + target = resolve_target(target_url) + @resolved_targets_by_url[target_url] = target + end + end + LinkTarget = Data.define(:target_url, :resolution) do def allowed?(statuses) statuses.include?(status) @@ -424,7 +450,7 @@ def validate_indexable_pages_missing_from_sitemap(urls, resolved_links, issues) next if reported_urls.include?(final_url) next unless crawlable_path?(link[:final_path]) - target = resolve_target(final_url) + target = target_for(final_url) next unless target.allowed?(@allowed_statuses) && target.html? next if target.noindex? @@ -506,16 +532,14 @@ def validate_canonical_targets(urls, pages, resolved_links, issues) return if sitemap_pages.size < 2 dofollow_counts_by_path = dofollow_counts_by_final_path(resolved_links) - - sitemap_pages.each do |page| - canonical_url = canonical_url_for(page) - next if canonical_url.nil? - - target_uri = URI.parse(canonical_url) - next if target_uri.host != @base_host - - canonical_path = Url.path(canonical_url) - if canonical_path && dofollow_counts_by_path[canonical_path].zero? + canonical_entries = canonical_entries_for(sitemap_pages) + @resolved_targets_by_url.merge!(resolve_targets(canonical_entries.map { |entry| entry.fetch(:canonical_url) })) + + canonical_entries.each do |entry| + page = entry.fetch(:page) + canonical_url = entry.fetch(:canonical_url) + canonical_path = entry.fetch(:canonical_path) + if canonical_path && !root_path?(canonical_path) && dofollow_counts_by_path[canonical_path].zero? issues.add( code: :canonical_no_internal_inlinks, severity: :warning, @@ -527,8 +551,24 @@ def validate_canonical_targets(urls, pages, resolved_links, issues) end validate_canonical_target_status(page, canonical_url, issues) + end + end + + def canonical_entries_for(pages) + pages.filter_map do |page| + canonical_url = canonical_url_for(page) + next if canonical_url.nil? + + target_uri = URI.parse(canonical_url) + next if target_uri.host != @base_host + + { + canonical_path: Url.path(canonical_url), + canonical_url: canonical_url, + page: page + } rescue URI::InvalidURIError - next + nil end end @@ -548,8 +588,12 @@ def canonical_url_for(page) Url.normalize(canonical, base_url: page.url) end + def root_path?(path) + path == "/" + end + def validate_canonical_target_status(page, canonical_url, issues) - target = resolve_target(canonical_url) + target = target_for(canonical_url) if target.unresolved? || target.ignored_error? return @@ -575,6 +619,22 @@ def validate_canonical_target_status(page, canonical_url, issues) ) end end + + def context_value(context, name, default: nil) + if context.respond_to?(name) + context.public_send(name) + elsif context.respond_to?(:key?) && context.key?(name) + context[name] + else + default + end + end + + def parallel_map(items, &block) + return items.map(&block) if items.size < 2 || @fetch_executor.nil? || @concurrency.to_i <= 1 + + FetchExecutor.build(name: @fetch_executor, concurrency: @concurrency).call(items, &block) + end end end end diff --git a/lib/crawlscope/rules/uniqueness.rb b/lib/crawlscope/rules/uniqueness.rb index 95519f9..baa43a6 100644 --- a/lib/crawlscope/rules/uniqueness.rb +++ b/lib/crawlscope/rules/uniqueness.rb @@ -26,11 +26,10 @@ def initialize( end def call(urls:, pages:, issues:, context:) - page_summaries = pages.filter_map do |page| - next unless page.html? + @concurrency = context_value(context, :concurrency, default: 1) + @fetch_executor = context_value(context, :fetch_executor) - summary_for(page) - end + page_summaries = summarize_pages(pages) validate_duplicates(page_summaries, issues) validate_near_duplicates(page_summaries, issues) @@ -66,6 +65,12 @@ def summary_for(page) } end + def summarize_pages(pages) + html_pages = pages.select(&:html?) + + parallel_map(html_pages) { |page| summary_for(page) } + end + def validate_duplicates(page_summaries, issues) duplicates_for(page_summaries, :title).each do |value, urls| issues.add( @@ -177,6 +182,22 @@ def shingle_similarity(left, right) intersection_size.to_f / smaller_set_size end + + def context_value(context, name, default: nil) + if context.respond_to?(name) + context.public_send(name) + elsif context.respond_to?(:key?) && context.key?(name) + context[name] + else + default + end + end + + def parallel_map(items, &block) + return items.map(&block) if items.size < 2 || @fetch_executor.nil? || @concurrency.to_i <= 1 + + FetchExecutor.build(name: @fetch_executor, concurrency: @concurrency).call(items, &block) + end end end end diff --git a/lib/crawlscope/sitemap.rb b/lib/crawlscope/sitemap.rb index b2b1eef..8ebfb33 100644 --- a/lib/crawlscope/sitemap.rb +++ b/lib/crawlscope/sitemap.rb @@ -9,20 +9,31 @@ module Crawlscope class Sitemap SITEMAP_NAMESPACE = {"xmlns" => "http://www.sitemaps.org/schemas/sitemap/0.9"}.freeze - def initialize(path:) + def initialize(path:, adapter: nil, concurrency: Configuration::DEFAULT_CONCURRENCY, fetch_executor: Configuration::DEFAULT_FETCH_EXECUTOR, timeout_seconds: Configuration::DEFAULT_TIMEOUT_SECONDS) @path = path + @adapter = adapter + @concurrency = concurrency + @fetch_executor = fetch_executor + @timeout_seconds = timeout_seconds end def urls(base_url:) - collect_urls(@path, base_url: base_url, visited: Set.new).uniq + collect_urls(@path, base_url: base_url, visited: Set.new, visited_mutex: Mutex.new).uniq end private - def collect_urls(source, base_url:, visited:) - return [] if visited.include?(source) + def collect_urls(source, base_url:, visited:, visited_mutex:) + already_visited = visited_mutex.synchronize do + if visited.include?(source) + true + else + visited.add(source) + false + end + end + return [] if already_visited - visited.add(source) document = Nokogiri::XML(read(source)) root_name = document.root&.name unless %w[sitemapindex urlset].include?(root_name) @@ -30,10 +41,13 @@ def collect_urls(source, base_url:, visited:) end if root_name == "sitemapindex" - document.xpath("//xmlns:sitemap/xmlns:loc", SITEMAP_NAMESPACE).flat_map do |node| - child_source = resolve_child_source(source, node.text.to_s.strip, base_url: base_url) - collect_urls(child_source, base_url: base_url, visited: visited) + child_sources = document.xpath("//xmlns:sitemap/xmlns:loc", SITEMAP_NAMESPACE).map do |node| + resolve_child_source(source, node.text.to_s.strip, base_url: base_url) end + + fetch_executor.call(child_sources) do |child_source| + collect_urls(child_source, base_url: base_url, visited: visited, visited_mutex: visited_mutex) + end.flatten else document.xpath("//xmlns:url/xmlns:loc", SITEMAP_NAMESPACE).map do |node| Url.normalize_for_base(node.text.to_s.strip, base_url: base_url) @@ -77,11 +91,16 @@ def local_child_path(parent_source, child_loc) end def connection - @connection ||= Faraday.new do |faraday| + Faraday.new do |faraday| faraday.response :follow_redirects, limit: Http::MAX_REDIRECTS - faraday.options.timeout = 20 - faraday.options.open_timeout = 20 + faraday.options.timeout = @timeout_seconds + faraday.options.open_timeout = @timeout_seconds + faraday.adapter @adapter if @adapter end end + + def fetch_executor + @fetch_executor_instance ||= FetchExecutor.build(name: @fetch_executor, concurrency: @concurrency) + end end end diff --git a/lib/tasks/crawlscope_tasks.rake b/lib/tasks/crawlscope_tasks.rake index 80ad80c..ae12dd1 100644 --- a/lib/tasks/crawlscope_tasks.rake +++ b/lib/tasks/crawlscope_tasks.rake @@ -1,5 +1,5 @@ namespace :crawlscope do - desc "Validate URLs with all default Crawlscope rules. Args: [url,sitemap,rules]. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY" + desc "Validate URLs with all default Crawlscope rules. Args: [url,sitemap,rules]. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY, FETCH_EXECUTOR" task :validate, [:url, :sitemap, :rules] => :environment do |_task, args| Crawlscope::RakeTasks.validate(url: args[:url], sitemap_path: args[:sitemap], rule_names: args[:rules]) end diff --git a/perf-improvement.md b/perf-improvement.md new file mode 100644 index 0000000..6b7dd0d --- /dev/null +++ b/perf-improvement.md @@ -0,0 +1,365 @@ +# Crawlscope Performance Improvement Plans + +Date: 2026-06-01 + +## Executive Summary + +Crawlscope is already partially parallel: `Crawlscope::Crawler` uses a +`Concurrent::FixedThreadPool` to fetch sitemap URLs, and HTTP connections are +cached per worker thread. The remaining time growth is likely coming from three +places: + +1. Sitemap expansion is recursive and sequential. +2. Rule execution is sequential after the crawl completes. +3. The links rule can trigger additional target resolution fetches after the + main crawl, also through a sequential rule path. + +The best next step is not "parallelize everything." The best next step is a +bounded async fetch layer that unifies page fetches, child sitemap fetches, and +link target resolution behind one concurrency budget, while keeping analysis +mostly deterministic and sequential until rules are explicitly classified as +parallel-safe. + +Recommendation: implement Plan 1 first, then adopt the safe parts of Plan 2. +Defer Plan 3 until benchmarks prove the graph-level complexity is needed. + +## Implementation Update + +Plan 1 has been implemented with the deliberate Ruby `>= 3.3` contract change. +The default executor remains threaded, and `fetch_executor: :async` is now +available for HTTP crawls through `async-http-faraday`. + +The safe follow-on parallelization work is also implemented: + +- child sitemap indexes expand through the bounded fetch executor, +- extra link target fetches are resolved in bounded batches, +- canonical target checks reuse the link target cache, +- link extraction and uniqueness summaries use bounded parallel maps. + +Measured outcome: async is production-ready, but async alone is not 2x faster +than the existing threaded executor. The 2x+ improvement comes from applying +bounded parallelism to previously serial phases, especially child sitemap +expansion. See `async-performance-assessment.md` for current benchmark numbers. + +## Current Implementation Evidence + +- `lib/crawlscope/crawler.rb` delegates URL scheduling to + `Crawlscope::FetchExecutor`. +- `lib/crawlscope/http.rb` uses Faraday and can run through the async HTTP + adapter for `fetch_executor: :async`. +- `lib/crawlscope/crawl.rb` currently runs in this order: read sitemap URLs, + crawl all pages, collect fetch errors, cache pages, then scan rules. +- `lib/crawlscope/crawl.rb` calls each rule sequentially in `scan`. +- `lib/crawlscope/sitemap.rb` recursively expands child sitemaps in sequence. +- `lib/crawlscope/rules/links.rb` groups links and resolves unique internal + targets through the crawl context, which can batch-fetch uncrawled target + pages. +- `lib/crawlscope/issue_collection.rb` is a simple collection around an Array, + suitable for current sequential rule execution but not safe as a shared sink + for parallel rule writers without synchronization or per-task collections. +- The gemspec currently requires Ruby `>= 3.3.0`. + +## Async Ecosystem Notes + +Async is now a mature fiber-scheduler stack, not a speculative experiment. Its +current docs describe it as a composable asynchronous I/O framework based on +`io-event`, with lightweight fiber concurrency and multi-thread/process +containers available when actual parallelism is needed. + +Relevant current constraints: + +- `async` 2.39.0 was released on April 5, 2026 and requires Ruby `>= 3.3`. +- `async-http` 0.95.1 was released on May 6, 2026 and requires Ruby `>= 3.3`. +- `async-http-faraday` 0.22.2 was released on April 10, 2026 and requires Ruby + `>= 3.3`. +- Current Crawlscope now supports Ruby `>= 3.3.0`, so adding the latest Async + gems as runtime dependencies is an explicit public compatibility change. + +Implication: async work should either: + +- bump Crawlscope's Ruby requirement to `>= 3.3` in a planned contract change, + with `UPGRADE.md` guidance; or +- keep the current threaded Faraday path as the default while exposing async as + an opt-in executor. + +## Plan 1: Bounded Async Fetch Layer + +Replace the current fetch executor with a small fetch orchestration abstraction: + +- Keep `Crawlscope::Crawler.call(urls)` as the public boundary. +- Add a `FetchExecutor` interface with threaded and async implementations. +- Use the threaded implementation as the default. +- Add an async implementation using `async` and either `async-http` directly or + `async-http-faraday`. +- Apply one shared concurrency budget to: + - sitemap URL page fetches, + - child sitemap fetches, + - link target resolution fetches. +- Deduplicate normalized URLs before scheduling fetches. +- Preserve deterministic output by sorting returned pages back to input URL + order, or by sorting by normalized URL before rule analysis. +- Keep rule execution sequential for now. + +### Why This Fits The Current Code + +The existing code already has a natural page-fetch boundary: +`Crawler.new(page_fetcher:, concurrency:).call(urls)`. The HTTP fetcher already +encapsulates transport details. The crawl context already resolves extra link +targets through one `resolve_target` callable. That means the highest-leverage +change is to make all URL I/O share a smarter scheduler, not to rewrite every +rule. + +### Expected Measurable Effect + +- Wall-clock time should improve most on I/O-heavy crawls with many URLs, + redirects, missing pages, or internal links not already in the sitemap. +- CPU-heavy rule time will not improve much. +- Memory usage should stay bounded because one concurrency budget controls all + network fan-out. +- Determinism can remain high if page ordering and issue ordering are + normalized before reporting. + +### Reductio Ad Absurdum + +Assume this plan is pushed to the extreme: every discovered URL and link target +is scheduled immediately with unbounded async tasks. The result is faster only +on a toy site. On a real site it creates too many open sockets, hits rate +limits, destabilizes browser mode, and makes failures look nondeterministic. + +Therefore the core of the plan cannot be "async means unlimited." It must be +"async with explicit global and per-host bounds." If that constraint is kept, +the contradiction disappears and the plan remains valid. + +### Implementation Steps + +1. Add a benchmark fixture with delayed sitemap pages and delayed link targets. +2. Extract current thread-pool logic into `Crawlscope::FetchExecutor::Threaded`. +3. Add a shared URL cache keyed by normalized requested URL and normalized final + URL. +4. Route `Crawler` and `Crawl#resolve` through the shared executor/cache. +5. Add optional `Crawlscope::FetchExecutor::Async` behind a feature flag or + renderer/transport setting. +6. Keep issue collection and rule execution unchanged. +7. Document Ruby 3.3 requirements for the async transport and release + contract. + +## Plan 2: Split Crawl And Analyze Pipeline + +Turn the current "crawl everything, then analyze everything" flow into a +two-phase pipeline: + +- Fetch pages concurrently. +- As each page finishes, run per-page rules immediately: + - indexability, + - metadata, + - structured data, + - content quality checks that inspect only one page. +- Store per-page issues in page-local collections. +- After all pages finish, run aggregate rules: + - uniqueness, + - link graph checks, + - orphan/internal inlink checks. +- Merge issue collections in deterministic order. + +### Why This Fits The Current Code + +Several rules already accept the same broad signature: + +```ruby +rule.call(urls: urls, pages: pages, issues: issues, context: context) +``` + +That interface is convenient, but it hides whether a rule is per-page or +aggregate. Splitting rules by capability would let Crawlscope overlap useful +analysis with network wait time while keeping global checks accurate. + +### Expected Measurable Effect + +- Users get earlier partial results in future streaming/reporting modes. +- Wall-clock time improves when per-page parsing/checking is non-trivial and + network latency leaves idle CPU time. +- Total runtime may improve less than Plan 1 if most time is still network. +- Implementation risk is moderate because rule contracts need classification. + +### Reductio Ad Absurdum + +Assume every rule is treated as page-local and run as soon as each page arrives. +Uniqueness becomes wrong because it needs all pages. Link graph checks become +wrong because inbound counts need all resolved links. Orphan-page detection can +report false positives before all links have been seen. + +Therefore the plan is only valid if Crawlscope makes rule phase explicit: +`per_page`, `post_crawl`, or `graph`. Without that classification, faster +analysis produces faster wrong answers. + +### Implementation Steps + +1. Add rule metadata, for example `phase: :per_page` or `phase: :aggregate`. +2. Keep existing rules aggregate by default. +3. Move only obviously local rules into `:per_page` after focused tests. +4. Change `IssueCollection` usage to per-task collections, then merge in stable + order. +5. Add tests proving aggregate rules still see the complete page set. +6. Add timing instrumentation for crawl, per-page analysis, and aggregate + analysis. + +## Plan 3: Unified Async Crawl Graph + +Model the crawl as a bounded async graph rather than a linear crawl: + +- Fetch the root sitemap. +- Fetch child sitemaps concurrently. +- Schedule page fetches as URLs are discovered. +- Extract links as pages arrive. +- Schedule uncrawled internal link targets through the same bounded queue. +- Run page-local checks as soon as a page arrives. +- Run graph/global checks after the queue drains. +- Maintain one canonical `PageStore` and `TargetResolutionStore`. + +### Why This Fits The Problem + +The slowest large crawls are not just "N sitemap URLs." They are closer to a +graph: + +- a root sitemap points to child sitemaps, +- pages point to internal targets, +- redirects create final URLs, +- rules need both requested and final URL identity. + +A graph executor can remove duplicate work and prevent the current shape where +extra link-target fetches happen later as a second network wave. + +### Expected Measurable Effect + +- Best theoretical wall-clock reduction for large sites with many child + sitemaps and link targets. +- Better dedupe because sitemap URLs, final URLs, and link targets share one + store. +- More complex failure semantics: cancellation, retries, partial graph state, + and stable issue ordering must be designed up front. + +### Reductio Ad Absurdum + +Assume the crawl graph eagerly follows every internal link, every redirect, and +every discovered target until no new URL remains. Crawlscope stops being a +sitemap validation tool and becomes a general web crawler. Runtime becomes +unbounded, robots/rate-limit behavior becomes harder to reason about, and the +reported URL set drifts away from the sitemap contract. + +Therefore the graph must be constrained by Crawlscope's product boundary: +sitemap URLs are the primary audit set; extra target resolution exists only to +validate links from those pages. The graph executor is valid only if it refuses +to become an unlimited site crawler. + +### Implementation Steps + +1. Define graph scope formally: + - audit URLs from sitemap, + - child sitemap URLs, + - internal link targets from audited pages, + - no recursive full-site expansion unless explicitly configured. +2. Add `PageStore` and `ResolutionStore` with stable URL normalization. +3. Add a bounded async work queue with global and per-host limits. +4. Add cancellation and timeout policy at the graph level. +5. Add stable result materialization before calling aggregate rules. +6. Add benchmark coverage for: + - many child sitemaps, + - many repeated internal links, + - redirects, + - broken targets, + - slow responses. + +## Ratings + +Scores are 1-5, where 5 is best. + +| Aspect | Plan 1: Bounded Async Fetch Layer | Plan 2: Split Crawl/Analyze | Plan 3: Unified Async Crawl Graph | +| --- | ---: | ---: | ---: | +| Expected wall-clock improvement | 4 | 3 | 5 | +| Time to implement | 4 | 3 | 1 | +| Correctness risk | 4 | 3 | 2 | +| Deterministic reporting | 4 | 3 | 2 | +| Public API stability | 4 | 3 | 2 | +| Ruby 3.3 compatibility path | 4 | 5 | 3 | +| Async ecosystem leverage | 4 | 2 | 5 | +| Memory/socket boundability | 4 | 4 | 3 | +| Testability | 4 | 3 | 2 | +| Observability potential | 4 | 5 | 5 | +| Overall score | 40 | 34 | 30 | + +## Recommendation + +Implement Plan 1 first. + +It attacks the most likely bottleneck, keeps the current public shape intact, +and can be shipped in small steps. It also creates the infrastructure needed by +the other two plans: a shared fetch executor, URL dedupe, timing metrics, and +transport abstraction. + +Use this sequence: + +1. Add timing instrumentation and benchmark fixtures before changing behavior. +2. Refactor the existing threaded fetcher into an explicit executor. +3. Share the executor/cache between initial sitemap page fetches and link + target resolution. +4. Add async transport with the deliberate Ruby `>= 3.3` contract change + documented in `UPGRADE.md`. +5. After the fetch layer is measurable, implement the safe subset of Plan 2: + classify rules by phase and move only clearly per-page rules into overlapped + analysis. + +Do not start with Plan 3. It is attractive architecturally, but it creates too +many new semantics before we have measurements. The reductio failure mode is +also severe: Crawlscope could quietly become an unbounded crawler instead of a +sitemap-driven validator. + +## Metrics To Track + +Add these measurements before and after each step: + +- total runtime, +- sitemap expansion time, +- audited page fetch time, +- link target resolution time, +- per-rule analysis time, +- total requested URLs, +- unique fetched URLs, +- duplicate suppressed URLs, +- max active fetches, +- max active fetches per host, +- socket/open connection count if available, +- issue count by code, +- output ordering stability across repeated runs. + +## Acceptance Criteria + +Plan 1 is successful when: + +- a delayed-response benchmark shows reduced wall-clock time at the same issue + count, +- output is stable across repeated runs, +- `CONCURRENCY=1` preserves current sequential semantics, +- `CONCURRENCY=N` never schedules more than N active network fetches globally, +- link target resolution reuses already fetched pages, +- Ruby 3.3+ users have a working threaded default and an opt-in async executor. + +## Sources + +- Local implementation: `lib/crawlscope/crawler.rb`, + `lib/crawlscope/crawl.rb`, `lib/crawlscope/http.rb`, + `lib/crawlscope/sitemap.rb`, `lib/crawlscope/rules/links.rb`, + `lib/crawlscope/issue_collection.rb`, `crawlscope.gemspec`. +- Async docs: https://socketry.github.io/async/ +- Async getting started: https://socketry.github.io/async/guides/getting-started/ +- Async scheduler docs: + https://socketry.github.io/async/source/Async/Scheduler/index.html +- Async::HTTP getting started: + https://socketry.github.io/async-http/guides/getting-started/index +- Async::HTTP::Faraday getting started: + https://socketry.github.io/async-http-faraday/guides/getting-started/index.html +- RubyGems async 2.39.0: + https://rubygems.org/gems/async/versions/2.39.0 +- RubyGems async-http 0.95.1: + https://rubygems.org/gems/async-http +- RubyGems async-http-faraday 0.22.2: + https://rubygems.org/gems/async-http-faraday/versions/0.22.2 diff --git a/test/crawlscope/cli_test.rb b/test/crawlscope/cli_test.rb index b520755..a91a92a 100644 --- a/test/crawlscope/cli_test.rb +++ b/test/crawlscope/cli_test.rb @@ -4,11 +4,12 @@ class CrawlscopeCliTest < Minitest::Test class FakeConfiguration - attr_accessor :base_url, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds + attr_accessor :base_url, :concurrency, :fetch_executor, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds def initialize @base_url = nil @concurrency = 10 + @fetch_executor = :threaded @network_idle_timeout_seconds = 5 @renderer = :http @timeout_seconds = 20 @@ -95,7 +96,7 @@ def test_validate_passes_arguments_to_task err = StringIO.new status = Crawlscope::Cli.start( - ["validate", "--url", "https://example.com", "--sitemap", "https://example.com/sitemap-pages.xml", "--rules", "metadata,links", "--renderer", "browser", "--timeout", "30", "--network-idle-timeout", "9", "--concurrency", "3"], + ["validate", "--url", "https://example.com", "--sitemap", "https://example.com/sitemap-pages.xml", "--rules", "metadata,links", "--renderer", "browser", "--timeout", "30", "--network-idle-timeout", "9", "--concurrency", "3", "--fetch-executor", "async"], out: out, err: err, configuration: configuration, @@ -115,10 +116,24 @@ def test_validate_passes_arguments_to_task assert_equal 30, configuration.timeout_seconds assert_equal 9, configuration.network_idle_timeout_seconds assert_equal 3, configuration.concurrency + assert_equal "async", configuration.fetch_executor assert_same out, configuration.output assert_empty err.string end + def test_validate_reads_fetch_executor_from_environment + configuration = FakeConfiguration.new + task = FakeTask.new + + with_env("FETCH_EXECUTOR" => "async") do + status = Crawlscope::Cli.start(["validate", "--url", "https://example.com"], out: StringIO.new, err: StringIO.new, configuration: configuration, task: task) + + assert_equal 0, status + end + + assert_equal "async", configuration.fetch_executor + end + def test_ldjson_reads_urls_from_environment configuration = FakeConfiguration.new task = FakeTask.new diff --git a/test/crawlscope/configuration_test.rb b/test/crawlscope/configuration_test.rb index 31ac7ed..2b5efa2 100644 --- a/test/crawlscope/configuration_test.rb +++ b/test/crawlscope/configuration_test.rb @@ -13,6 +13,7 @@ def test_audit_builds_from_configured_callables config.sitemap_path = -> { "/tmp/sitemap.xml" } config.site_name = -> { "Example" } config.concurrency = -> { 4 } + config.fetch_executor = -> { :threaded } end audit = Crawlscope.configuration.audit @@ -20,6 +21,7 @@ def test_audit_builds_from_configured_callables assert_equal "https://example.com", audit.instance_variable_get(:@base_url) assert_equal "/tmp/sitemap.xml", audit.instance_variable_get(:@sitemap_path) assert_equal 4, audit.instance_variable_get(:@concurrency) + assert_equal :threaded, audit.instance_variable_get(:@fetch_executor) assert_equal %i[ indexability metadata @@ -55,6 +57,7 @@ def test_defaults_are_normalized assert_equal [200, 301, 302], config.allowed_statuses assert_equal 10, config.concurrency + assert_equal :threaded, config.fetch_executor assert_equal 4, config.browser_concurrency assert_equal 5, config.network_idle_timeout_seconds assert_equal :http, config.renderer @@ -67,6 +70,7 @@ def test_configured_values_are_normalized config = Crawlscope::Configuration.new config.allowed_statuses = ["200", "404"] config.concurrency = "2" + config.fetch_executor = "async" config.network_idle_timeout_seconds = "7" config.renderer = "browser" config.timeout_seconds = "9" @@ -74,6 +78,7 @@ def test_configured_values_are_normalized assert_equal [200, 404], config.allowed_statuses assert_equal 2, config.concurrency + assert_equal :async, config.fetch_executor assert_equal 2, config.browser_concurrency assert_equal 7, config.network_idle_timeout_seconds assert_equal :browser, config.renderer @@ -90,6 +95,15 @@ def test_renderer_must_be_supported assert_equal "Crawlscope renderer must be http or browser", error.message end + def test_fetch_executor_must_be_supported + config = Crawlscope::Configuration.new + config.fetch_executor = "processes" + + error = assert_raises(Crawlscope::ConfigurationError) { config.fetch_executor } + + assert_equal "Crawlscope fetch_executor must be threaded or async", error.message + end + def test_numeric_values_must_be_positive_integers config = Crawlscope::Configuration.new config.concurrency = "0" diff --git a/test/crawlscope/crawl_test.rb b/test/crawlscope/crawl_test.rb index 6940107..8b94255 100644 --- a/test/crawlscope/crawl_test.rb +++ b/test/crawlscope/crawl_test.rb @@ -3,6 +3,36 @@ require "test_helper" class CrawlscopeCrawlTest < Minitest::Test + class RecordingExecutor + attr_reader :batches + + def initialize + @batches = [] + end + + def call(urls) + @batches << urls + urls.map { |url| yield(url) } + end + end + + class PageMapFetcher + attr_reader :closed + + def initialize(pages) + @pages = pages + @closed = false + end + + def close + @closed = true + end + + def fetch(url) + @pages.fetch(url) + end + end + def setup @tmp_dir = Dir.mktmpdir @sitemap_path = File.join(@tmp_dir, "sitemap.xml") @@ -100,7 +130,7 @@ def test_collects_metadata_issues_for_invalid_page schema_registry: Crawlscope::SchemaRegistry.default ).call - refute result.ok? + assert result.ok? assert_equal %i[ incomplete_open_graph_tags meta_description_too_long @@ -189,6 +219,31 @@ def fetch(url) assert fake_browser.closed end + def test_async_executor_requires_http_renderer + File.write( + @sitemap_path, + <<~XML + + + https://example.com/pricing + + XML + ) + + error = assert_raises(Crawlscope::ConfigurationError) do + Crawlscope::Crawl.new( + base_url: "https://example.com", + sitemap_path: @sitemap_path, + rules: [], + schema_registry: Crawlscope::SchemaRegistry.default, + renderer: :browser, + fetch_executor: :async + ).call + end + + assert_equal "Async fetch execution is only supported with http rendering", error.message + end + def test_reports_sitemap_redirect_url File.write( @sitemap_path, @@ -214,4 +269,61 @@ def test_reports_sitemap_redirect_url assert_includes result.issues.to_a.map(&:code), :sitemap_redirect_url end + + def test_resolves_uncrawled_link_targets_as_a_bounded_batch + File.write( + @sitemap_path, + <<~XML + + + https://example.com/guide + + XML + ) + + executor = RecordingExecutor.new + fetcher = PageMapFetcher.new( + "https://example.com/guide" => page( + "https://example.com/guide", + "
OneTwo
" + ), + "https://example.com/one" => page("https://example.com/one", "
One
"), + "https://example.com/two" => page("https://example.com/two", "
Two
") + ) + + Crawlscope::Crawl.new( + base_url: "https://example.com", + sitemap_path: @sitemap_path, + rules: [Crawlscope::Rules::Links.new], + schema_registry: Crawlscope::SchemaRegistry.default, + renderer: :browser, + browser_factory: -> { fetcher }, + fetch_executor: executor, + concurrency: 2 + ).call + + assert_equal( + [ + ["https://example.com/guide"], + ["https://example.com/one", "https://example.com/two"] + ], + executor.batches + ) + assert fetcher.closed + end + + private + + def page(url, body) + Crawlscope::Page.new( + url: url, + normalized_url: url, + final_url: url, + normalized_final_url: url, + status: 200, + headers: {"content-type" => "text/html"}, + body: body, + doc: Nokogiri::HTML(body) + ) + end end diff --git a/test/crawlscope/crawler_test.rb b/test/crawlscope/crawler_test.rb index cce4573..fc7b774 100644 --- a/test/crawlscope/crawler_test.rb +++ b/test/crawlscope/crawler_test.rb @@ -31,4 +31,65 @@ def test_returns_error_page_when_fetcher_raises assert_nil error_page.status assert_equal "Timeout::Error: fetch timed out", error_page.error end + + def test_preserves_input_order + pages = Crawlscope::Crawler.new(page_fetcher: RaisingFetcher.new, concurrency: 2).call( + ["https://example.com/one", "https://example.com/two", "https://example.com/three"] + ) + + assert_equal( + ["https://example.com/one", "https://example.com/two", "https://example.com/three"], + pages.map(&:url) + ) + end + + class AsyncFetcher + attr_reader :active_fetches + + def initialize + @active_fetches = 0 + @max_active_fetches = 0 + @mutex = Mutex.new + end + + def fetch(url) + @mutex.synchronize do + @active_fetches += 1 + @max_active_fetches = [@max_active_fetches, @active_fetches].max + end + + Async::Task.current.sleep(0.01) + + Crawlscope::Page.new( + url: url, + normalized_url: url, + final_url: url, + normalized_final_url: url, + status: 200, + headers: {}, + body: "", + doc: Nokogiri::HTML("") + ) + ensure + @mutex.synchronize { @active_fetches -= 1 } + end + + def max_active_fetches + @mutex.synchronize { @max_active_fetches } + end + end + + def test_async_executor_respects_concurrency_and_preserves_order + fetcher = AsyncFetcher.new + + pages = Crawlscope::Crawler.new(page_fetcher: fetcher, concurrency: 2, fetch_executor: :async).call( + ["https://example.com/one", "https://example.com/two", "https://example.com/three"] + ) + + assert_equal( + ["https://example.com/one", "https://example.com/two", "https://example.com/three"], + pages.map(&:url) + ) + assert_operator fetcher.max_active_fetches, :<=, 2 + end end diff --git a/test/crawlscope/links_rule_test.rb b/test/crawlscope/links_rule_test.rb index 78119ac..c779e72 100644 --- a/test/crawlscope/links_rule_test.rb +++ b/test/crawlscope/links_rule_test.rb @@ -218,6 +218,33 @@ def test_reports_canonical_target_link_issues assert_includes codes, :canonical_points_to_redirect end + def test_does_not_report_missing_inlinks_for_root_canonical + issues = Crawlscope::IssueCollection.new + resolver = lambda do |target_url| + {crawled: true, error: nil, final_url: target_url, html: true, status: 200} + end + + Crawlscope::Rules::Links.new.call( + urls: ["https://example.com/", "https://example.com/about"], + pages: [ + page( + url: "https://example.com/", + body: <<~HTML + + +
About
+ + HTML + ), + page(url: "https://example.com/about", body: "
Home
") + ], + issues: issues, + context: context(resolver: resolver) + ) + + refute_includes issues.to_a.map(&:code), :canonical_no_internal_inlinks + end + def test_reports_indexable_internal_pages_missing_from_sitemap issues = Crawlscope::IssueCollection.new resolver = lambda do |target_url| @@ -348,6 +375,32 @@ def test_reports_internal_links_that_redirect assert_includes redirect_issue.message, "https://example.com/pricing" end + def test_reuses_link_target_resolution_for_later_link_checks + issues = Crawlscope::IssueCollection.new + resolution_counts = Hash.new(0) + resolver = lambda do |target_url| + resolution_counts[target_url] += 1 + { + crawled: false, + doc: Nokogiri::HTML("
Hidden
"), + error: nil, + final_url: target_url, + headers: {}, + html: true, + status: 200 + } + end + + Crawlscope::Rules::Links.new.call( + urls: ["https://example.com/guide"], + pages: [page(url: "https://example.com/guide", body: "
Hidden
")], + issues: issues, + context: context(resolver: resolver) + ) + + assert_equal 1, resolution_counts.fetch("https://example.com/hidden") + end + def test_ignores_links_that_should_not_be_crawled issues = Crawlscope::IssueCollection.new diff --git a/test/crawlscope/reporter_test.rb b/test/crawlscope/reporter_test.rb index 6a5c3d1..1d7d620 100644 --- a/test/crawlscope/reporter_test.rb +++ b/test/crawlscope/reporter_test.rb @@ -23,7 +23,7 @@ def test_reports_ok_result refute_includes output, "Status: FAILED" end - def test_reports_failed_result_with_grouped_one_line_issues + def test_reports_warning_result_with_grouped_one_line_issues io = StringIO.new issues = Crawlscope::IssueCollection.new 4.times do |index| @@ -54,8 +54,9 @@ def test_reports_failed_result_with_grouped_one_line_issues output = io.string - assert_includes output, "Status: FAILED" - assert_includes output, "Issues: 5 5 warnings" + assert_includes output, "Status: WARNINGS" + refute_includes output, "Status: FAILED" + assert_includes output, "Issues: 5 total (5 warnings)" assert_includes output, "Summary:" assert_includes output, "links / low_dofollow_inlinks: 4" assert_includes output, " - /page-1 inbound 1/2 sources: /source-1" @@ -66,6 +67,85 @@ def test_reports_failed_result_with_grouped_one_line_issues refute_includes output, "... 1 more" end + def test_reports_failed_status_when_errors_are_present + io = StringIO.new + issues = Crawlscope::IssueCollection.new + issues.add(code: :fetch_failed, severity: :error, category: :crawl, url: "https://example.com/a", message: "timeout", details: {}) + issues.add(code: :missing_title, severity: :warning, category: :metadata, url: "https://example.com/a", message: "missing ", details: {}) + + result = Crawlscope::Result.new( + base_url: "https://example.com", + sitemap_path: "/tmp/sitemap.xml", + urls: ["https://example.com/a"], + pages: [Object.new], + issues: issues + ) + + Crawlscope::Reporter.new(io: io).report(result) + + output = io.string + + assert_includes output, "Status: FAILED" + assert_includes output, "Issues: 2 total (1 error, 1 warning)" + end + + def test_limits_large_issue_groups + io = StringIO.new + issues = Crawlscope::IssueCollection.new + 21.times do |index| + issues.add( + code: :low_dofollow_inlinks, + severity: :warning, + category: :links, + url: "https://example.com/page-#{index + 1}", + message: "dofollow inbound links 1 below 2", + details: {dofollow_inbound_count: 1, minimum: 2} + ) + end + + result = Crawlscope::Result.new( + base_url: "https://example.com", + sitemap_path: "/tmp/sitemap.xml", + urls: ["https://example.com"], + pages: [Object.new], + issues: issues + ) + + Crawlscope::Reporter.new(io: io).report(result) + + output = io.string + + assert_includes output, "links / low_dofollow_inlinks: 21" + assert_includes output, " - /page-20 inbound 1/2" + refute_includes output, " - /page-21" + assert_includes output, " ... 1 more" + end + + def test_reports_ratio_with_enough_precision_to_show_threshold_difference + io = StringIO.new + issues = Crawlscope::IssueCollection.new + issues.add( + code: :low_unique_token_ratio, + severity: :warning, + category: :content_quality, + url: "https://example.com/a", + message: "visible text has low token variety", + details: {ratio: 0.249, threshold: 0.25} + ) + + result = Crawlscope::Result.new( + base_url: "https://example.com", + sitemap_path: "/tmp/sitemap.xml", + urls: ["https://example.com/a"], + pages: [Object.new], + issues: issues + ) + + Crawlscope::Reporter.new(io: io).report(result) + + assert_includes io.string, "ratio 0.249/0.250" + end + def test_reports_source_details_on_one_line io = StringIO.new issues = Crawlscope::IssueCollection.new diff --git a/test/crawlscope/result_test.rb b/test/crawlscope/result_test.rb new file mode 100644 index 0000000..f8678f9 --- /dev/null +++ b/test/crawlscope/result_test.rb @@ -0,0 +1,35 @@ +# frozen_string_literal: true + +require "test_helper" + +class CrawlscopeResultTest < Minitest::Test + def test_ok_when_result_has_warnings_only + issues = Crawlscope::IssueCollection.new + issues.add(code: :missing_title, severity: :warning, category: :metadata, url: "https://example.com", message: "missing <title>", details: {}) + + result = result_with(issues) + + assert result.ok? + end + + def test_not_ok_when_result_has_errors + issues = Crawlscope::IssueCollection.new + issues.add(code: :fetch_failed, severity: :error, category: :crawl, url: "https://example.com", message: "timeout", details: {}) + + result = result_with(issues) + + refute result.ok? + end + + private + + def result_with(issues) + Crawlscope::Result.new( + base_url: "https://example.com", + sitemap_path: "/tmp/sitemap.xml", + urls: ["https://example.com"], + pages: [Object.new], + issues: issues + ) + end +end diff --git a/test/crawlscope/sitemap_test.rb b/test/crawlscope/sitemap_test.rb index 33c8151..ca83218 100644 --- a/test/crawlscope/sitemap_test.rb +++ b/test/crawlscope/sitemap_test.rb @@ -3,6 +3,19 @@ require "test_helper" class CrawlscopeSitemapTest < Minitest::Test + class RecordingExecutor + attr_reader :batches + + def initialize + @batches = [] + end + + def call(items) + @batches << items + items.map { |item| yield(item) } + end + end + def test_parses_remote_sitemap_urlset stub_request(:get, "https://www.example.com/sitemap.xml") .to_return( @@ -127,4 +140,43 @@ def test_parses_local_sitemap_index_with_absolute_child_sitemap_loc assert_equal ["http://localhost:3000/features/reviews"], parser.urls(base_url: "http://localhost:3000") end end + + def test_child_sitemaps_are_collected_through_the_fetch_executor + Dir.mktmpdir do |dir| + File.write( + File.join(dir, "sitemap.xml"), + <<~XML + <?xml version="1.0" encoding="UTF-8"?> + <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> + <sitemap><loc>first.xml</loc></sitemap> + <sitemap><loc>second.xml</loc></sitemap> + </sitemapindex> + XML + ) + File.write( + File.join(dir, "first.xml"), + <<~XML + <?xml version="1.0" encoding="UTF-8"?> + <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> + <url><loc>http://localhost:3000/first</loc></url> + </urlset> + XML + ) + File.write( + File.join(dir, "second.xml"), + <<~XML + <?xml version="1.0" encoding="UTF-8"?> + <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> + <url><loc>http://localhost:3000/second</loc></url> + </urlset> + XML + ) + + executor = RecordingExecutor.new + parser = Crawlscope::Sitemap.new(path: File.join(dir, "sitemap.xml"), fetch_executor: executor) + + assert_equal ["http://localhost:3000/first", "http://localhost:3000/second"], parser.urls(base_url: "http://localhost:3000") + assert_equal [[File.join(dir, "first.xml"), File.join(dir, "second.xml")]], executor.batches + end + end end diff --git a/test/performance/async_fetch_benchmark.rb b/test/performance/async_fetch_benchmark.rb new file mode 100644 index 0000000..d2315bd --- /dev/null +++ b/test/performance/async_fetch_benchmark.rb @@ -0,0 +1,127 @@ +# frozen_string_literal: true + +$LOAD_PATH.unshift File.expand_path("../../lib", __dir__) + +require "bundler/setup" +require "crawlscope" +require "json" +require "socket" +require "time" + +class DelayedHttpServer + attr_reader :base_url + + def initialize(page_count:, delay_seconds:) + @page_count = page_count + @delay_seconds = delay_seconds + @server = TCPServer.new("127.0.0.1", 0) + @base_url = "http://127.0.0.1:#{@server.addr[1]}" + @threads = [] + end + + def start + @thread = Thread.new do + loop do + socket = @server.accept + @threads << Thread.new(socket) { |client| respond(client) } + rescue IOError + break + end + end + end + + def stop + @server.close + @thread&.join + @threads.each(&:join) + end + + private + + def respond(socket) + request_line = socket.gets.to_s + path = request_line.split[1].to_s + read_headers(socket) + + if path == "/sitemap.xml" + write_response(socket, sitemap_xml, content_type: "application/xml") + else + sleep @delay_seconds + write_response(socket, page_html(path), content_type: "text/html") + end + ensure + socket.close + end + + def read_headers(socket) + loop do + line = socket.gets + break if line.nil? || line == "\r\n" + end + end + + def sitemap_xml + urls = (1..@page_count).map do |index| + "<url><loc>#{@base_url}/pages/#{index}</loc></url>" + end.join + + %(<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">#{urls}</urlset>) + end + + def page_html(path) + <<~HTML + <html> + <head><title>#{path} +

#{path}

#{path} benchmark page

+ + HTML + end + + def write_response(socket, body, content_type:) + socket.write "HTTP/1.1 200 OK\r\n" + socket.write "Content-Type: #{content_type}\r\n" + socket.write "Content-Length: #{body.bytesize}\r\n" + socket.write "Connection: close\r\n" + socket.write "\r\n" + socket.write body + end +end + +def measure(name, base_url:, concurrency:, fetch_executor:) + started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC) + + result = Crawlscope::Crawl.new( + base_url: base_url, + sitemap_path: "#{base_url}/sitemap.xml", + rules: [], + schema_registry: Crawlscope::SchemaRegistry.default, + concurrency: concurrency, + fetch_executor: fetch_executor + ).call + + elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at + [name, {seconds: elapsed.round(3), pages: result.pages.size, issues: result.issues.size}] +end + +server = DelayedHttpServer.new(page_count: 24, delay_seconds: 0.08) +server.start + +begin + results = {} + [ + measure("threaded_concurrency_1", base_url: server.base_url, concurrency: 1, fetch_executor: :threaded), + measure("threaded_concurrency_8", base_url: server.base_url, concurrency: 8, fetch_executor: :threaded), + measure("async_concurrency_8", base_url: server.base_url, concurrency: 8, fetch_executor: :async) + ].each { |name, result| results[name] = result } + + sequential = results.fetch("threaded_concurrency_1").fetch(:seconds) + threaded = results.fetch("threaded_concurrency_8").fetch(:seconds) + async = results.fetch("async_concurrency_8").fetch(:seconds) + + abort "async benchmark failed: async was not meaningfully faster than sequential" unless async < sequential * 0.6 + abort "async benchmark failed: async was more than 2x slower than threaded" if async > threaded * 2.0 + + puts JSON.pretty_generate(results) +ensure + server.stop +end diff --git a/test/performance/fetch_executor_matrix.rb b/test/performance/fetch_executor_matrix.rb new file mode 100644 index 0000000..97d579e --- /dev/null +++ b/test/performance/fetch_executor_matrix.rb @@ -0,0 +1,162 @@ +# frozen_string_literal: true + +$LOAD_PATH.unshift File.expand_path("../../lib", __dir__) + +require "bundler/setup" +require "crawlscope" +require "json" +require "socket" + +class MatrixHttpServer + attr_reader :base_url + + def initialize(page_count:, delay_seconds:, link_targets: false) + @page_count = page_count + @delay_seconds = delay_seconds + @link_targets = link_targets + @server = TCPServer.new("127.0.0.1", 0) + @base_url = "http://127.0.0.1:#{@server.addr[1]}" + @threads = [] + end + + def start + @thread = Thread.new do + loop do + socket = @server.accept + @threads << Thread.new(socket) { |client| respond(client) } + rescue IOError + break + end + end + end + + def stop + @server.close + @thread&.join + @threads.each(&:join) + end + + private + + def respond(socket) + request_line = socket.gets.to_s + path = request_line.split[1].to_s + read_headers(socket) + + if path == "/sitemap.xml" + write_response(socket, sitemap_xml, content_type: "application/xml") + else + sleep @delay_seconds + write_response(socket, page_html(path), content_type: "text/html") + end + ensure + socket.close + end + + def read_headers(socket) + loop do + line = socket.gets + break if line.nil? || line == "\r\n" + end + end + + def sitemap_xml + paths = @link_targets ? ["/seed"] : (1..@page_count).map { |index| "/pages/#{index}" } + urls = paths.map { |path| "#{@base_url}#{path}" }.join + + %(#{urls}) + end + + def page_html(path) + links = if @link_targets && path == "/seed" + (1..@page_count).map { |index| %(Target #{index}) }.join + else + "" + end + + <<~HTML + + + #{path} + + + +

#{path}

#{path} benchmark page

#{links}
+ + + HTML + end + + def write_response(socket, body, content_type:) + socket.write "HTTP/1.1 200 OK\r\n" + socket.write "Content-Type: #{content_type}\r\n" + socket.write "Content-Length: #{body.bytesize}\r\n" + socket.write "Connection: close\r\n" + socket.write "\r\n" + socket.write body + end +end + +def measure(base_url:, concurrency:, fetch_executor:, rules:) + started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC) + + result = Crawlscope::Crawl.new( + base_url: base_url, + sitemap_path: "#{base_url}/sitemap.xml", + rules: rules, + schema_registry: Crawlscope::SchemaRegistry.default, + concurrency: concurrency, + fetch_executor: fetch_executor + ).call + + elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at + {seconds: elapsed, pages: result.pages.size, issues: result.issues.size} +end + +def median(values) + sorted = values.sort + sorted[sorted.length / 2] +end + +def run_case(name:, page_count:, delay_seconds:, concurrency:, link_targets: false) + server = MatrixHttpServer.new(page_count: page_count, delay_seconds: delay_seconds, link_targets: link_targets) + server.start + + rules = link_targets ? [Crawlscope::Rules::Links.new] : [] + threaded = [] + async = [] + + 3.times do + threaded << measure(base_url: server.base_url, concurrency: concurrency, fetch_executor: :threaded, rules: rules) + async << measure(base_url: server.base_url, concurrency: concurrency, fetch_executor: :async, rules: rules) + end + + threaded_seconds = median(threaded.map { |result| result.fetch(:seconds) }) + async_seconds = median(async.map { |result| result.fetch(:seconds) }) + + { + name: name, + page_count: page_count, + delay_seconds: delay_seconds, + concurrency: concurrency, + link_targets: link_targets, + threaded_seconds: threaded_seconds.round(3), + async_seconds: async_seconds.round(3), + async_vs_threaded: (threaded_seconds / async_seconds).round(2), + pages: async.first.fetch(:pages), + issues: async.first.fetch(:issues) + } +ensure + server&.stop +end + +cases = [ + {name: "direct_pages_c8", page_count: 48, delay_seconds: 0.02, concurrency: 8}, + {name: "direct_pages_c16", page_count: 48, delay_seconds: 0.02, concurrency: 16}, + {name: "slow_direct_pages_c8", page_count: 48, delay_seconds: 0.08, concurrency: 8}, + {name: "slow_direct_pages_c16", page_count: 48, delay_seconds: 0.08, concurrency: 16}, + {name: "link_targets_c8", page_count: 48, delay_seconds: 0.02, concurrency: 8, link_targets: true}, + {name: "slow_link_targets_c8", page_count: 48, delay_seconds: 0.08, concurrency: 8, link_targets: true} +] + +puts JSON.pretty_generate(cases.map { |attributes| run_case(**attributes) }) diff --git a/test/performance/sitemap_expansion_benchmark.rb b/test/performance/sitemap_expansion_benchmark.rb new file mode 100644 index 0000000..183534f --- /dev/null +++ b/test/performance/sitemap_expansion_benchmark.rb @@ -0,0 +1,121 @@ +# frozen_string_literal: true + +$LOAD_PATH.unshift File.expand_path("../../lib", __dir__) + +require "bundler/setup" +require "crawlscope" +require "json" +require "socket" + +class DelayedSitemapServer + attr_reader :base_url + + def initialize(child_count:, delay_seconds:) + @child_count = child_count + @delay_seconds = delay_seconds + @server = TCPServer.new("127.0.0.1", 0) + @base_url = "http://127.0.0.1:#{@server.addr[1]}" + @threads = [] + end + + def start + @thread = Thread.new do + loop do + socket = @server.accept + @threads << Thread.new(socket) { |client| respond(client) } + rescue IOError + break + end + end + end + + def stop + @server.close + @thread&.join + @threads.each(&:join) + end + + private + + def respond(socket) + request_line = socket.gets.to_s + path = request_line.split[1].to_s + read_headers(socket) + + if path == "/sitemap.xml" + write_response(socket, sitemap_index, content_type: "application/xml") + else + sleep @delay_seconds + write_response(socket, child_sitemap(path), content_type: "application/xml") + end + ensure + socket.close + end + + def read_headers(socket) + loop do + line = socket.gets + break if line.nil? || line == "\r\n" + end + end + + def sitemap_index + children = (1..@child_count).map do |index| + "#{@base_url}/sitemaps/#{index}.xml" + end.join + + %(#{children}) + end + + def child_sitemap(path) + index = File.basename(path, ".xml") + + %(#{@base_url}/pages/#{index}) + end + + def write_response(socket, body, content_type:) + socket.write "HTTP/1.1 200 OK\r\n" + socket.write "Content-Type: #{content_type}\r\n" + socket.write "Content-Length: #{body.bytesize}\r\n" + socket.write "Connection: close\r\n" + socket.write "\r\n" + socket.write body + end +end + +def measure(name, base_url:, concurrency:, fetch_executor:) + started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC) + + urls = Crawlscope::Sitemap.new( + path: "#{base_url}/sitemap.xml", + concurrency: concurrency, + fetch_executor: fetch_executor, + timeout_seconds: 5 + ).urls(base_url: base_url) + + elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at + [name, {seconds: elapsed.round(3), urls: urls.size}] +end + +server = DelayedSitemapServer.new(child_count: 24, delay_seconds: 0.08) +server.start + +begin + results = {} + [ + measure("threaded_concurrency_1", base_url: server.base_url, concurrency: 1, fetch_executor: :threaded), + measure("threaded_concurrency_8", base_url: server.base_url, concurrency: 8, fetch_executor: :threaded), + measure("async_concurrency_8", base_url: server.base_url, concurrency: 8, fetch_executor: :async) + ].each { |name, result| results[name] = result } + + sequential = results.fetch("threaded_concurrency_1").fetch(:seconds) + threaded = results.fetch("threaded_concurrency_8").fetch(:seconds) + async = results.fetch("async_concurrency_8").fetch(:seconds) + + abort "sitemap benchmark failed: threaded parallelism was not at least 2x faster" unless threaded < sequential * 0.5 + abort "sitemap benchmark failed: async parallelism was not at least 2x faster" unless async < sequential * 0.5 + + puts JSON.pretty_generate(results) +ensure + server.stop +end From 1d814fb3b28113f357467837a5c983fef73d78f8 Mon Sep 17 00:00:00 2001 From: Paulo Fidalgo Date: Mon, 1 Jun 2026 23:14:50 +0100 Subject: [PATCH 2/3] refactor: default HTTP crawling to async Make async the default executor for HTTP crawls while preserving threaded execution for browser rendering and explicit FETCH_EXECUTOR=threaded overrides. Centralize bounded ordered mapping under FetchExecutor.map and remove duplicated rule-level parallel_map helpers from links and uniqueness checks. Verification: bundle exec rake test; bundle exec rake standard; mise exec ruby@3.4.8 -- bundle exec rake test; mise exec ruby@3.4.8 -- bundle exec rake standard; bundle exec rake crawlscope:validate URL=http://localhost:3000 --- README.md | 9 ++--- UPGRADE.md | 9 ++--- async-performance-assessment.md | 5 +-- lib/crawlscope/cli.rb | 9 ++++- lib/crawlscope/configuration.rb | 6 ++-- lib/crawlscope/crawl.rb | 8 +++-- lib/crawlscope/fetch_executor.rb | 7 ++++ lib/crawlscope/fetch_executor/async.rb | 12 +++---- lib/crawlscope/fetch_executor/threaded.rb | 14 ++++---- lib/crawlscope/rules/links.rb | 12 +++---- lib/crawlscope/rules/uniqueness.rb | 12 +++---- perf-improvement.md | 5 +-- test/crawlscope/cli_test.rb | 13 ++++++- test/crawlscope/configuration_test.rb | 9 ++++- test/crawlscope/crawl_test.rb | 32 +++++++++++++++-- test/crawlscope/fetch_executor_test.rb | 44 +++++++++++++++++++++++ 16 files changed, 157 insertions(+), 49 deletions(-) create mode 100644 test/crawlscope/fetch_executor_test.rb diff --git a/README.md b/README.md index c338f15..2566bbd 100644 --- a/README.md +++ b/README.md @@ -199,10 +199,11 @@ bundle exec rake 'crawlscope:validate:ldjson[https://example.com/article]' Plain `rake` does not pass `--url` style flags to tasks. Use `URL=...` or the task-argument form above instead. -`FETCH_EXECUTOR=threaded` is the default and works with Crawlscope's required -runtime dependencies. `FETCH_EXECUTOR=async` uses Ruby's fiber scheduler and -Async::HTTP through Faraday, preserving the same `CONCURRENCY` bound. The CLI -accepts the same setting with `--fetch-executor async`. +`FETCH_EXECUTOR=async` is the default for HTTP crawling. It uses Ruby's fiber +scheduler and Async::HTTP through Faraday, preserving the same `CONCURRENCY` +bound. Use `FETCH_EXECUTOR=threaded` or `--fetch-executor threaded` for the +thread-pool executor. Browser rendering uses the threaded executor by default +because async fetch execution is only supported with HTTP rendering. `crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap. Without `URL`, it checks the configured base URL, falling back to `http://localhost:3000`. diff --git a/UPGRADE.md b/UPGRADE.md index bbc6f6a..f3c9415 100644 --- a/UPGRADE.md +++ b/UPGRADE.md @@ -15,7 +15,8 @@ Recommended migration: 1. Upgrade the host application runtime to Ruby 3.3 or newer. 2. Run `bundle update crawlscope async async-http async-http-faraday`. -3. Keep the default `FETCH_EXECUTOR=threaded` for the first deploy if you want - a conservative rollout. -4. Enable async fetching with `FETCH_EXECUTOR=async` or - `--fetch-executor async` after the app is running on the new Ruby version. +3. Crawlscope now uses `FETCH_EXECUTOR=async` by default for HTTP crawling. +4. Set `FETCH_EXECUTOR=threaded` or pass `--fetch-executor threaded` for a + conservative rollout or for explicit thread-pool execution. +5. Browser rendering continues to use threaded execution by default because + async fetch execution is only supported with HTTP rendering. diff --git a/async-performance-assessment.md b/async-performance-assessment.md index e38b1a7..ef7c30c 100644 --- a/async-performance-assessment.md +++ b/async-performance-assessment.md @@ -149,8 +149,9 @@ Position it as: - potentially better at higher concurrency with lower thread pressure, - equivalent-to-slightly-faster than threaded for the measured local workloads. -If the product goal is a reliable 2x improvement over current default threaded -crawls, the next performance work should not be "more async." It should target: +If the product goal is a reliable 2x improvement over the previous threaded +baseline, the next performance work should not be "more async." It should +target: 1. persistent fetch/result caching across rule phases, 2. optional higher concurrency with per-host rate limits, diff --git a/lib/crawlscope/cli.rb b/lib/crawlscope/cli.rb index dbd2ec4..638db9a 100644 --- a/lib/crawlscope/cli.rb +++ b/lib/crawlscope/cli.rb @@ -134,6 +134,7 @@ def run_validate configure_renderer(resolved_renderer) @configuration.concurrency = resolved_concurrency + fetch_executor_configured = !normalized_string(ENV["FETCH_EXECUTOR"]).nil? @configuration.fetch_executor = resolved_fetch_executor @configuration.network_idle_timeout_seconds = resolved_integer("NETWORK_IDLE_TIMEOUT", default: @configuration.network_idle_timeout_seconds, minimum: 1) @configuration.timeout_seconds = resolved_integer("TIMEOUT", default: @configuration.timeout_seconds, minimum: 1) @@ -170,11 +171,13 @@ def run_validate end opts.on("--fetch-executor NAME", "Use threaded or async fetch execution") do |value| + fetch_executor_configured = true @configuration.fetch_executor = value end end parser.parse!(@argv) + @configuration.fetch_executor = :threaded if @configuration.renderer == :browser && !fetch_executor_configured result = task.validate( base_url: options[:url], @@ -227,7 +230,11 @@ def resolved_concurrency end def resolved_fetch_executor - normalized_string(ENV["FETCH_EXECUTOR"]) || @configuration.fetch_executor + configured_executor = normalized_string(ENV["FETCH_EXECUTOR"]) + return configured_executor if configured_executor + return :threaded if @configuration.renderer == :browser + + @configuration.fetch_executor end def resolved_integer(name, default:, minimum:) diff --git a/lib/crawlscope/configuration.rb b/lib/crawlscope/configuration.rb index afb3e8b..76df814 100644 --- a/lib/crawlscope/configuration.rb +++ b/lib/crawlscope/configuration.rb @@ -7,7 +7,7 @@ class Configuration DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS = 5 DEFAULT_BROWSER_SCROLL_PAGE = true DEFAULT_CONCURRENCY = 10 - DEFAULT_FETCH_EXECUTOR = :threaded + DEFAULT_FETCH_EXECUTOR = :async RENDERERS = %i[http browser].freeze DEFAULT_TIMEOUT_SECONDS = 20 @@ -33,7 +33,9 @@ def concurrency def fetch_executor value = resolve(@fetch_executor) - FetchExecutor.normalize(value.nil? ? DEFAULT_FETCH_EXECUTOR : value) + default = (renderer == :browser) ? :threaded : DEFAULT_FETCH_EXECUTOR + + FetchExecutor.normalize(value.nil? ? default : value) end def browser_concurrency diff --git a/lib/crawlscope/crawl.rb b/lib/crawlscope/crawl.rb index 26fd1da..4bb42dc 100644 --- a/lib/crawlscope/crawl.rb +++ b/lib/crawlscope/crawl.rb @@ -2,16 +2,16 @@ module Crawlscope class Crawl - def initialize(base_url:, sitemap_path:, rules:, schema_registry:, browser_factory: nil, concurrency: Configuration::DEFAULT_CONCURRENCY, fetch_executor: Configuration::DEFAULT_FETCH_EXECUTOR, network_idle_timeout_seconds: Configuration::DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS, renderer: :http, scroll_page: Configuration::DEFAULT_BROWSER_SCROLL_PAGE, timeout_seconds: Configuration::DEFAULT_TIMEOUT_SECONDS, allowed_statuses: Configuration::DEFAULT_ALLOWED_STATUSES) + def initialize(base_url:, sitemap_path:, rules:, schema_registry:, browser_factory: nil, concurrency: Configuration::DEFAULT_CONCURRENCY, fetch_executor: nil, network_idle_timeout_seconds: Configuration::DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS, renderer: :http, scroll_page: Configuration::DEFAULT_BROWSER_SCROLL_PAGE, timeout_seconds: Configuration::DEFAULT_TIMEOUT_SECONDS, allowed_statuses: Configuration::DEFAULT_ALLOWED_STATUSES) @base_url = base_url @sitemap_path = sitemap_path @rules = Array(rules) @schema_registry = schema_registry @browser_factory = browser_factory @concurrency = concurrency - @fetch_executor = fetch_executor @network_idle_timeout_seconds = network_idle_timeout_seconds @renderer = renderer.to_sym + @fetch_executor = fetch_executor || default_fetch_executor @scroll_page = scroll_page @timeout_seconds = timeout_seconds @allowed_statuses = allowed_statuses @@ -88,6 +88,10 @@ def validate_fetch_executor! raise ConfigurationError, "Async fetch execution is only supported with http rendering" end + def default_fetch_executor + (@renderer == :browser) ? :threaded : Configuration::DEFAULT_FETCH_EXECUTOR + end + def context Context.new( allowed_statuses: @allowed_statuses, diff --git a/lib/crawlscope/fetch_executor.rb b/lib/crawlscope/fetch_executor.rb index 50b2980..c462a71 100644 --- a/lib/crawlscope/fetch_executor.rb +++ b/lib/crawlscope/fetch_executor.rb @@ -17,6 +17,13 @@ def build(name:, concurrency:) end end + def map(name:, concurrency:, items:, &block) + items = Array(items) + return items.map(&block) if items.size < 2 || concurrency.to_i <= 1 + + build(name: name, concurrency: concurrency).call(items, &block) + end + def normalize(name) return name if name.respond_to?(:call) diff --git a/lib/crawlscope/fetch_executor/async.rb b/lib/crawlscope/fetch_executor/async.rb index fc7cfdb..5bdf563 100644 --- a/lib/crawlscope/fetch_executor/async.rb +++ b/lib/crawlscope/fetch_executor/async.rb @@ -10,22 +10,22 @@ def initialize(concurrency:) @concurrency = concurrency end - def call(urls) - indexed_urls = Array(urls).each_with_index.to_a - pages = Array.new(indexed_urls.size) + def call(items) + indexed_items = Array(items).each_with_index.to_a + results = Array.new(indexed_items.size) Sync do |parent| semaphore = ::Async::Semaphore.new(@concurrency) - tasks = indexed_urls.map do |url, index| + tasks = indexed_items.map do |item, index| semaphore.async(parent: parent) do - pages[index] = yield(url) + results[index] = yield(item) end end tasks.each(&:wait) end - pages + results end end end diff --git a/lib/crawlscope/fetch_executor/threaded.rb b/lib/crawlscope/fetch_executor/threaded.rb index e9410f8..d819104 100644 --- a/lib/crawlscope/fetch_executor/threaded.rb +++ b/lib/crawlscope/fetch_executor/threaded.rb @@ -9,23 +9,23 @@ def initialize(concurrency:) @concurrency = concurrency end - def call(urls) - indexed_urls = Array(urls).each_with_index.to_a - pages = Array.new(indexed_urls.size) + def call(items) + indexed_items = Array(items).each_with_index.to_a + results = Array.new(indexed_items.size) mutex = Mutex.new pool = Concurrent::FixedThreadPool.new(@concurrency) - indexed_urls.each do |url, index| + indexed_items.each do |item, index| pool.post do - page = yield(url) - mutex.synchronize { pages[index] = page } + result = yield(item) + mutex.synchronize { results[index] = result } end end pool.shutdown pool.wait_for_termination - pages + results end end end diff --git a/lib/crawlscope/rules/links.rb b/lib/crawlscope/rules/links.rb index d725853..bc14b78 100644 --- a/lib/crawlscope/rules/links.rb +++ b/lib/crawlscope/rules/links.rb @@ -48,7 +48,11 @@ def contextual_links(doc) def extract_links(pages) html_pages = pages.select(&:html?) - parallel_map(html_pages) { |page| page_links(page) }.flatten + FetchExecutor.map( + name: @fetch_executor, + concurrency: @concurrency, + items: html_pages + ) { |page| page_links(page) }.flatten end def page_links(page) @@ -629,12 +633,6 @@ def context_value(context, name, default: nil) default end end - - def parallel_map(items, &block) - return items.map(&block) if items.size < 2 || @fetch_executor.nil? || @concurrency.to_i <= 1 - - FetchExecutor.build(name: @fetch_executor, concurrency: @concurrency).call(items, &block) - end end end end diff --git a/lib/crawlscope/rules/uniqueness.rb b/lib/crawlscope/rules/uniqueness.rb index baa43a6..3a3dc07 100644 --- a/lib/crawlscope/rules/uniqueness.rb +++ b/lib/crawlscope/rules/uniqueness.rb @@ -68,7 +68,11 @@ def summary_for(page) def summarize_pages(pages) html_pages = pages.select(&:html?) - parallel_map(html_pages) { |page| summary_for(page) } + FetchExecutor.map( + name: @fetch_executor, + concurrency: @concurrency, + items: html_pages + ) { |page| summary_for(page) } end def validate_duplicates(page_summaries, issues) @@ -192,12 +196,6 @@ def context_value(context, name, default: nil) default end end - - def parallel_map(items, &block) - return items.map(&block) if items.size < 2 || @fetch_executor.nil? || @concurrency.to_i <= 1 - - FetchExecutor.build(name: @fetch_executor, concurrency: @concurrency).call(items, &block) - end end end end diff --git a/perf-improvement.md b/perf-improvement.md index 6b7dd0d..1b5482f 100644 --- a/perf-improvement.md +++ b/perf-improvement.md @@ -26,8 +26,9 @@ Defer Plan 3 until benchmarks prove the graph-level complexity is needed. ## Implementation Update Plan 1 has been implemented with the deliberate Ruby `>= 3.3` contract change. -The default executor remains threaded, and `fetch_executor: :async` is now -available for HTTP crawls through `async-http-faraday`. +The default executor is now async for HTTP crawls through +`async-http-faraday`, with `fetch_executor: :threaded` retained as an explicit +fallback. The safe follow-on parallelization work is also implemented: diff --git a/test/crawlscope/cli_test.rb b/test/crawlscope/cli_test.rb index a91a92a..1a33968 100644 --- a/test/crawlscope/cli_test.rb +++ b/test/crawlscope/cli_test.rb @@ -9,7 +9,7 @@ class FakeConfiguration def initialize @base_url = nil @concurrency = 10 - @fetch_executor = :threaded + @fetch_executor = :async @network_idle_timeout_seconds = 5 @renderer = :http @timeout_seconds = 20 @@ -134,6 +134,17 @@ def test_validate_reads_fetch_executor_from_environment assert_equal "async", configuration.fetch_executor end + def test_validate_uses_threaded_executor_for_browser_rendering_by_default + configuration = FakeConfiguration.new + task = FakeTask.new + + status = Crawlscope::Cli.start(["validate", "--url", "https://example.com", "--renderer", "browser"], out: StringIO.new, err: StringIO.new, configuration: configuration, task: task) + + assert_equal 0, status + assert_equal :browser, configuration.renderer + assert_equal :threaded, configuration.fetch_executor + end + def test_ldjson_reads_urls_from_environment configuration = FakeConfiguration.new task = FakeTask.new diff --git a/test/crawlscope/configuration_test.rb b/test/crawlscope/configuration_test.rb index 2b5efa2..48d490c 100644 --- a/test/crawlscope/configuration_test.rb +++ b/test/crawlscope/configuration_test.rb @@ -57,7 +57,7 @@ def test_defaults_are_normalized assert_equal [200, 301, 302], config.allowed_statuses assert_equal 10, config.concurrency - assert_equal :threaded, config.fetch_executor + assert_equal :async, config.fetch_executor assert_equal 4, config.browser_concurrency assert_equal 5, config.network_idle_timeout_seconds assert_equal :http, config.renderer @@ -66,6 +66,13 @@ def test_defaults_are_normalized assert config.scroll_page? end + def test_browser_renderer_defaults_to_threaded_fetch_executor + config = Crawlscope::Configuration.new + config.renderer = :browser + + assert_equal :threaded, config.fetch_executor + end + def test_configured_values_are_normalized config = Crawlscope::Configuration.new config.allowed_statuses = ["200", "404"] diff --git a/test/crawlscope/crawl_test.rb b/test/crawlscope/crawl_test.rb index 8b94255..353cebd 100644 --- a/test/crawlscope/crawl_test.rb +++ b/test/crawlscope/crawl_test.rb @@ -42,6 +42,29 @@ def teardown FileUtils.rm_rf(@tmp_dir) end + def test_http_renderer_defaults_to_async_executor + crawl = Crawlscope::Crawl.new( + base_url: "https://example.com", + sitemap_path: @sitemap_path, + rules: [], + schema_registry: Crawlscope::SchemaRegistry.default + ) + + assert_equal :async, crawl.instance_variable_get(:@fetch_executor) + end + + def test_browser_renderer_defaults_to_threaded_executor + crawl = Crawlscope::Crawl.new( + base_url: "https://example.com", + sitemap_path: @sitemap_path, + rules: [], + schema_registry: Crawlscope::SchemaRegistry.default, + renderer: :browser + ) + + assert_equal :threaded, crawl.instance_variable_get(:@fetch_executor) + end + def test_returns_ok_when_metadata_is_valid File.write( @sitemap_path, @@ -86,7 +109,8 @@ def test_returns_ok_when_metadata_is_valid base_url: "https://example.com", sitemap_path: @sitemap_path, rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules, - schema_registry: Crawlscope::SchemaRegistry.default + schema_registry: Crawlscope::SchemaRegistry.default, + fetch_executor: :threaded ).call assert result.ok? @@ -127,7 +151,8 @@ def test_collects_metadata_issues_for_invalid_page base_url: "https://example.com", sitemap_path: @sitemap_path, rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules, - schema_registry: Crawlscope::SchemaRegistry.default + schema_registry: Crawlscope::SchemaRegistry.default, + fetch_executor: :threaded ).call assert result.ok? @@ -264,7 +289,8 @@ def test_reports_sitemap_redirect_url base_url: "https://example.com", sitemap_path: @sitemap_path, rules: [], - schema_registry: Crawlscope::SchemaRegistry.default + schema_registry: Crawlscope::SchemaRegistry.default, + fetch_executor: :threaded ).call assert_includes result.issues.to_a.map(&:code), :sitemap_redirect_url diff --git a/test/crawlscope/fetch_executor_test.rb b/test/crawlscope/fetch_executor_test.rb new file mode 100644 index 0000000..65873e4 --- /dev/null +++ b/test/crawlscope/fetch_executor_test.rb @@ -0,0 +1,44 @@ +# frozen_string_literal: true + +require "test_helper" + +class CrawlscopeFetchExecutorTest < Minitest::Test + class RecordingExecutor + attr_reader :items + + def call(items) + @items = items + items.map { |item| yield(item) } + end + end + + def test_map_preserves_input_order + results = Crawlscope::FetchExecutor.map(name: :threaded, concurrency: 2, items: [3, 1, 2]) do |item| + item * 10 + end + + assert_equal [30, 10, 20], results + end + + def test_map_uses_sequential_fallback_for_single_item + executor = RecordingExecutor.new + + results = Crawlscope::FetchExecutor.map(name: executor, concurrency: 4, items: ["one"]) do |item| + item.upcase + end + + assert_equal ["ONE"], results + assert_nil executor.items + end + + def test_map_uses_injected_executor_for_parallel_work + executor = RecordingExecutor.new + + results = Crawlscope::FetchExecutor.map(name: executor, concurrency: 4, items: %w[a b]) do |item| + item.upcase + end + + assert_equal %w[A B], results + assert_equal %w[a b], executor.items + end +end From 20a59ab59c4853229c5beb89c8be8f2f72e4ace3 Mon Sep 17 00:00:00 2001 From: Paulo Fidalgo Date: Mon, 1 Jun 2026 23:20:51 +0100 Subject: [PATCH 3/3] chore: update Ruby CI matrix Drop Ruby 3.2 from the CI matrix now that Crawlscope requires Ruby 3.3 or newer for the async runtime dependencies. --- .github/workflows/ruby.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index 37ca9fe..1ec5062 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - ruby-version: ["3.2", "3.4"] + ruby-version: ["3.3", "3.4", "4.0.3"] steps: - uses: actions/checkout@v4