ethos-link · fidalgo · May 31, 2026 · May 31, 2026
diff --git a/README.md b/README.md
@@ -161,12 +161,18 @@ The same validation surface is also available in the gem repository itself throu
 
 ```bash
 bundle exec rake crawlscope:validate URL=https://example.com
+bundle exec rake 'crawlscope:validate[https://example.com]'
 bundle exec rake crawlscope:validate:metadata URL=https://example.com
+bundle exec rake 'crawlscope:validate:metadata[https://example.com]'
 bundle exec rake crawlscope:validate:ldjson URL=https://example.com/article
+bundle exec rake 'crawlscope:validate:ldjson[https://example.com/article]'
 ```
 
 `crawlscope:validate` runs all default sitemap rules: indexability, metadata, structured data, uniqueness, content quality, and links. `URL` is the site base. Without `SITEMAP`, Crawlscope uses `/sitemap.xml`. With `SITEMAP`, Crawlscope uses `URL` as the site base and validates URLs from that sitemap. `SITEMAP` may be a full URL or a local file path.
 
+Plain `rake` does not pass `--url` style flags to tasks. Use `URL=...` or the
+task-argument form above instead.
+
 `crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap. Without `URL`, it checks the configured base URL, falling back to `http://localhost:3000`.
 
 ### Structured Data URL Audit

diff --git a/Rakefile b/Rakefile
@@ -183,35 +183,35 @@ namespace :release do
 end
 
 namespace :crawlscope do
-  desc "Validate URLs with all default Crawlscope rules. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
-  task :validate do
-    Crawlscope::RakeTasks.validate
+  desc "Validate URLs with all default Crawlscope rules. Args: [url,sitemap,rules]. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
+  task :validate, [:url, :sitemap, :rules] do |_task, args|
+    Crawlscope::RakeTasks.validate(url: args[:url], sitemap_path: args[:sitemap], rule_names: args[:rules])
   end
 
   namespace :validate do
-    desc "Directly validate JSON-LD on one or more URLs. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
-    task :ldjson do
-      Crawlscope::RakeTasks.ldjson
+    desc "Directly validate JSON-LD on one URL. Args: [url]. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
+    task :ldjson, [:url] do |_task, args|
+      Crawlscope::RakeTasks.ldjson(urls: args[:url])
     end
 
-    desc "Validate URLs with the metadata rule. ENV: URL, SITEMAP, JS=1"
-    task :metadata do
-      Crawlscope::RakeTasks.validate_rule("metadata")
+    desc "Validate URLs with the metadata rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
+    task :metadata, [:url, :sitemap] do |_task, args|
+      Crawlscope::RakeTasks.validate_rule("metadata", url: args[:url], sitemap_path: args[:sitemap])
     end
 
-    desc "Validate sitemap URLs with the structured_data rule. ENV: URL, SITEMAP, JS=1"
-    task :structured_data do
-      Crawlscope::RakeTasks.validate_rule("structured_data")
+    desc "Validate sitemap URLs with the structured_data rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
+    task :structured_data, [:url, :sitemap] do |_task, args|
+      Crawlscope::RakeTasks.validate_rule("structured_data", url: args[:url], sitemap_path: args[:sitemap])
     end
 
-    desc "Validate URLs with the uniqueness rule. ENV: URL, SITEMAP, JS=1"
-    task :uniqueness do
-      Crawlscope::RakeTasks.validate_rule("uniqueness")
+    desc "Validate URLs with the uniqueness rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
+    task :uniqueness, [:url, :sitemap] do |_task, args|
+      Crawlscope::RakeTasks.validate_rule("uniqueness", url: args[:url], sitemap_path: args[:sitemap])
     end
 
-    desc "Validate URLs with the links rule. ENV: URL, SITEMAP, JS=1"
-    task :links do
-      Crawlscope::RakeTasks.validate_rule("links")
+    desc "Validate URLs with the links rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
+    task :links, [:url, :sitemap] do |_task, args|
+      Crawlscope::RakeTasks.validate_rule("links", url: args[:url], sitemap_path: args[:sitemap])
     end
   end
 end

diff --git a/docs/ahrefs-check-candidates.md b/docs/ahrefs-check-candidates.md
@@ -0,0 +1,236 @@
+# Ahrefs Issue Crosswalk
+
+This crosswalk compares the Ahrefs issue list against Crawlscope's current
+rules and identifies checks that fit the gem's sitemap-driven, internal-audit
+scope.
+
+## Current Coverage
+
+- `noindex_meta`, `noindex_header`: covers "Noindex page" via meta robots and
+  `X-Robots-Tag`.
+- `redirected_page`: covers crawled sitemap URLs that redirect.
+- `internal_link_redirects`: covers "Page has links to redirect".
+- `low_inbound_anchor_links`: partly covers "Page has only one dofollow
+  incoming internal link", but currently does not distinguish dofollow from
+  nofollow.
+- `title_too_long`: covers "Title too long".
+- `meta_description_too_short`, `meta_description_too_long`: covers meta
+  description length checks.
+- `missing_title`, `missing_meta_description`, `missing_h1`, `multiple_h1`,
+  `missing_canonical`, `canonical_mismatch`, and
+  `incomplete_open_graph_tags`: covers adjacent metadata checks not present in
+  the pasted Ahrefs list.
+- `structured_data_parse_error`, `structured_data_schema_error`: covers
+  schema.org-style validation errors.
+- `missing_structured_data`, `missing_job_posting`, `multiple_job_postings`:
+  covers structured-data presence and domain-specific JobPosting rules.
+- `duplicate_title`, `duplicate_meta_description`,
+  `duplicate_content_fingerprint`, `near_duplicate_content`: covers uniqueness
+  checks beyond the pasted Ahrefs list.
+- `thin_visible_text`, `low_visible_text_ratio`, `low_unique_token_ratio`:
+  covers content-quality checks beyond the pasted Ahrefs list.
+- `unexpected_status`, `fetch_failed`, `broken_internal_link`: covers status
+  and broken-link checks beyond the pasted Ahrefs list.
+
+## Candidate Checks To Add
+
+### High Fit
+
+1. `nofollow_meta`
+   - Ahrefs match: "Nofollow page".
+   - Detect `nofollow` in meta robots and `X-Robots-Tag`.
+   - This should share parsing with the existing noindex checks so
+     `noindex,nofollow`, `none`, and scoped directives are handled consistently.
+
+2. `noindex_follow_meta` / `noindex_follow_header`
+   - Ahrefs match: "Noindex follow page".
+   - Detect pages that explicitly combine `noindex` with `follow`.
+   - This is useful because current Crawlscope reports noindex but does not
+     classify whether internal links remain followable.
+
+3. `noindex_nofollow_meta` / `noindex_nofollow_header`
+   - Ahrefs match: "Noindex and nofollow page".
+   - Detect pages that block indexing and link following.
+   - Could also treat `none` as equivalent to `noindex,nofollow`.
+
+4. `canonical_no_internal_inlinks`
+   - Ahrefs match: "Canonical URL has no incoming internal links".
+   - For each sitemap URL, resolve its canonical target and count internal
+     links to the canonical URL/path.
+   - This is more precise than `low_inbound_anchor_links`, which counts links
+     to the crawled URL/final path rather than canonical target.
+
+5. `nofollow_internal_outlinks`
+   - Ahrefs match: "Page has nofollow outgoing internal links".
+   - Extend link extraction to retain `rel` attributes and flag internal
+     anchors with `rel~="nofollow"`.
+   - This also unlocks incoming nofollow/dofollow mix checks.
+
+6. `only_nofollow_internal_inlinks`
+   - Ahrefs match: "Page has nofollow incoming internal links only".
+   - Count incoming internal links by follow state.
+   - Report when a sitemap URL has at least one internal inlink but zero
+     dofollow internal inlinks.
+
+7. `mixed_follow_internal_inlinks`
+   - Ahrefs match: "Page has nofollow and dofollow incoming internal links".
+   - Count incoming internal links by follow state.
+   - Report when both counts are positive, with source samples for each class.
+
+8. `low_dofollow_inlinks`
+   - Ahrefs match: "Page has only one dofollow incoming internal link".
+   - Replace or supplement `low_inbound_anchor_links` with a dofollow-specific
+     count.
+   - Make the threshold configurable; Ahrefs surfaces "only one", but a host
+     app may want `minimum_dofollow_inlinks = 2`.
+
+9. `indexable_page_missing_from_sitemap`
+   - Ahrefs match: "Indexable page not in sitemap".
+   - Use discovered internal links to find crawlable, indexable HTML pages that
+     are not present in the sitemap.
+   - This requires Crawlscope to keep discovered internal targets, not only
+     sitemap URLs, so it is a larger links/crawl contract change.
+
+10. `sitemap_noindex_url`
+    - Ahrefs adjacent documented issue: "Noindex page in sitemap".
+    - Current noindex checks run on sitemap pages, but this issue name would
+      make the sitemap/indexability conflict explicit.
+    - It can be implemented as a second issue emitted from the indexability
+      rule, or as a dedicated sitemap consistency rule.
+
+11. `sitemap_redirect_url`
+    - Ahrefs match: "3XX redirect" and adjacent "3XX redirect in sitemap".
+    - Current `redirected_page` covers this behavior; adding this alias/code
+      would make reports match external audit tools more directly.
+
+12. `http_internal_link`
+    - Ahrefs adjacent issue: "HTTPS page has internal links to HTTP".
+    - Detect internal links from HTTPS pages to HTTP URLs on the same host.
+    - This is more actionable than relying on redirect detection after fetch.
+
+13. `canonical_points_to_redirect`
+    - Ahrefs adjacent issue: "Canonical points to redirect".
+    - Resolve canonical targets and flag canonical URLs that redirect.
+    - This fits the existing canonical metadata rule but needs target
+      resolution similar to the links rule.
+
+14. `canonical_points_to_error`
+    - Ahrefs adjacent issues: "Canonical points to 4XX" and "Canonical points
+      to 5XX".
+    - Resolve canonical targets and flag non-success responses.
+
+### Medium Fit
+
+15. `multiple_title_tags`
+    - Ahrefs documented issue: "Multiple title tags".
+    - Current rules check for missing, overlong, and cross-page duplicate titles,
+      but not multiple `<title>` elements on the same page.
+
+16. `multiple_meta_descriptions`
+    - Ahrefs documented issue: "Multiple meta description tags".
+    - Current metadata rule checks missing/short/long description, but not
+      duplicate description tags on the same page.
+
+17. `empty_h1`
+    - Ahrefs documented issue: "H1 tag missing or empty".
+    - Current `missing_h1` only checks the element count. An empty `<h1>` should
+      be reported separately or treated as missing.
+
+18. `page_has_no_outgoing_links`
+    - Ahrefs documented issue: "Page has no outgoing links".
+    - Detect indexable HTML pages with zero meaningful outgoing anchors.
+    - The rule should ignore skipped Rails/CDN/mail/tel links consistently with
+      the current links rule.
+
+19. `orphan_page`
+    - Ahrefs documented issue: "Orphan page".
+    - Similar to `canonical_no_internal_inlinks`, but counts inlinks to the
+      page's own URL rather than to its canonical target.
+    - This overlaps with a configurable `low_dofollow_inlinks` threshold of
+      one, so avoid double-reporting.
+
+20. `non_canonical_page_in_sitemap`
+    - Ahrefs documented issue: "Non-canonical page in sitemap".
+    - Current `canonical_mismatch` flags the page-level mismatch; this new code
+      would explicitly state the sitemap contract violation.
+
+21. `duplicate_pages_without_canonical`
+    - Ahrefs documented issue: "Duplicate pages without canonical".
+    - Current uniqueness rules find duplicate content/title/description. This
+      would connect duplicate clusters to canonical presence and consistency.
+
+22. `url_double_slash`
+    - Ahrefs documented issue: "Double slash in URL".
+    - Detect sitemap or internal-link URLs whose path contains accidental
+      double slashes.
+    - Low implementation cost, but lower impact than link/indexability checks.
+
+23. `url_too_long`
+    - Ahrefs Page Explorer exposes URL length.
+    - Useful as a notice-level metadata/url hygiene check.
+
+24. `structured_data_missing_type`
+    - Ahrefs structured-data docs call out missing `@type`.
+    - Current schema validation may catch this for registered schemas, but a
+      generic issue code would make raw schema.org failures easier to act on.
+
+25. `structured_data_invalid_type`
+    - Ahrefs structured-data docs call out invalid schema types.
+    - Could be implemented only if Crawlscope adopts or vendors a schema.org
+      vocabulary source; otherwise keep using configured JSON schemas.
+
+26. `structured_data_invalid_property`
+    - Ahrefs structured-data docs call out invalid schema properties.
+    - Same caveat as invalid type: this needs schema.org vocabulary validation,
+      not only local JSON schemas.
+
+### Lower Fit Or Requires External Data
+
+27. `noindex_page_became_indexable`
+    - Ahrefs match: "Noindex page became indexable".
+    - Requires historical crawl snapshots. Not a good fit until Crawlscope has
+      persistence or report comparison.
+
+28. `h1_changed`, `meta_description_changed`, `title_tag_changed`,
+    `serp_title_changed`
+    - Ahrefs match: changed-content issues.
+    - Requires prior crawl snapshots or external SERP data. Not a current
+      stateless gem fit.
+
+29. `page_and_serp_titles_do_not_match`
+    - Ahrefs match: SERP title mismatch.
+    - Requires search-result data. Out of scope for a deterministic sitemap
+      crawler unless a host app injects SERP observations.
+
+30. `pages_added_to_sitemaps`
+    - Ahrefs match: "Pages added to sitemaps".
+    - Requires comparing current sitemap URLs with a prior crawl.
+
+31. `pages_to_submit_to_indexnow`
+    - Ahrefs match: "Pages to submit to IndexNow".
+    - Requires change detection and an IndexNow integration. Better as a host
+      app workflow than a default Crawlscope rule.
+
+32. `google_rich_results_validation_error`
+    - Ahrefs match: "Structured data has Google rich results validation error".
+    - Crawlscope can validate local schema contracts today, but Google rich
+      result validation requires Google-specific rule coverage and may diverge
+      from schema.org validation. Add only if the gem owns those feature
+      schemas explicitly.
+
+## Recommended First Batch
+
+1. Add a shared robots directive parser and emit `nofollow_meta`,
+   `noindex_follow_*`, and `noindex_nofollow_*`.
+2. Extend link extraction with follow state and emit `nofollow_internal_outlinks`,
+   `only_nofollow_internal_inlinks`, `mixed_follow_internal_inlinks`, and
+   `low_dofollow_inlinks`.
+3. Add canonical target checks:
+   `canonical_no_internal_inlinks`, `canonical_points_to_redirect`, and
+   `canonical_points_to_error`.
+4. Add sitemap consistency aliases for already-observed conditions:
+   `sitemap_noindex_url`, `sitemap_redirect_url`, and
+   `non_canonical_page_in_sitemap`.
+5. Add simple metadata hygiene checks:
+   `multiple_title_tags`, `multiple_meta_descriptions`, and `empty_h1`.
+
diff --git a/lib/crawlscope/cli.rb b/lib/crawlscope/cli.rb
@@ -37,11 +37,14 @@ def call
         @err.puts(general_usage)
         1
       end
-    rescue OptionParser::InvalidOption, OptionParser::MissingArgument, ConfigurationError, ValidationError, ArgumentError => error
+    rescue OptionParser::InvalidOption, OptionParser::MissingArgument, ConfigurationError, ArgumentError => error
       @err.puts(error.message)
       @err.puts("")
       @err.puts(general_usage)
       1
+    rescue ValidationError => error
+      @err.puts(error.message)
+      1
     end
 
     private

diff --git a/lib/crawlscope/crawl.rb b/lib/crawlscope/crawl.rb
@@ -83,6 +83,7 @@ def collect(pages, issues)
           issues.add(code: :unexpected_status, severity: :error, category: :crawl, url: page.url, message: "HTTP #{page.status}", details: {status: page.status})
         elsif redirected?(page)
           issues.add(code: :redirected_page, severity: :warning, category: :crawl, url: page.url, message: "redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status})
+          issues.add(code: :sitemap_redirect_url, severity: :warning, category: :sitemaps, url: page.url, message: "sitemap URL redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status})
         end
       end
     end
@@ -127,6 +128,7 @@ def resolution(page, normalized_url, crawled:)
         crawled: crawled,
         error: page.error,
         final_url: page.normalized_final_url || normalized_url,
+        html: page.html?,
         status: page.status
       }
     end

diff --git a/lib/crawlscope/rake_tasks.rb b/lib/crawlscope/rake_tasks.rb
@@ -4,25 +4,40 @@ module Crawlscope
   module RakeTasks
     module_function
 
-    def validate
-      run("validate")
+    def validate(url: nil, sitemap_path: nil, rule_names: nil)
+      run("validate", argv: validate_argv(url: url, sitemap_path: sitemap_path, rule_names: rule_names))
     end
 
-    def ldjson
-      run("ldjson")
+    def ldjson(urls: nil)
+      run("ldjson", argv: ldjson_argv(urls: urls))
     end
 
-    def validate_rule(rule)
-      original_rules = ENV["RULES"]
-      ENV["RULES"] = rule
-      validate
-    ensure
-      ENV["RULES"] = original_rules
+    def validate_rule(rule, url: nil, sitemap_path: nil)
+      validate(url: url, sitemap_path: sitemap_path, rule_names: rule)
     end
 
-    def run(command)
-      status = Cli.start([command], out: $stdout, err: $stderr)
+    def run(command, argv: [])
+      status = Cli.start([command, *argv], out: $stdout, err: $stderr)
       exit(status) unless status.zero?
     end
+
+    def validate_argv(url:, sitemap_path:, rule_names:)
+      [
+        option_pair("--url", url),
+        option_pair("--sitemap", sitemap_path),
+        option_pair("--rules", rule_names)
+      ].compact.flatten
+    end
+
+    def ldjson_argv(urls:)
+      Array(urls).flat_map { |url| option_pair("--url", url) }.compact
+    end
+
+    def option_pair(name, value)
+      value = value.to_s.strip
+      return if value.empty?
+
+      [name, value]
+    end
   end
 end