From 75a20b4841276ddd9ac3256437b6921f8fdbb7e1 Mon Sep 17 00:00:00 2001
From: Paulo Fidalgo <paulo.fidalgo.pt@gmail.com>
Date: Mon, 1 Jun 2026 10:51:37 +0100
Subject: [PATCH 1/2] fix: respect noindex targets in sitemap link audit

  Carry resolved page headers and parsed HTML into link validation so
  Crawlscope can avoid reporting noindexed internal targets as indexable
  pages missing from the sitemap. Reuse the existing robots directive
  parsing from the indexability rule for both meta robots and
  X-Robots-Tag.

  Also exclude form markup from the visible-text-ratio HTML denominator,
  so form-heavy tool pages are not pushed toward filler copy just to
  offset autocomplete payloads or hidden control markup.
---
 lib/crawlscope/crawl.rb                      |  2 +
 lib/crawlscope/document_text.rb              |  9 +++-
 lib/crawlscope/rules/content_quality.rb      |  2 +-
 lib/crawlscope/rules/indexability.rb         | 34 +++++++++++---
 lib/crawlscope/rules/links.rb                |  6 +++
 test/crawlscope/content_quality_rule_test.rb | 18 ++++++++
 test/crawlscope/links_rule_test.rb           | 48 ++++++++++++++++++++
 7 files changed, 110 insertions(+), 9 deletions(-)

diff --git a/lib/crawlscope/crawl.rb b/lib/crawlscope/crawl.rb
index c0b7374..ad3af48 100644
--- a/lib/crawlscope/crawl.rb
+++ b/lib/crawlscope/crawl.rb
@@ -126,8 +126,10 @@ def resolved_page(normalized_url)
     def resolution(page, normalized_url, crawled:)
       {
         crawled: crawled,
+        doc: page.doc,
         error: page.error,
         final_url: page.normalized_final_url || normalized_url,
+        headers: page.headers,
         html: page.html?,
         status: page.status
       }
diff --git a/lib/crawlscope/document_text.rb b/lib/crawlscope/document_text.rb
index dbbb3c3..ca4e6a0 100644
--- a/lib/crawlscope/document_text.rb
+++ b/lib/crawlscope/document_text.rb
@@ -3,6 +3,7 @@
 module Crawlscope
   module DocumentText
     REMOVED_SELECTORS = "script, style, noscript, template, svg"
+    CONTENT_RATIO_REMOVED_SELECTORS = "#{REMOVED_SELECTORS}, form"
     TOKEN_PATTERN = /[[:alnum:]]+/
 
     module_function
@@ -15,6 +16,10 @@ def html_for(doc, selector: "main")
       root_for(doc, selector: selector)&.to_html.to_s
     end
 
+    def content_ratio_html_for(doc, selector: "main")
+      root_for(doc, selector: selector, removed_selectors: CONTENT_RATIO_REMOVED_SELECTORS)&.to_html.to_s
+    end
+
     def text_for(doc, selector: "main")
       normalize(root_for(doc, selector: selector)&.text)
     end
@@ -27,11 +32,11 @@ def normalize(text)
       text.to_s.gsub(/\s+/, " ").strip
     end
 
-    def root_for(doc, selector:)
+    def root_for(doc, selector:, removed_selectors: REMOVED_SELECTORS)
       return unless doc
 
       copy = doc.dup
-      copy.css(REMOVED_SELECTORS).remove
+      copy.css(removed_selectors).remove
 
       root = selector.to_s.empty? ? nil : copy.at_css(selector)
       root || copy.at_css("body") || copy
diff --git a/lib/crawlscope/rules/content_quality.rb b/lib/crawlscope/rules/content_quality.rb
index 134078e..3fa76b9 100644
--- a/lib/crawlscope/rules/content_quality.rb
+++ b/lib/crawlscope/rules/content_quality.rb
@@ -55,7 +55,7 @@ def validate_unique_token_ratio(page, issues)
       end
 
       def validate_visible_text_ratio(page, issues)
-        html_bytes = DocumentText.html_for(page.doc).bytesize
+        html_bytes = DocumentText.content_ratio_html_for(page.doc).bytesize
         return if html_bytes.zero?
 
         visible_text = DocumentText.text_for(page.doc)
diff --git a/lib/crawlscope/rules/indexability.rb b/lib/crawlscope/rules/indexability.rb
index 06aae36..00c446e 100644
--- a/lib/crawlscope/rules/indexability.rb
+++ b/lib/crawlscope/rules/indexability.rb
@@ -6,6 +6,31 @@ class Indexability
       ROBOTS_META_SELECTOR = 'meta[name="robots"], meta[name="googlebot"]'
       X_ROBOTS_TAG_HEADER = "x-robots-tag"
 
+      def self.noindex_header?(headers)
+        noindex?(header_value(headers, X_ROBOTS_TAG_HEADER))
+      end
+
+      def self.noindex_meta?(doc)
+        return false unless doc
+
+        doc.css(ROBOTS_META_SELECTOR).any? { |tag| noindex?(tag["content"].to_s) }
+      end
+
+      def self.header_value(headers, name)
+        headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s
+      end
+
+      def self.directives(value)
+        value
+          .split(",")
+          .map { |directive| directive.split(":", 2).last.to_s.strip }
+          .reject(&:empty?)
+      end
+
+      def self.noindex?(value)
+        directives(value).any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") }
+      end
+
       attr_reader :code
 
       def initialize
@@ -28,18 +53,15 @@ def normalized_sitemap_urls(urls)
       end
 
       def header_value(page, name)
-        page.headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s
+        self.class.header_value(page.headers, name)
       end
 
       def directives(value)
-        value
-          .split(",")
-          .map { |directive| directive.split(":", 2).last.to_s.strip }
-          .reject(&:empty?)
+        self.class.directives(value)
       end
 
       def noindex?(value)
-        directives(value).any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") }
+        self.class.noindex?(value)
       end
 
       def follow?(value)
diff --git a/lib/crawlscope/rules/links.rb b/lib/crawlscope/rules/links.rb
index 5cf4b27..86cbe75 100644
--- a/lib/crawlscope/rules/links.rb
+++ b/lib/crawlscope/rules/links.rb
@@ -243,6 +243,11 @@ def html?
           resolution && resolution[:html]
         end
 
+        def noindex?
+          Crawlscope::Rules::Indexability.noindex_header?(resolution[:headers] || {}) ||
+            Crawlscope::Rules::Indexability.noindex_meta?(resolution[:doc])
+        end
+
         def status
           resolution && resolution[:status]
         end
@@ -421,6 +426,7 @@ def validate_indexable_pages_missing_from_sitemap(urls, resolved_links, issues)
 
           target = resolve_target(final_url)
           next unless target.allowed?(@allowed_statuses) && target.html?
+          next if target.noindex?
 
           reported_urls << final_url
 
diff --git a/test/crawlscope/content_quality_rule_test.rb b/test/crawlscope/content_quality_rule_test.rb
index e6525d2..520aed2 100644
--- a/test/crawlscope/content_quality_rule_test.rb
+++ b/test/crawlscope/content_quality_rule_test.rb
@@ -27,6 +27,24 @@ def test_visible_text_ratio_ignores_markup_outside_main_content
     refute_includes issues.to_a.map(&:code), :low_visible_text_ratio
   end
 
+  def test_visible_text_ratio_ignores_form_payload_markup
+    issues = Crawlscope::IssueCollection.new
+    page = page_with(
+      main: <<~HTML
+        <p>#{Array.new(260) { |index| "word#{index}" }.join(" ")}</p>
+        <form>
+          <div data-select-autocomplete-options-value="#{"x" * 50_000}">
+            <input type="text" name="country">
+          </div>
+        </form>
+      HTML
+    )
+
+    Crawlscope::Rules::ContentQuality.new.call(urls: [page.url], pages: [page], issues: issues)
+
+    refute_includes issues.to_a.map(&:code), :low_visible_text_ratio
+  end
+
   def test_reports_low_unique_token_ratio_for_repetitive_content
     issues = Crawlscope::IssueCollection.new
     page = page_with(main: ("hotel location service " * 100).strip)
diff --git a/test/crawlscope/links_rule_test.rb b/test/crawlscope/links_rule_test.rb
index 6873ca3..78119ac 100644
--- a/test/crawlscope/links_rule_test.rb
+++ b/test/crawlscope/links_rule_test.rb
@@ -242,6 +242,54 @@ def test_reports_indexable_internal_pages_missing_from_sitemap
     assert_equal "https://example.com/hidden", issue.url
   end
 
+  def test_does_not_report_noindex_internal_pages_missing_from_sitemap
+    issues = Crawlscope::IssueCollection.new
+    resolver = lambda do |target_url|
+      {
+        crawled: false,
+        doc: Nokogiri::HTML("<head><meta name=\"robots\" content=\"noindex, follow\"></head>"),
+        error: nil,
+        final_url: target_url,
+        headers: {},
+        html: true,
+        status: 200
+      }
+    end
+
+    Crawlscope::Rules::Links.new.call(
+      urls: ["https://example.com/guide"],
+      pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/hidden\">Hidden</a></main>")],
+      issues: issues,
+      context: context(resolver: resolver)
+    )
+
+    refute_includes issues.to_a.map(&:code), :indexable_page_missing_from_sitemap
+  end
+
+  def test_does_not_report_x_robots_noindex_internal_pages_missing_from_sitemap
+    issues = Crawlscope::IssueCollection.new
+    resolver = lambda do |target_url|
+      {
+        crawled: false,
+        doc: Nokogiri::HTML("<main>Hidden</main>"),
+        error: nil,
+        final_url: target_url,
+        headers: {"X-Robots-Tag" => "noindex"},
+        html: true,
+        status: 200
+      }
+    end
+
+    Crawlscope::Rules::Links.new.call(
+      urls: ["https://example.com/guide"],
+      pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/hidden\">Hidden</a></main>")],
+      issues: issues,
+      context: context(resolver: resolver)
+    )
+
+    refute_includes issues.to_a.map(&:code), :indexable_page_missing_from_sitemap
+  end
+
   def test_reports_url_hygiene_issues
     issues = Crawlscope::IssueCollection.new
     long_path = "a" * 2_050

From c5c5b82bf4b015abcee15cb612e0eba75cd09142 Mon Sep 17 00:00:00 2001
From: Paulo Fidalgo <paulo.fidalgo.pt@gmail.com>
Date: Mon, 1 Jun 2026 11:34:04 +0100
Subject: [PATCH 2/2] fix: improve validation report readability

Group validation issues by category and code so large reports are easier to scan while still printing every issue. Keep issue rows compact and document the grouped output shape.

Verification: bundle exec ruby -Itest test/crawlscope/reporter_test.rb; bundle exec rake test; bundle exec rake standard
---
 README.md                        |  23 ++++++
 lib/crawlscope/reporter.rb       | 117 +++++++++++++++++++++++++++----
 test/crawlscope/reporter_test.rb |  67 +++++++++++++++---
 3 files changed, 184 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index b53a20d..e172560 100644
--- a/README.md
+++ b/README.md
@@ -87,6 +87,29 @@ crawlscope validate --url https://example.com --sitemap https://example.com/site
 
 Child sitemap indexes are supported automatically.
 
+Validation output is grouped for terminal scanning:
+
+```text
+Crawlscope validation
+Base URL: https://example.com
+Sitemap: https://example.com/sitemap.xml
+URLs: 24
+Pages: 24
+Status: FAILED
+Issues: 3 3 warnings
+
+Summary:
+  links            2
+  metadata         1
+
+links / low_dofollow_inlinks: 2
+  - /pricing  inbound 1/2  sources: /
+  - /features  inbound 1/2  sources: /
+
+metadata / missing_title: 1
+  - /draft  missing <title>
+```
+
 ## Ruby Usage
 
 ```ruby
diff --git a/lib/crawlscope/reporter.rb b/lib/crawlscope/reporter.rb
index d159aa9..9425c6f 100644
--- a/lib/crawlscope/reporter.rb
+++ b/lib/crawlscope/reporter.rb
@@ -1,5 +1,7 @@
 # frozen_string_literal: true
 
+require "uri"
+
 module Crawlscope
   class Reporter
     def initialize(io:)
@@ -19,30 +21,121 @@ def report(result)
       end
 
       @io.puts("Status: FAILED")
-      @io.puts("Issues: #{result.issues.size}")
+      @io.puts("Issues: #{result.issues.size} #{severity_summary(result.issues)}")
       @io.puts("")
 
-      report_grouped_issues("Severity", result.issues.by_severity)
+      report_summary(result.issues)
       @io.puts("")
-      report_grouped_issues("Category", result.issues.by_category)
+      report_issue_groups(result.issues, base_url: result.base_url)
     end
 
     private
 
-    def report_grouped_issues(title, grouped_issues)
-      @io.puts("#{title}:")
+    def severity_summary(issues)
+      grouped = issues.by_severity
+      return "" if grouped.empty?
+
+      grouped
+        .sort_by { |severity, severity_issues| [-severity_issues.size, severity.to_s] }
+        .map { |severity, severity_issues| "#{severity_issues.size} #{pluralize(severity, severity_issues.size)}" }
+        .join(", ")
+    end
 
-      grouped_issues.sort_by { |name, _issues| name.to_s }.each do |name, issues|
-        @io.puts("#{name}: #{issues.size}")
-        issues.each do |issue|
-          @io.puts("  - #{offense(issue)}")
+    def report_summary(issues)
+      @io.puts("Summary:")
+
+      issues.by_category
+        .sort_by { |category, category_issues| [-category_issues.size, category.to_s] }
+        .each do |category, category_issues|
+          @io.puts("  #{category.to_s.ljust(16)} #{category_issues.size}")
         end
+    end
+
+    def report_issue_groups(issues, base_url:)
+      grouped = issues.to_a.group_by { |issue| [issue.category, issue.code] }
+
+      grouped
+        .sort_by { |(category, code), grouped_issues| [-grouped_issues.size, category.to_s, code.to_s] }
+        .each do |(category, code), grouped_issues|
+          @io.puts("#{category} / #{code}: #{grouped_issues.size}")
+
+          grouped_issues.each do |issue|
+            @io.puts("  - #{compact_issue(issue, base_url: base_url)}")
+          end
+
+          @io.puts("")
+        end
+    end
+
+    def compact_issue(issue, base_url:)
+      parts = []
+      parts << relative_url(issue.url, base_url: base_url) if issue.url
+
+      detail = compact_detail(issue, base_url: base_url)
+      parts << detail unless detail.empty?
+
+      parts.compact.join("  ")
+    end
+
+    def compact_detail(issue, base_url:)
+      details = issue.details || {}
+      fragments = []
+
+      inbound = details[:dofollow_inbound_count] || details[:inbound_count]
+      fragments << "inbound #{inbound}/#{details[:minimum]}" if inbound && details[:minimum]
+
+      if details[:ratio] && details[:threshold]
+        fragments << "ratio #{format_number(details[:ratio])}/#{format_number(details[:threshold])}"
+      end
+
+      fragments << "count #{details[:count]}" if details[:count]
+      fragments << "length #{details[:length]}" if details[:length]
+      fragments << "status #{details[:status]}" if details[:status]
+      fragments << "final: #{relative_url(details[:final_url], base_url: base_url)}" if details[:final_url]
+      fragments << "sources: #{relative_urls(details[:source_urls], base_url: base_url).join(", ")}" if details[:source_urls]&.any?
+      fragments << "source: #{relative_url(details[:source_url], base_url: base_url)}" if details[:source_url]
+      fragments << "targets: #{relative_urls(details[:target_urls], base_url: base_url).join(", ")}" if details[:target_urls]&.any?
+
+      return issue.message if fragments.empty?
+
+      case issue.code
+      when :low_dofollow_inlinks, :low_inbound_anchor_links, :low_unique_token_ratio, :low_visible_text_ratio
+        fragments.join("  ")
+      else
+        ([issue.message] + fragments).join("  ")
       end
     end
 
-    def offense(issue)
-      parts = ["[#{issue.severity}]", issue.code, issue.url, issue.message]
-      parts.compact.join(" ")
+    def relative_urls(urls, base_url:)
+      Array(urls).map { |url| relative_url(url, base_url: base_url) }
+    end
+
+    def relative_url(url, base_url:)
+      return url unless url && base_url
+
+      uri = URI.parse(url)
+      base_uri = URI.parse(base_url)
+
+      return url unless uri.host == base_uri.host && uri.scheme == base_uri.scheme && uri.port == base_uri.port
+
+      relative = uri.path.to_s.empty? ? "/" : uri.path
+      relative += "?#{uri.query}" if uri.query
+      relative += "##{uri.fragment}" if uri.fragment
+      relative
+    rescue URI::InvalidURIError
+      url
+    end
+
+    def format_number(value)
+      return format("%.2f", value) if value.is_a?(Float)
+
+      value.to_s
+    end
+
+    def pluralize(word, count)
+      return word.to_s if count == 1
+
+      "#{word}s"
     end
   end
 end
diff --git a/test/crawlscope/reporter_test.rb b/test/crawlscope/reporter_test.rb
index 2fcf75c..6a5c3d1 100644
--- a/test/crawlscope/reporter_test.rb
+++ b/test/crawlscope/reporter_test.rb
@@ -23,11 +23,25 @@ def test_reports_ok_result
     refute_includes output, "Status: FAILED"
   end
 
-  def test_reports_failed_result_with_grouped_counts_and_offenses
+  def test_reports_failed_result_with_grouped_one_line_issues
     io = StringIO.new
     issues = Crawlscope::IssueCollection.new
+    4.times do |index|
+      issues.add(
+        code: :low_dofollow_inlinks,
+        severity: :warning,
+        category: :links,
+        url: "https://example.com/page-#{index + 1}",
+        message: "dofollow inbound links 1 below 2",
+        details: {
+          dofollow_inbound_count: 1,
+          minimum: 2,
+          source_urls: ["https://example.com/source-#{index + 1}"]
+        }
+      )
+    end
     issues.add(code: :missing_title, severity: :warning, category: :metadata, url: "https://example.com/a", message: "missing <title>", details: {})
-    issues.add(code: :broken_internal_link, severity: :notice, category: :links, url: "https://example.com/b", message: "broken internal link", details: {})
+
     result = Crawlscope::Result.new(
       base_url: "https://example.com",
       sitemap_path: "/tmp/sitemap.xml",
@@ -41,14 +55,45 @@ def test_reports_failed_result_with_grouped_counts_and_offenses
     output = io.string
 
     assert_includes output, "Status: FAILED"
-    assert_includes output, "Issues: 2"
-    assert_includes output, "Severity:"
-    assert_includes output, "notice: 1"
-    assert_includes output, "warning: 1"
-    assert_includes output, "Category:"
-    assert_includes output, "links: 1"
-    assert_includes output, "metadata: 1"
-    assert_includes output, "  - [warning] missing_title https://example.com/a missing <title>"
-    assert_includes output, "  - [notice] broken_internal_link https://example.com/b broken internal link"
+    assert_includes output, "Issues: 5 5 warnings"
+    assert_includes output, "Summary:"
+    assert_includes output, "links / low_dofollow_inlinks: 4"
+    assert_includes output, "  - /page-1  inbound 1/2  sources: /source-1"
+    assert_includes output, "  - /page-4  inbound 1/2  sources: /source-4"
+    assert_includes output, "metadata / missing_title: 1"
+    refute_includes output, "Severity:"
+    refute_includes output, "Category:"
+    refute_includes output, "... 1 more"
+  end
+
+  def test_reports_source_details_on_one_line
+    io = StringIO.new
+    issues = Crawlscope::IssueCollection.new
+    4.times do |index|
+      issues.add(
+        code: :indexable_page_missing_from_sitemap,
+        severity: :warning,
+        category: :sitemaps,
+        url: "https://example.com/overview-#{index + 1}",
+        message: "indexable internal page is missing from sitemap",
+        details: {source_url: "https://example.com/source-#{index + 1}"}
+      )
+    end
+
+    result = Crawlscope::Result.new(
+      base_url: "https://example.com",
+      sitemap_path: "/tmp/sitemap.xml",
+      urls: ["https://example.com"],
+      pages: [Object.new],
+      issues: issues
+    )
+
+    Crawlscope::Reporter.new(io: io).report(result)
+
+    output = io.string
+
+    assert_includes output, "sitemaps / indexable_page_missing_from_sitemap: 4"
+    assert_includes output, "  - /overview-1  indexable internal page is missing from sitemap  source: /source-1"
+    assert_includes output, "  - /overview-4  indexable internal page is missing from sitemap  source: /source-4"
   end
 end