ethos-link · fidalgo · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026
diff --git a/README.md b/README.md
@@ -87,6 +87,29 @@ crawlscope validate --url https://example.com --sitemap https://example.com/site
 
 Child sitemap indexes are supported automatically.
 
+Validation output is grouped for terminal scanning:
+
+```text
+Crawlscope validation
+Base URL: https://example.com
+Sitemap: https://example.com/sitemap.xml
+URLs: 24
+Pages: 24
+Status: FAILED
+Issues: 3 3 warnings
+
+Summary:
+  links            2
+  metadata         1
+
+links / low_dofollow_inlinks: 2
+  - /pricing  inbound 1/2  sources: /
+  - /features  inbound 1/2  sources: /
+
+metadata / missing_title: 1
+  - /draft  missing <title>
+```
+
 ## Ruby Usage
 
 ```ruby

diff --git a/lib/crawlscope/crawl.rb b/lib/crawlscope/crawl.rb
@@ -126,8 +126,10 @@ def resolved_page(normalized_url)
     def resolution(page, normalized_url, crawled:)
       {
         crawled: crawled,
+        doc: page.doc,
         error: page.error,
         final_url: page.normalized_final_url || normalized_url,
+        headers: page.headers,
         html: page.html?,
         status: page.status
       }

diff --git a/lib/crawlscope/document_text.rb b/lib/crawlscope/document_text.rb
@@ -3,6 +3,7 @@
 module Crawlscope
   module DocumentText
     REMOVED_SELECTORS = "script, style, noscript, template, svg"
+    CONTENT_RATIO_REMOVED_SELECTORS = "#{REMOVED_SELECTORS}, form"
     TOKEN_PATTERN = /[[:alnum:]]+/
 
     module_function
@@ -15,6 +16,10 @@ def html_for(doc, selector: "main")
       root_for(doc, selector: selector)&.to_html.to_s
     end
 
+    def content_ratio_html_for(doc, selector: "main")
+      root_for(doc, selector: selector, removed_selectors: CONTENT_RATIO_REMOVED_SELECTORS)&.to_html.to_s
+    end
+
     def text_for(doc, selector: "main")
       normalize(root_for(doc, selector: selector)&.text)
     end
@@ -27,11 +32,11 @@ def normalize(text)
       text.to_s.gsub(/\s+/, " ").strip
     end
 
-    def root_for(doc, selector:)
+    def root_for(doc, selector:, removed_selectors: REMOVED_SELECTORS)
       return unless doc
 
       copy = doc.dup
-      copy.css(REMOVED_SELECTORS).remove
+      copy.css(removed_selectors).remove
 
       root = selector.to_s.empty? ? nil : copy.at_css(selector)
       root || copy.at_css("body") || copy

diff --git a/lib/crawlscope/reporter.rb b/lib/crawlscope/reporter.rb
@@ -1,5 +1,7 @@
 # frozen_string_literal: true
 
+require "uri"
+
 module Crawlscope
   class Reporter
     def initialize(io:)
@@ -19,30 +21,121 @@ def report(result)
       end
 
       @io.puts("Status: FAILED")
-      @io.puts("Issues: #{result.issues.size}")
+      @io.puts("Issues: #{result.issues.size} #{severity_summary(result.issues)}")
       @io.puts("")
 
-      report_grouped_issues("Severity", result.issues.by_severity)
+      report_summary(result.issues)
       @io.puts("")
-      report_grouped_issues("Category", result.issues.by_category)
+      report_issue_groups(result.issues, base_url: result.base_url)
     end
 
     private
 
-    def report_grouped_issues(title, grouped_issues)
-      @io.puts("#{title}:")
+    def severity_summary(issues)
+      grouped = issues.by_severity
+      return "" if grouped.empty?
+
+      grouped
+        .sort_by { |severity, severity_issues| [-severity_issues.size, severity.to_s] }
+        .map { |severity, severity_issues| "#{severity_issues.size} #{pluralize(severity, severity_issues.size)}" }
+        .join(", ")
+    end
 
-      grouped_issues.sort_by { |name, _issues| name.to_s }.each do |name, issues|
-        @io.puts("#{name}: #{issues.size}")
-        issues.each do |issue|
-          @io.puts("  - #{offense(issue)}")
+    def report_summary(issues)
+      @io.puts("Summary:")
+
+      issues.by_category
+        .sort_by { |category, category_issues| [-category_issues.size, category.to_s] }
+        .each do |category, category_issues|
+          @io.puts("  #{category.to_s.ljust(16)} #{category_issues.size}")
         end
+    end
+
+    def report_issue_groups(issues, base_url:)
+      grouped = issues.to_a.group_by { |issue| [issue.category, issue.code] }
+
+      grouped
+        .sort_by { |(category, code), grouped_issues| [-grouped_issues.size, category.to_s, code.to_s] }
+        .each do |(category, code), grouped_issues|
+          @io.puts("#{category} / #{code}: #{grouped_issues.size}")
+
+          grouped_issues.each do |issue|
+            @io.puts("  - #{compact_issue(issue, base_url: base_url)}")
+          end
+
+          @io.puts("")
+        end
+    end
+
+    def compact_issue(issue, base_url:)
+      parts = []
+      parts << relative_url(issue.url, base_url: base_url) if issue.url
+
+      detail = compact_detail(issue, base_url: base_url)
+      parts << detail unless detail.empty?
+
+      parts.compact.join("  ")
+    end
+
+    def compact_detail(issue, base_url:)
+      details = issue.details || {}
+      fragments = []
+
+      inbound = details[:dofollow_inbound_count] || details[:inbound_count]
+      fragments << "inbound #{inbound}/#{details[:minimum]}" if inbound && details[:minimum]
+
+      if details[:ratio] && details[:threshold]
+        fragments << "ratio #{format_number(details[:ratio])}/#{format_number(details[:threshold])}"
+      end
+
+      fragments << "count #{details[:count]}" if details[:count]
+      fragments << "length #{details[:length]}" if details[:length]
+      fragments << "status #{details[:status]}" if details[:status]
+      fragments << "final: #{relative_url(details[:final_url], base_url: base_url)}" if details[:final_url]
+      fragments << "sources: #{relative_urls(details[:source_urls], base_url: base_url).join(", ")}" if details[:source_urls]&.any?
+      fragments << "source: #{relative_url(details[:source_url], base_url: base_url)}" if details[:source_url]
+      fragments << "targets: #{relative_urls(details[:target_urls], base_url: base_url).join(", ")}" if details[:target_urls]&.any?
+
+      return issue.message if fragments.empty?
+
+      case issue.code
+      when :low_dofollow_inlinks, :low_inbound_anchor_links, :low_unique_token_ratio, :low_visible_text_ratio
+        fragments.join("  ")
+      else
+        ([issue.message] + fragments).join("  ")
       end
     end
 
-    def offense(issue)
-      parts = ["[#{issue.severity}]", issue.code, issue.url, issue.message]
-      parts.compact.join(" ")
+    def relative_urls(urls, base_url:)
+      Array(urls).map { |url| relative_url(url, base_url: base_url) }
+    end
+
+    def relative_url(url, base_url:)
+      return url unless url && base_url
+
+      uri = URI.parse(url)
+      base_uri = URI.parse(base_url)
+
+      return url unless uri.host == base_uri.host && uri.scheme == base_uri.scheme && uri.port == base_uri.port
+
+      relative = uri.path.to_s.empty? ? "/" : uri.path
+      relative += "?#{uri.query}" if uri.query
+      relative += "##{uri.fragment}" if uri.fragment
+      relative
+    rescue URI::InvalidURIError
+      url
+    end
+
+    def format_number(value)
+      return format("%.2f", value) if value.is_a?(Float)
+
+      value.to_s
+    end
+
+    def pluralize(word, count)
+      return word.to_s if count == 1
+
+      "#{word}s"
     end
   end
 end
diff --git a/lib/crawlscope/rules/content_quality.rb b/lib/crawlscope/rules/content_quality.rb
@@ -55,7 +55,7 @@ def validate_unique_token_ratio(page, issues)
       end
 
       def validate_visible_text_ratio(page, issues)
-        html_bytes = DocumentText.html_for(page.doc).bytesize
+        html_bytes = DocumentText.content_ratio_html_for(page.doc).bytesize
         return if html_bytes.zero?
 
         visible_text = DocumentText.text_for(page.doc)

diff --git a/lib/crawlscope/rules/indexability.rb b/lib/crawlscope/rules/indexability.rb
@@ -6,6 +6,31 @@ class Indexability
       ROBOTS_META_SELECTOR = 'meta[name="robots"], meta[name="googlebot"]'
       X_ROBOTS_TAG_HEADER = "x-robots-tag"
 
+      def self.noindex_header?(headers)
+        noindex?(header_value(headers, X_ROBOTS_TAG_HEADER))
+      end
+
+      def self.noindex_meta?(doc)
+        return false unless doc
+
+        doc.css(ROBOTS_META_SELECTOR).any? { |tag| noindex?(tag["content"].to_s) }
+      end
+
+      def self.header_value(headers, name)
+        headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s
+      end
+
+      def self.directives(value)
+        value
+          .split(",")
+          .map { |directive| directive.split(":", 2).last.to_s.strip }
+          .reject(&:empty?)
+      end
+
+      def self.noindex?(value)
+        directives(value).any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") }
+      end
+
       attr_reader :code
 
       def initialize
@@ -28,18 +53,15 @@ def normalized_sitemap_urls(urls)
       end
 
       def header_value(page, name)
-        page.headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s
+        self.class.header_value(page.headers, name)
       end
 
       def directives(value)
-        value
-          .split(",")
-          .map { |directive| directive.split(":", 2).last.to_s.strip }
-          .reject(&:empty?)
+        self.class.directives(value)
       end
 
       def noindex?(value)
-        directives(value).any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") }
+        self.class.noindex?(value)
       end
 
       def follow?(value)

diff --git a/lib/crawlscope/rules/links.rb b/lib/crawlscope/rules/links.rb
@@ -243,6 +243,11 @@ def html?
           resolution && resolution[:html]
         end
 
+        def noindex?
+          Crawlscope::Rules::Indexability.noindex_header?(resolution[:headers] || {}) ||
+            Crawlscope::Rules::Indexability.noindex_meta?(resolution[:doc])
+        end
+
         def status
           resolution && resolution[:status]
         end
@@ -421,6 +426,7 @@ def validate_indexable_pages_missing_from_sitemap(urls, resolved_links, issues)
 
           target = resolve_target(final_url)
           next unless target.allowed?(@allowed_statuses) && target.html?
+          next if target.noindex?
 
           reported_urls << final_url
 

diff --git a/test/crawlscope/content_quality_rule_test.rb b/test/crawlscope/content_quality_rule_test.rb
@@ -27,6 +27,24 @@ def test_visible_text_ratio_ignores_markup_outside_main_content
     refute_includes issues.to_a.map(&:code), :low_visible_text_ratio
   end
 
+  def test_visible_text_ratio_ignores_form_payload_markup
+    issues = Crawlscope::IssueCollection.new
+    page = page_with(
+      main: <<~HTML
+        <p>#{Array.new(260) { |index| "word#{index}" }.join(" ")}</p>
+        <form>
+          <div data-select-autocomplete-options-value="#{"x" * 50_000}">
+            <input type="text" name="country">
+          </div>
+        </form>
+      HTML
+    )
+
+    Crawlscope::Rules::ContentQuality.new.call(urls: [page.url], pages: [page], issues: issues)
+
+    refute_includes issues.to_a.map(&:code), :low_visible_text_ratio
+  end
+
   def test_reports_low_unique_token_ratio_for_repetitive_content
     issues = Crawlscope::IssueCollection.new
     page = page_with(main: ("hotel location service " * 100).strip)

diff --git a/test/crawlscope/links_rule_test.rb b/test/crawlscope/links_rule_test.rb
@@ -242,6 +242,54 @@ def test_reports_indexable_internal_pages_missing_from_sitemap
     assert_equal "https://example.com/hidden", issue.url
   end
 
+  def test_does_not_report_noindex_internal_pages_missing_from_sitemap
+    issues = Crawlscope::IssueCollection.new
+    resolver = lambda do |target_url|
+      {
+        crawled: false,
+        doc: Nokogiri::HTML("<head><meta name=\"robots\" content=\"noindex, follow\"></head>"),
+        error: nil,
+        final_url: target_url,
+        headers: {},
+        html: true,
+        status: 200
+      }
+    end
+
+    Crawlscope::Rules::Links.new.call(
+      urls: ["https://example.com/guide"],
+      pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/hidden\">Hidden</a></main>")],
+      issues: issues,
+      context: context(resolver: resolver)
+    )
+
+    refute_includes issues.to_a.map(&:code), :indexable_page_missing_from_sitemap
+  end
+
+  def test_does_not_report_x_robots_noindex_internal_pages_missing_from_sitemap
+    issues = Crawlscope::IssueCollection.new
+    resolver = lambda do |target_url|
+      {
+        crawled: false,
+        doc: Nokogiri::HTML("<main>Hidden</main>"),
+        error: nil,
+        final_url: target_url,
+        headers: {"X-Robots-Tag" => "noindex"},
+        html: true,
+        status: 200
+      }
+    end
+
+    Crawlscope::Rules::Links.new.call(
+      urls: ["https://example.com/guide"],
+      pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/hidden\">Hidden</a></main>")],
+      issues: issues,
+      context: context(resolver: resolver)
+    )
+
+    refute_includes issues.to_a.map(&:code), :indexable_page_missing_from_sitemap
+  end
+
   def test_reports_url_hygiene_issues
     issues = Crawlscope::IssueCollection.new
     long_path = "a" * 2_050