From 75a20b4841276ddd9ac3256437b6921f8fdbb7e1 Mon Sep 17 00:00:00 2001 From: Paulo Fidalgo Date: Mon, 1 Jun 2026 10:51:37 +0100 Subject: [PATCH 1/2] fix: respect noindex targets in sitemap link audit Carry resolved page headers and parsed HTML into link validation so Crawlscope can avoid reporting noindexed internal targets as indexable pages missing from the sitemap. Reuse the existing robots directive parsing from the indexability rule for both meta robots and X-Robots-Tag. Also exclude form markup from the visible-text-ratio HTML denominator, so form-heavy tool pages are not pushed toward filler copy just to offset autocomplete payloads or hidden control markup. --- lib/crawlscope/crawl.rb | 2 + lib/crawlscope/document_text.rb | 9 +++- lib/crawlscope/rules/content_quality.rb | 2 +- lib/crawlscope/rules/indexability.rb | 34 +++++++++++--- lib/crawlscope/rules/links.rb | 6 +++ test/crawlscope/content_quality_rule_test.rb | 18 ++++++++ test/crawlscope/links_rule_test.rb | 48 ++++++++++++++++++++ 7 files changed, 110 insertions(+), 9 deletions(-) diff --git a/lib/crawlscope/crawl.rb b/lib/crawlscope/crawl.rb index c0b7374..ad3af48 100644 --- a/lib/crawlscope/crawl.rb +++ b/lib/crawlscope/crawl.rb @@ -126,8 +126,10 @@ def resolved_page(normalized_url) def resolution(page, normalized_url, crawled:) { crawled: crawled, + doc: page.doc, error: page.error, final_url: page.normalized_final_url || normalized_url, + headers: page.headers, html: page.html?, status: page.status } diff --git a/lib/crawlscope/document_text.rb b/lib/crawlscope/document_text.rb index dbbb3c3..ca4e6a0 100644 --- a/lib/crawlscope/document_text.rb +++ b/lib/crawlscope/document_text.rb @@ -3,6 +3,7 @@ module Crawlscope module DocumentText REMOVED_SELECTORS = "script, style, noscript, template, svg" + CONTENT_RATIO_REMOVED_SELECTORS = "#{REMOVED_SELECTORS}, form" TOKEN_PATTERN = /[[:alnum:]]+/ module_function @@ -15,6 +16,10 @@ def html_for(doc, selector: "main") root_for(doc, selector: selector)&.to_html.to_s end + def content_ratio_html_for(doc, selector: "main") + root_for(doc, selector: selector, removed_selectors: CONTENT_RATIO_REMOVED_SELECTORS)&.to_html.to_s + end + def text_for(doc, selector: "main") normalize(root_for(doc, selector: selector)&.text) end @@ -27,11 +32,11 @@ def normalize(text) text.to_s.gsub(/\s+/, " ").strip end - def root_for(doc, selector:) + def root_for(doc, selector:, removed_selectors: REMOVED_SELECTORS) return unless doc copy = doc.dup - copy.css(REMOVED_SELECTORS).remove + copy.css(removed_selectors).remove root = selector.to_s.empty? ? nil : copy.at_css(selector) root || copy.at_css("body") || copy diff --git a/lib/crawlscope/rules/content_quality.rb b/lib/crawlscope/rules/content_quality.rb index 134078e..3fa76b9 100644 --- a/lib/crawlscope/rules/content_quality.rb +++ b/lib/crawlscope/rules/content_quality.rb @@ -55,7 +55,7 @@ def validate_unique_token_ratio(page, issues) end def validate_visible_text_ratio(page, issues) - html_bytes = DocumentText.html_for(page.doc).bytesize + html_bytes = DocumentText.content_ratio_html_for(page.doc).bytesize return if html_bytes.zero? visible_text = DocumentText.text_for(page.doc) diff --git a/lib/crawlscope/rules/indexability.rb b/lib/crawlscope/rules/indexability.rb index 06aae36..00c446e 100644 --- a/lib/crawlscope/rules/indexability.rb +++ b/lib/crawlscope/rules/indexability.rb @@ -6,6 +6,31 @@ class Indexability ROBOTS_META_SELECTOR = 'meta[name="robots"], meta[name="googlebot"]' X_ROBOTS_TAG_HEADER = "x-robots-tag" + def self.noindex_header?(headers) + noindex?(header_value(headers, X_ROBOTS_TAG_HEADER)) + end + + def self.noindex_meta?(doc) + return false unless doc + + doc.css(ROBOTS_META_SELECTOR).any? { |tag| noindex?(tag["content"].to_s) } + end + + def self.header_value(headers, name) + headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s + end + + def self.directives(value) + value + .split(",") + .map { |directive| directive.split(":", 2).last.to_s.strip } + .reject(&:empty?) + end + + def self.noindex?(value) + directives(value).any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") } + end + attr_reader :code def initialize @@ -28,18 +53,15 @@ def normalized_sitemap_urls(urls) end def header_value(page, name) - page.headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s + self.class.header_value(page.headers, name) end def directives(value) - value - .split(",") - .map { |directive| directive.split(":", 2).last.to_s.strip } - .reject(&:empty?) + self.class.directives(value) end def noindex?(value) - directives(value).any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") } + self.class.noindex?(value) end def follow?(value) diff --git a/lib/crawlscope/rules/links.rb b/lib/crawlscope/rules/links.rb index 5cf4b27..86cbe75 100644 --- a/lib/crawlscope/rules/links.rb +++ b/lib/crawlscope/rules/links.rb @@ -243,6 +243,11 @@ def html? resolution && resolution[:html] end + def noindex? + Crawlscope::Rules::Indexability.noindex_header?(resolution[:headers] || {}) || + Crawlscope::Rules::Indexability.noindex_meta?(resolution[:doc]) + end + def status resolution && resolution[:status] end @@ -421,6 +426,7 @@ def validate_indexable_pages_missing_from_sitemap(urls, resolved_links, issues) target = resolve_target(final_url) next unless target.allowed?(@allowed_statuses) && target.html? + next if target.noindex? reported_urls << final_url diff --git a/test/crawlscope/content_quality_rule_test.rb b/test/crawlscope/content_quality_rule_test.rb index e6525d2..520aed2 100644 --- a/test/crawlscope/content_quality_rule_test.rb +++ b/test/crawlscope/content_quality_rule_test.rb @@ -27,6 +27,24 @@ def test_visible_text_ratio_ignores_markup_outside_main_content refute_includes issues.to_a.map(&:code), :low_visible_text_ratio end + def test_visible_text_ratio_ignores_form_payload_markup + issues = Crawlscope::IssueCollection.new + page = page_with( + main: <<~HTML +

#{Array.new(260) { |index| "word#{index}" }.join(" ")}

+
+
+ +
+
+ HTML + ) + + Crawlscope::Rules::ContentQuality.new.call(urls: [page.url], pages: [page], issues: issues) + + refute_includes issues.to_a.map(&:code), :low_visible_text_ratio + end + def test_reports_low_unique_token_ratio_for_repetitive_content issues = Crawlscope::IssueCollection.new page = page_with(main: ("hotel location service " * 100).strip) diff --git a/test/crawlscope/links_rule_test.rb b/test/crawlscope/links_rule_test.rb index 6873ca3..78119ac 100644 --- a/test/crawlscope/links_rule_test.rb +++ b/test/crawlscope/links_rule_test.rb @@ -242,6 +242,54 @@ def test_reports_indexable_internal_pages_missing_from_sitemap assert_equal "https://example.com/hidden", issue.url end + def test_does_not_report_noindex_internal_pages_missing_from_sitemap + issues = Crawlscope::IssueCollection.new + resolver = lambda do |target_url| + { + crawled: false, + doc: Nokogiri::HTML(""), + error: nil, + final_url: target_url, + headers: {}, + html: true, + status: 200 + } + end + + Crawlscope::Rules::Links.new.call( + urls: ["https://example.com/guide"], + pages: [page(url: "https://example.com/guide", body: "
Hidden
")], + issues: issues, + context: context(resolver: resolver) + ) + + refute_includes issues.to_a.map(&:code), :indexable_page_missing_from_sitemap + end + + def test_does_not_report_x_robots_noindex_internal_pages_missing_from_sitemap + issues = Crawlscope::IssueCollection.new + resolver = lambda do |target_url| + { + crawled: false, + doc: Nokogiri::HTML("
Hidden
"), + error: nil, + final_url: target_url, + headers: {"X-Robots-Tag" => "noindex"}, + html: true, + status: 200 + } + end + + Crawlscope::Rules::Links.new.call( + urls: ["https://example.com/guide"], + pages: [page(url: "https://example.com/guide", body: "
Hidden
")], + issues: issues, + context: context(resolver: resolver) + ) + + refute_includes issues.to_a.map(&:code), :indexable_page_missing_from_sitemap + end + def test_reports_url_hygiene_issues issues = Crawlscope::IssueCollection.new long_path = "a" * 2_050 From c5c5b82bf4b015abcee15cb612e0eba75cd09142 Mon Sep 17 00:00:00 2001 From: Paulo Fidalgo Date: Mon, 1 Jun 2026 11:34:04 +0100 Subject: [PATCH 2/2] fix: improve validation report readability Group validation issues by category and code so large reports are easier to scan while still printing every issue. Keep issue rows compact and document the grouped output shape. Verification: bundle exec ruby -Itest test/crawlscope/reporter_test.rb; bundle exec rake test; bundle exec rake standard --- README.md | 23 ++++++ lib/crawlscope/reporter.rb | 117 +++++++++++++++++++++++++++---- test/crawlscope/reporter_test.rb | 67 +++++++++++++++--- 3 files changed, 184 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index b53a20d..e172560 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,29 @@ crawlscope validate --url https://example.com --sitemap https://example.com/site Child sitemap indexes are supported automatically. +Validation output is grouped for terminal scanning: + +```text +Crawlscope validation +Base URL: https://example.com +Sitemap: https://example.com/sitemap.xml +URLs: 24 +Pages: 24 +Status: FAILED +Issues: 3 3 warnings + +Summary: + links 2 + metadata 1 + +links / low_dofollow_inlinks: 2 + - /pricing inbound 1/2 sources: / + - /features inbound 1/2 sources: / + +metadata / missing_title: 1 + - /draft missing +``` + ## Ruby Usage ```ruby diff --git a/lib/crawlscope/reporter.rb b/lib/crawlscope/reporter.rb index d159aa9..9425c6f 100644 --- a/lib/crawlscope/reporter.rb +++ b/lib/crawlscope/reporter.rb @@ -1,5 +1,7 @@ # frozen_string_literal: true +require "uri" + module Crawlscope class Reporter def initialize(io:) @@ -19,30 +21,121 @@ def report(result) end @io.puts("Status: FAILED") - @io.puts("Issues: #{result.issues.size}") + @io.puts("Issues: #{result.issues.size} #{severity_summary(result.issues)}") @io.puts("") - report_grouped_issues("Severity", result.issues.by_severity) + report_summary(result.issues) @io.puts("") - report_grouped_issues("Category", result.issues.by_category) + report_issue_groups(result.issues, base_url: result.base_url) end private - def report_grouped_issues(title, grouped_issues) - @io.puts("#{title}:") + def severity_summary(issues) + grouped = issues.by_severity + return "" if grouped.empty? + + grouped + .sort_by { |severity, severity_issues| [-severity_issues.size, severity.to_s] } + .map { |severity, severity_issues| "#{severity_issues.size} #{pluralize(severity, severity_issues.size)}" } + .join(", ") + end - grouped_issues.sort_by { |name, _issues| name.to_s }.each do |name, issues| - @io.puts("#{name}: #{issues.size}") - issues.each do |issue| - @io.puts(" - #{offense(issue)}") + def report_summary(issues) + @io.puts("Summary:") + + issues.by_category + .sort_by { |category, category_issues| [-category_issues.size, category.to_s] } + .each do |category, category_issues| + @io.puts(" #{category.to_s.ljust(16)} #{category_issues.size}") end + end + + def report_issue_groups(issues, base_url:) + grouped = issues.to_a.group_by { |issue| [issue.category, issue.code] } + + grouped + .sort_by { |(category, code), grouped_issues| [-grouped_issues.size, category.to_s, code.to_s] } + .each do |(category, code), grouped_issues| + @io.puts("#{category} / #{code}: #{grouped_issues.size}") + + grouped_issues.each do |issue| + @io.puts(" - #{compact_issue(issue, base_url: base_url)}") + end + + @io.puts("") + end + end + + def compact_issue(issue, base_url:) + parts = [] + parts << relative_url(issue.url, base_url: base_url) if issue.url + + detail = compact_detail(issue, base_url: base_url) + parts << detail unless detail.empty? + + parts.compact.join(" ") + end + + def compact_detail(issue, base_url:) + details = issue.details || {} + fragments = [] + + inbound = details[:dofollow_inbound_count] || details[:inbound_count] + fragments << "inbound #{inbound}/#{details[:minimum]}" if inbound && details[:minimum] + + if details[:ratio] && details[:threshold] + fragments << "ratio #{format_number(details[:ratio])}/#{format_number(details[:threshold])}" + end + + fragments << "count #{details[:count]}" if details[:count] + fragments << "length #{details[:length]}" if details[:length] + fragments << "status #{details[:status]}" if details[:status] + fragments << "final: #{relative_url(details[:final_url], base_url: base_url)}" if details[:final_url] + fragments << "sources: #{relative_urls(details[:source_urls], base_url: base_url).join(", ")}" if details[:source_urls]&.any? + fragments << "source: #{relative_url(details[:source_url], base_url: base_url)}" if details[:source_url] + fragments << "targets: #{relative_urls(details[:target_urls], base_url: base_url).join(", ")}" if details[:target_urls]&.any? + + return issue.message if fragments.empty? + + case issue.code + when :low_dofollow_inlinks, :low_inbound_anchor_links, :low_unique_token_ratio, :low_visible_text_ratio + fragments.join(" ") + else + ([issue.message] + fragments).join(" ") end end - def offense(issue) - parts = ["[#{issue.severity}]", issue.code, issue.url, issue.message] - parts.compact.join(" ") + def relative_urls(urls, base_url:) + Array(urls).map { |url| relative_url(url, base_url: base_url) } + end + + def relative_url(url, base_url:) + return url unless url && base_url + + uri = URI.parse(url) + base_uri = URI.parse(base_url) + + return url unless uri.host == base_uri.host && uri.scheme == base_uri.scheme && uri.port == base_uri.port + + relative = uri.path.to_s.empty? ? "/" : uri.path + relative += "?#{uri.query}" if uri.query + relative += "##{uri.fragment}" if uri.fragment + relative + rescue URI::InvalidURIError + url + end + + def format_number(value) + return format("%.2f", value) if value.is_a?(Float) + + value.to_s + end + + def pluralize(word, count) + return word.to_s if count == 1 + + "#{word}s" end end end diff --git a/test/crawlscope/reporter_test.rb b/test/crawlscope/reporter_test.rb index 2fcf75c..6a5c3d1 100644 --- a/test/crawlscope/reporter_test.rb +++ b/test/crawlscope/reporter_test.rb @@ -23,11 +23,25 @@ def test_reports_ok_result refute_includes output, "Status: FAILED" end - def test_reports_failed_result_with_grouped_counts_and_offenses + def test_reports_failed_result_with_grouped_one_line_issues io = StringIO.new issues = Crawlscope::IssueCollection.new + 4.times do |index| + issues.add( + code: :low_dofollow_inlinks, + severity: :warning, + category: :links, + url: "https://example.com/page-#{index + 1}", + message: "dofollow inbound links 1 below 2", + details: { + dofollow_inbound_count: 1, + minimum: 2, + source_urls: ["https://example.com/source-#{index + 1}"] + } + ) + end issues.add(code: :missing_title, severity: :warning, category: :metadata, url: "https://example.com/a", message: "missing <title>", details: {}) - issues.add(code: :broken_internal_link, severity: :notice, category: :links, url: "https://example.com/b", message: "broken internal link", details: {}) + result = Crawlscope::Result.new( base_url: "https://example.com", sitemap_path: "/tmp/sitemap.xml", @@ -41,14 +55,45 @@ def test_reports_failed_result_with_grouped_counts_and_offenses output = io.string assert_includes output, "Status: FAILED" - assert_includes output, "Issues: 2" - assert_includes output, "Severity:" - assert_includes output, "notice: 1" - assert_includes output, "warning: 1" - assert_includes output, "Category:" - assert_includes output, "links: 1" - assert_includes output, "metadata: 1" - assert_includes output, " - [warning] missing_title https://example.com/a missing <title>" - assert_includes output, " - [notice] broken_internal_link https://example.com/b broken internal link" + assert_includes output, "Issues: 5 5 warnings" + assert_includes output, "Summary:" + assert_includes output, "links / low_dofollow_inlinks: 4" + assert_includes output, " - /page-1 inbound 1/2 sources: /source-1" + assert_includes output, " - /page-4 inbound 1/2 sources: /source-4" + assert_includes output, "metadata / missing_title: 1" + refute_includes output, "Severity:" + refute_includes output, "Category:" + refute_includes output, "... 1 more" + end + + def test_reports_source_details_on_one_line + io = StringIO.new + issues = Crawlscope::IssueCollection.new + 4.times do |index| + issues.add( + code: :indexable_page_missing_from_sitemap, + severity: :warning, + category: :sitemaps, + url: "https://example.com/overview-#{index + 1}", + message: "indexable internal page is missing from sitemap", + details: {source_url: "https://example.com/source-#{index + 1}"} + ) + end + + result = Crawlscope::Result.new( + base_url: "https://example.com", + sitemap_path: "/tmp/sitemap.xml", + urls: ["https://example.com"], + pages: [Object.new], + issues: issues + ) + + Crawlscope::Reporter.new(io: io).report(result) + + output = io.string + + assert_includes output, "sitemaps / indexable_page_missing_from_sitemap: 4" + assert_includes output, " - /overview-1 indexable internal page is missing from sitemap source: /source-1" + assert_includes output, " - /overview-4 indexable internal page is missing from sitemap source: /source-4" end end