diff --git a/README.md b/README.md
index b53a20d..e172560 100644
--- a/README.md
+++ b/README.md
@@ -87,6 +87,29 @@ crawlscope validate --url https://example.com --sitemap https://example.com/site
Child sitemap indexes are supported automatically.
+Validation output is grouped for terminal scanning:
+
+```text
+Crawlscope validation
+Base URL: https://example.com
+Sitemap: https://example.com/sitemap.xml
+URLs: 24
+Pages: 24
+Status: FAILED
+Issues: 3 3 warnings
+
+Summary:
+ links 2
+ metadata 1
+
+links / low_dofollow_inlinks: 2
+ - /pricing inbound 1/2 sources: /
+ - /features inbound 1/2 sources: /
+
+metadata / missing_title: 1
+ - /draft missing
+```
+
## Ruby Usage
```ruby
diff --git a/lib/crawlscope/crawl.rb b/lib/crawlscope/crawl.rb
index c0b7374..ad3af48 100644
--- a/lib/crawlscope/crawl.rb
+++ b/lib/crawlscope/crawl.rb
@@ -126,8 +126,10 @@ def resolved_page(normalized_url)
def resolution(page, normalized_url, crawled:)
{
crawled: crawled,
+ doc: page.doc,
error: page.error,
final_url: page.normalized_final_url || normalized_url,
+ headers: page.headers,
html: page.html?,
status: page.status
}
diff --git a/lib/crawlscope/document_text.rb b/lib/crawlscope/document_text.rb
index dbbb3c3..ca4e6a0 100644
--- a/lib/crawlscope/document_text.rb
+++ b/lib/crawlscope/document_text.rb
@@ -3,6 +3,7 @@
module Crawlscope
module DocumentText
REMOVED_SELECTORS = "script, style, noscript, template, svg"
+ CONTENT_RATIO_REMOVED_SELECTORS = "#{REMOVED_SELECTORS}, form"
TOKEN_PATTERN = /[[:alnum:]]+/
module_function
@@ -15,6 +16,10 @@ def html_for(doc, selector: "main")
root_for(doc, selector: selector)&.to_html.to_s
end
+ def content_ratio_html_for(doc, selector: "main")
+ root_for(doc, selector: selector, removed_selectors: CONTENT_RATIO_REMOVED_SELECTORS)&.to_html.to_s
+ end
+
def text_for(doc, selector: "main")
normalize(root_for(doc, selector: selector)&.text)
end
@@ -27,11 +32,11 @@ def normalize(text)
text.to_s.gsub(/\s+/, " ").strip
end
- def root_for(doc, selector:)
+ def root_for(doc, selector:, removed_selectors: REMOVED_SELECTORS)
return unless doc
copy = doc.dup
- copy.css(REMOVED_SELECTORS).remove
+ copy.css(removed_selectors).remove
root = selector.to_s.empty? ? nil : copy.at_css(selector)
root || copy.at_css("body") || copy
diff --git a/lib/crawlscope/reporter.rb b/lib/crawlscope/reporter.rb
index d159aa9..9425c6f 100644
--- a/lib/crawlscope/reporter.rb
+++ b/lib/crawlscope/reporter.rb
@@ -1,5 +1,7 @@
# frozen_string_literal: true
+require "uri"
+
module Crawlscope
class Reporter
def initialize(io:)
@@ -19,30 +21,121 @@ def report(result)
end
@io.puts("Status: FAILED")
- @io.puts("Issues: #{result.issues.size}")
+ @io.puts("Issues: #{result.issues.size} #{severity_summary(result.issues)}")
@io.puts("")
- report_grouped_issues("Severity", result.issues.by_severity)
+ report_summary(result.issues)
@io.puts("")
- report_grouped_issues("Category", result.issues.by_category)
+ report_issue_groups(result.issues, base_url: result.base_url)
end
private
- def report_grouped_issues(title, grouped_issues)
- @io.puts("#{title}:")
+ def severity_summary(issues)
+ grouped = issues.by_severity
+ return "" if grouped.empty?
+
+ grouped
+ .sort_by { |severity, severity_issues| [-severity_issues.size, severity.to_s] }
+ .map { |severity, severity_issues| "#{severity_issues.size} #{pluralize(severity, severity_issues.size)}" }
+ .join(", ")
+ end
- grouped_issues.sort_by { |name, _issues| name.to_s }.each do |name, issues|
- @io.puts("#{name}: #{issues.size}")
- issues.each do |issue|
- @io.puts(" - #{offense(issue)}")
+ def report_summary(issues)
+ @io.puts("Summary:")
+
+ issues.by_category
+ .sort_by { |category, category_issues| [-category_issues.size, category.to_s] }
+ .each do |category, category_issues|
+ @io.puts(" #{category.to_s.ljust(16)} #{category_issues.size}")
end
+ end
+
+ def report_issue_groups(issues, base_url:)
+ grouped = issues.to_a.group_by { |issue| [issue.category, issue.code] }
+
+ grouped
+ .sort_by { |(category, code), grouped_issues| [-grouped_issues.size, category.to_s, code.to_s] }
+ .each do |(category, code), grouped_issues|
+ @io.puts("#{category} / #{code}: #{grouped_issues.size}")
+
+ grouped_issues.each do |issue|
+ @io.puts(" - #{compact_issue(issue, base_url: base_url)}")
+ end
+
+ @io.puts("")
+ end
+ end
+
+ def compact_issue(issue, base_url:)
+ parts = []
+ parts << relative_url(issue.url, base_url: base_url) if issue.url
+
+ detail = compact_detail(issue, base_url: base_url)
+ parts << detail unless detail.empty?
+
+ parts.compact.join(" ")
+ end
+
+ def compact_detail(issue, base_url:)
+ details = issue.details || {}
+ fragments = []
+
+ inbound = details[:dofollow_inbound_count] || details[:inbound_count]
+ fragments << "inbound #{inbound}/#{details[:minimum]}" if inbound && details[:minimum]
+
+ if details[:ratio] && details[:threshold]
+ fragments << "ratio #{format_number(details[:ratio])}/#{format_number(details[:threshold])}"
+ end
+
+ fragments << "count #{details[:count]}" if details[:count]
+ fragments << "length #{details[:length]}" if details[:length]
+ fragments << "status #{details[:status]}" if details[:status]
+ fragments << "final: #{relative_url(details[:final_url], base_url: base_url)}" if details[:final_url]
+ fragments << "sources: #{relative_urls(details[:source_urls], base_url: base_url).join(", ")}" if details[:source_urls]&.any?
+ fragments << "source: #{relative_url(details[:source_url], base_url: base_url)}" if details[:source_url]
+ fragments << "targets: #{relative_urls(details[:target_urls], base_url: base_url).join(", ")}" if details[:target_urls]&.any?
+
+ return issue.message if fragments.empty?
+
+ case issue.code
+ when :low_dofollow_inlinks, :low_inbound_anchor_links, :low_unique_token_ratio, :low_visible_text_ratio
+ fragments.join(" ")
+ else
+ ([issue.message] + fragments).join(" ")
end
end
- def offense(issue)
- parts = ["[#{issue.severity}]", issue.code, issue.url, issue.message]
- parts.compact.join(" ")
+ def relative_urls(urls, base_url:)
+ Array(urls).map { |url| relative_url(url, base_url: base_url) }
+ end
+
+ def relative_url(url, base_url:)
+ return url unless url && base_url
+
+ uri = URI.parse(url)
+ base_uri = URI.parse(base_url)
+
+ return url unless uri.host == base_uri.host && uri.scheme == base_uri.scheme && uri.port == base_uri.port
+
+ relative = uri.path.to_s.empty? ? "/" : uri.path
+ relative += "?#{uri.query}" if uri.query
+ relative += "##{uri.fragment}" if uri.fragment
+ relative
+ rescue URI::InvalidURIError
+ url
+ end
+
+ def format_number(value)
+ return format("%.2f", value) if value.is_a?(Float)
+
+ value.to_s
+ end
+
+ def pluralize(word, count)
+ return word.to_s if count == 1
+
+ "#{word}s"
end
end
end
diff --git a/lib/crawlscope/rules/content_quality.rb b/lib/crawlscope/rules/content_quality.rb
index 134078e..3fa76b9 100644
--- a/lib/crawlscope/rules/content_quality.rb
+++ b/lib/crawlscope/rules/content_quality.rb
@@ -55,7 +55,7 @@ def validate_unique_token_ratio(page, issues)
end
def validate_visible_text_ratio(page, issues)
- html_bytes = DocumentText.html_for(page.doc).bytesize
+ html_bytes = DocumentText.content_ratio_html_for(page.doc).bytesize
return if html_bytes.zero?
visible_text = DocumentText.text_for(page.doc)
diff --git a/lib/crawlscope/rules/indexability.rb b/lib/crawlscope/rules/indexability.rb
index 06aae36..00c446e 100644
--- a/lib/crawlscope/rules/indexability.rb
+++ b/lib/crawlscope/rules/indexability.rb
@@ -6,6 +6,31 @@ class Indexability
ROBOTS_META_SELECTOR = 'meta[name="robots"], meta[name="googlebot"]'
X_ROBOTS_TAG_HEADER = "x-robots-tag"
+ def self.noindex_header?(headers)
+ noindex?(header_value(headers, X_ROBOTS_TAG_HEADER))
+ end
+
+ def self.noindex_meta?(doc)
+ return false unless doc
+
+ doc.css(ROBOTS_META_SELECTOR).any? { |tag| noindex?(tag["content"].to_s) }
+ end
+
+ def self.header_value(headers, name)
+ headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s
+ end
+
+ def self.directives(value)
+ value
+ .split(",")
+ .map { |directive| directive.split(":", 2).last.to_s.strip }
+ .reject(&:empty?)
+ end
+
+ def self.noindex?(value)
+ directives(value).any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") }
+ end
+
attr_reader :code
def initialize
@@ -28,18 +53,15 @@ def normalized_sitemap_urls(urls)
end
def header_value(page, name)
- page.headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s
+ self.class.header_value(page.headers, name)
end
def directives(value)
- value
- .split(",")
- .map { |directive| directive.split(":", 2).last.to_s.strip }
- .reject(&:empty?)
+ self.class.directives(value)
end
def noindex?(value)
- directives(value).any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") }
+ self.class.noindex?(value)
end
def follow?(value)
diff --git a/lib/crawlscope/rules/links.rb b/lib/crawlscope/rules/links.rb
index 5cf4b27..86cbe75 100644
--- a/lib/crawlscope/rules/links.rb
+++ b/lib/crawlscope/rules/links.rb
@@ -243,6 +243,11 @@ def html?
resolution && resolution[:html]
end
+ def noindex?
+ Crawlscope::Rules::Indexability.noindex_header?(resolution[:headers] || {}) ||
+ Crawlscope::Rules::Indexability.noindex_meta?(resolution[:doc])
+ end
+
def status
resolution && resolution[:status]
end
@@ -421,6 +426,7 @@ def validate_indexable_pages_missing_from_sitemap(urls, resolved_links, issues)
target = resolve_target(final_url)
next unless target.allowed?(@allowed_statuses) && target.html?
+ next if target.noindex?
reported_urls << final_url
diff --git a/test/crawlscope/content_quality_rule_test.rb b/test/crawlscope/content_quality_rule_test.rb
index e6525d2..520aed2 100644
--- a/test/crawlscope/content_quality_rule_test.rb
+++ b/test/crawlscope/content_quality_rule_test.rb
@@ -27,6 +27,24 @@ def test_visible_text_ratio_ignores_markup_outside_main_content
refute_includes issues.to_a.map(&:code), :low_visible_text_ratio
end
+ def test_visible_text_ratio_ignores_form_payload_markup
+ issues = Crawlscope::IssueCollection.new
+ page = page_with(
+ main: <<~HTML
+ #{Array.new(260) { |index| "word#{index}" }.join(" ")}
+
+ HTML
+ )
+
+ Crawlscope::Rules::ContentQuality.new.call(urls: [page.url], pages: [page], issues: issues)
+
+ refute_includes issues.to_a.map(&:code), :low_visible_text_ratio
+ end
+
def test_reports_low_unique_token_ratio_for_repetitive_content
issues = Crawlscope::IssueCollection.new
page = page_with(main: ("hotel location service " * 100).strip)
diff --git a/test/crawlscope/links_rule_test.rb b/test/crawlscope/links_rule_test.rb
index 6873ca3..78119ac 100644
--- a/test/crawlscope/links_rule_test.rb
+++ b/test/crawlscope/links_rule_test.rb
@@ -242,6 +242,54 @@ def test_reports_indexable_internal_pages_missing_from_sitemap
assert_equal "https://example.com/hidden", issue.url
end
+ def test_does_not_report_noindex_internal_pages_missing_from_sitemap
+ issues = Crawlscope::IssueCollection.new
+ resolver = lambda do |target_url|
+ {
+ crawled: false,
+ doc: Nokogiri::HTML(""),
+ error: nil,
+ final_url: target_url,
+ headers: {},
+ html: true,
+ status: 200
+ }
+ end
+
+ Crawlscope::Rules::Links.new.call(
+ urls: ["https://example.com/guide"],
+ pages: [page(url: "https://example.com/guide", body: "Hidden")],
+ issues: issues,
+ context: context(resolver: resolver)
+ )
+
+ refute_includes issues.to_a.map(&:code), :indexable_page_missing_from_sitemap
+ end
+
+ def test_does_not_report_x_robots_noindex_internal_pages_missing_from_sitemap
+ issues = Crawlscope::IssueCollection.new
+ resolver = lambda do |target_url|
+ {
+ crawled: false,
+ doc: Nokogiri::HTML("Hidden"),
+ error: nil,
+ final_url: target_url,
+ headers: {"X-Robots-Tag" => "noindex"},
+ html: true,
+ status: 200
+ }
+ end
+
+ Crawlscope::Rules::Links.new.call(
+ urls: ["https://example.com/guide"],
+ pages: [page(url: "https://example.com/guide", body: "Hidden")],
+ issues: issues,
+ context: context(resolver: resolver)
+ )
+
+ refute_includes issues.to_a.map(&:code), :indexable_page_missing_from_sitemap
+ end
+
def test_reports_url_hygiene_issues
issues = Crawlscope::IssueCollection.new
long_path = "a" * 2_050
diff --git a/test/crawlscope/reporter_test.rb b/test/crawlscope/reporter_test.rb
index 2fcf75c..6a5c3d1 100644
--- a/test/crawlscope/reporter_test.rb
+++ b/test/crawlscope/reporter_test.rb
@@ -23,11 +23,25 @@ def test_reports_ok_result
refute_includes output, "Status: FAILED"
end
- def test_reports_failed_result_with_grouped_counts_and_offenses
+ def test_reports_failed_result_with_grouped_one_line_issues
io = StringIO.new
issues = Crawlscope::IssueCollection.new
+ 4.times do |index|
+ issues.add(
+ code: :low_dofollow_inlinks,
+ severity: :warning,
+ category: :links,
+ url: "https://example.com/page-#{index + 1}",
+ message: "dofollow inbound links 1 below 2",
+ details: {
+ dofollow_inbound_count: 1,
+ minimum: 2,
+ source_urls: ["https://example.com/source-#{index + 1}"]
+ }
+ )
+ end
issues.add(code: :missing_title, severity: :warning, category: :metadata, url: "https://example.com/a", message: "missing ", details: {})
- issues.add(code: :broken_internal_link, severity: :notice, category: :links, url: "https://example.com/b", message: "broken internal link", details: {})
+
result = Crawlscope::Result.new(
base_url: "https://example.com",
sitemap_path: "/tmp/sitemap.xml",
@@ -41,14 +55,45 @@ def test_reports_failed_result_with_grouped_counts_and_offenses
output = io.string
assert_includes output, "Status: FAILED"
- assert_includes output, "Issues: 2"
- assert_includes output, "Severity:"
- assert_includes output, "notice: 1"
- assert_includes output, "warning: 1"
- assert_includes output, "Category:"
- assert_includes output, "links: 1"
- assert_includes output, "metadata: 1"
- assert_includes output, " - [warning] missing_title https://example.com/a missing "
- assert_includes output, " - [notice] broken_internal_link https://example.com/b broken internal link"
+ assert_includes output, "Issues: 5 5 warnings"
+ assert_includes output, "Summary:"
+ assert_includes output, "links / low_dofollow_inlinks: 4"
+ assert_includes output, " - /page-1 inbound 1/2 sources: /source-1"
+ assert_includes output, " - /page-4 inbound 1/2 sources: /source-4"
+ assert_includes output, "metadata / missing_title: 1"
+ refute_includes output, "Severity:"
+ refute_includes output, "Category:"
+ refute_includes output, "... 1 more"
+ end
+
+ def test_reports_source_details_on_one_line
+ io = StringIO.new
+ issues = Crawlscope::IssueCollection.new
+ 4.times do |index|
+ issues.add(
+ code: :indexable_page_missing_from_sitemap,
+ severity: :warning,
+ category: :sitemaps,
+ url: "https://example.com/overview-#{index + 1}",
+ message: "indexable internal page is missing from sitemap",
+ details: {source_url: "https://example.com/source-#{index + 1}"}
+ )
+ end
+
+ result = Crawlscope::Result.new(
+ base_url: "https://example.com",
+ sitemap_path: "/tmp/sitemap.xml",
+ urls: ["https://example.com"],
+ pages: [Object.new],
+ issues: issues
+ )
+
+ Crawlscope::Reporter.new(io: io).report(result)
+
+ output = io.string
+
+ assert_includes output, "sitemaps / indexable_page_missing_from_sitemap: 4"
+ assert_includes output, " - /overview-1 indexable internal page is missing from sitemap source: /source-1"
+ assert_includes output, " - /overview-4 indexable internal page is missing from sitemap source: /source-4"
end
end