diff --git a/README.md b/README.md index b53a20d..e172560 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,29 @@ crawlscope validate --url https://example.com --sitemap https://example.com/site Child sitemap indexes are supported automatically. +Validation output is grouped for terminal scanning: + +```text +Crawlscope validation +Base URL: https://example.com +Sitemap: https://example.com/sitemap.xml +URLs: 24 +Pages: 24 +Status: FAILED +Issues: 3 3 warnings + +Summary: + links 2 + metadata 1 + +links / low_dofollow_inlinks: 2 + - /pricing inbound 1/2 sources: / + - /features inbound 1/2 sources: / + +metadata / missing_title: 1 + - /draft missing +``` + ## Ruby Usage ```ruby diff --git a/lib/crawlscope/crawl.rb b/lib/crawlscope/crawl.rb index c0b7374..ad3af48 100644 --- a/lib/crawlscope/crawl.rb +++ b/lib/crawlscope/crawl.rb @@ -126,8 +126,10 @@ def resolved_page(normalized_url) def resolution(page, normalized_url, crawled:) { crawled: crawled, + doc: page.doc, error: page.error, final_url: page.normalized_final_url || normalized_url, + headers: page.headers, html: page.html?, status: page.status } diff --git a/lib/crawlscope/document_text.rb b/lib/crawlscope/document_text.rb index dbbb3c3..ca4e6a0 100644 --- a/lib/crawlscope/document_text.rb +++ b/lib/crawlscope/document_text.rb @@ -3,6 +3,7 @@ module Crawlscope module DocumentText REMOVED_SELECTORS = "script, style, noscript, template, svg" + CONTENT_RATIO_REMOVED_SELECTORS = "#{REMOVED_SELECTORS}, form" TOKEN_PATTERN = /[[:alnum:]]+/ module_function @@ -15,6 +16,10 @@ def html_for(doc, selector: "main") root_for(doc, selector: selector)&.to_html.to_s end + def content_ratio_html_for(doc, selector: "main") + root_for(doc, selector: selector, removed_selectors: CONTENT_RATIO_REMOVED_SELECTORS)&.to_html.to_s + end + def text_for(doc, selector: "main") normalize(root_for(doc, selector: selector)&.text) end @@ -27,11 +32,11 @@ def normalize(text) text.to_s.gsub(/\s+/, " ").strip end - def root_for(doc, selector:) + def root_for(doc, selector:, removed_selectors: REMOVED_SELECTORS) return unless doc copy = doc.dup - copy.css(REMOVED_SELECTORS).remove + copy.css(removed_selectors).remove root = selector.to_s.empty? ? nil : copy.at_css(selector) root || copy.at_css("body") || copy diff --git a/lib/crawlscope/reporter.rb b/lib/crawlscope/reporter.rb index d159aa9..9425c6f 100644 --- a/lib/crawlscope/reporter.rb +++ b/lib/crawlscope/reporter.rb @@ -1,5 +1,7 @@ # frozen_string_literal: true +require "uri" + module Crawlscope class Reporter def initialize(io:) @@ -19,30 +21,121 @@ def report(result) end @io.puts("Status: FAILED") - @io.puts("Issues: #{result.issues.size}") + @io.puts("Issues: #{result.issues.size} #{severity_summary(result.issues)}") @io.puts("") - report_grouped_issues("Severity", result.issues.by_severity) + report_summary(result.issues) @io.puts("") - report_grouped_issues("Category", result.issues.by_category) + report_issue_groups(result.issues, base_url: result.base_url) end private - def report_grouped_issues(title, grouped_issues) - @io.puts("#{title}:") + def severity_summary(issues) + grouped = issues.by_severity + return "" if grouped.empty? + + grouped + .sort_by { |severity, severity_issues| [-severity_issues.size, severity.to_s] } + .map { |severity, severity_issues| "#{severity_issues.size} #{pluralize(severity, severity_issues.size)}" } + .join(", ") + end - grouped_issues.sort_by { |name, _issues| name.to_s }.each do |name, issues| - @io.puts("#{name}: #{issues.size}") - issues.each do |issue| - @io.puts(" - #{offense(issue)}") + def report_summary(issues) + @io.puts("Summary:") + + issues.by_category + .sort_by { |category, category_issues| [-category_issues.size, category.to_s] } + .each do |category, category_issues| + @io.puts(" #{category.to_s.ljust(16)} #{category_issues.size}") end + end + + def report_issue_groups(issues, base_url:) + grouped = issues.to_a.group_by { |issue| [issue.category, issue.code] } + + grouped + .sort_by { |(category, code), grouped_issues| [-grouped_issues.size, category.to_s, code.to_s] } + .each do |(category, code), grouped_issues| + @io.puts("#{category} / #{code}: #{grouped_issues.size}") + + grouped_issues.each do |issue| + @io.puts(" - #{compact_issue(issue, base_url: base_url)}") + end + + @io.puts("") + end + end + + def compact_issue(issue, base_url:) + parts = [] + parts << relative_url(issue.url, base_url: base_url) if issue.url + + detail = compact_detail(issue, base_url: base_url) + parts << detail unless detail.empty? + + parts.compact.join(" ") + end + + def compact_detail(issue, base_url:) + details = issue.details || {} + fragments = [] + + inbound = details[:dofollow_inbound_count] || details[:inbound_count] + fragments << "inbound #{inbound}/#{details[:minimum]}" if inbound && details[:minimum] + + if details[:ratio] && details[:threshold] + fragments << "ratio #{format_number(details[:ratio])}/#{format_number(details[:threshold])}" + end + + fragments << "count #{details[:count]}" if details[:count] + fragments << "length #{details[:length]}" if details[:length] + fragments << "status #{details[:status]}" if details[:status] + fragments << "final: #{relative_url(details[:final_url], base_url: base_url)}" if details[:final_url] + fragments << "sources: #{relative_urls(details[:source_urls], base_url: base_url).join(", ")}" if details[:source_urls]&.any? + fragments << "source: #{relative_url(details[:source_url], base_url: base_url)}" if details[:source_url] + fragments << "targets: #{relative_urls(details[:target_urls], base_url: base_url).join(", ")}" if details[:target_urls]&.any? + + return issue.message if fragments.empty? + + case issue.code + when :low_dofollow_inlinks, :low_inbound_anchor_links, :low_unique_token_ratio, :low_visible_text_ratio + fragments.join(" ") + else + ([issue.message] + fragments).join(" ") end end - def offense(issue) - parts = ["[#{issue.severity}]", issue.code, issue.url, issue.message] - parts.compact.join(" ") + def relative_urls(urls, base_url:) + Array(urls).map { |url| relative_url(url, base_url: base_url) } + end + + def relative_url(url, base_url:) + return url unless url && base_url + + uri = URI.parse(url) + base_uri = URI.parse(base_url) + + return url unless uri.host == base_uri.host && uri.scheme == base_uri.scheme && uri.port == base_uri.port + + relative = uri.path.to_s.empty? ? "/" : uri.path + relative += "?#{uri.query}" if uri.query + relative += "##{uri.fragment}" if uri.fragment + relative + rescue URI::InvalidURIError + url + end + + def format_number(value) + return format("%.2f", value) if value.is_a?(Float) + + value.to_s + end + + def pluralize(word, count) + return word.to_s if count == 1 + + "#{word}s" end end end diff --git a/lib/crawlscope/rules/content_quality.rb b/lib/crawlscope/rules/content_quality.rb index 134078e..3fa76b9 100644 --- a/lib/crawlscope/rules/content_quality.rb +++ b/lib/crawlscope/rules/content_quality.rb @@ -55,7 +55,7 @@ def validate_unique_token_ratio(page, issues) end def validate_visible_text_ratio(page, issues) - html_bytes = DocumentText.html_for(page.doc).bytesize + html_bytes = DocumentText.content_ratio_html_for(page.doc).bytesize return if html_bytes.zero? visible_text = DocumentText.text_for(page.doc) diff --git a/lib/crawlscope/rules/indexability.rb b/lib/crawlscope/rules/indexability.rb index 06aae36..00c446e 100644 --- a/lib/crawlscope/rules/indexability.rb +++ b/lib/crawlscope/rules/indexability.rb @@ -6,6 +6,31 @@ class Indexability ROBOTS_META_SELECTOR = 'meta[name="robots"], meta[name="googlebot"]' X_ROBOTS_TAG_HEADER = "x-robots-tag" + def self.noindex_header?(headers) + noindex?(header_value(headers, X_ROBOTS_TAG_HEADER)) + end + + def self.noindex_meta?(doc) + return false unless doc + + doc.css(ROBOTS_META_SELECTOR).any? { |tag| noindex?(tag["content"].to_s) } + end + + def self.header_value(headers, name) + headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s + end + + def self.directives(value) + value + .split(",") + .map { |directive| directive.split(":", 2).last.to_s.strip } + .reject(&:empty?) + end + + def self.noindex?(value) + directives(value).any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") } + end + attr_reader :code def initialize @@ -28,18 +53,15 @@ def normalized_sitemap_urls(urls) end def header_value(page, name) - page.headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s + self.class.header_value(page.headers, name) end def directives(value) - value - .split(",") - .map { |directive| directive.split(":", 2).last.to_s.strip } - .reject(&:empty?) + self.class.directives(value) end def noindex?(value) - directives(value).any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") } + self.class.noindex?(value) end def follow?(value) diff --git a/lib/crawlscope/rules/links.rb b/lib/crawlscope/rules/links.rb index 5cf4b27..86cbe75 100644 --- a/lib/crawlscope/rules/links.rb +++ b/lib/crawlscope/rules/links.rb @@ -243,6 +243,11 @@ def html? resolution && resolution[:html] end + def noindex? + Crawlscope::Rules::Indexability.noindex_header?(resolution[:headers] || {}) || + Crawlscope::Rules::Indexability.noindex_meta?(resolution[:doc]) + end + def status resolution && resolution[:status] end @@ -421,6 +426,7 @@ def validate_indexable_pages_missing_from_sitemap(urls, resolved_links, issues) target = resolve_target(final_url) next unless target.allowed?(@allowed_statuses) && target.html? + next if target.noindex? reported_urls << final_url diff --git a/test/crawlscope/content_quality_rule_test.rb b/test/crawlscope/content_quality_rule_test.rb index e6525d2..520aed2 100644 --- a/test/crawlscope/content_quality_rule_test.rb +++ b/test/crawlscope/content_quality_rule_test.rb @@ -27,6 +27,24 @@ def test_visible_text_ratio_ignores_markup_outside_main_content refute_includes issues.to_a.map(&:code), :low_visible_text_ratio end + def test_visible_text_ratio_ignores_form_payload_markup + issues = Crawlscope::IssueCollection.new + page = page_with( + main: <<~HTML + <p>#{Array.new(260) { |index| "word#{index}" }.join(" ")}</p> + <form> + <div data-select-autocomplete-options-value="#{"x" * 50_000}"> + <input type="text" name="country"> + </div> + </form> + HTML + ) + + Crawlscope::Rules::ContentQuality.new.call(urls: [page.url], pages: [page], issues: issues) + + refute_includes issues.to_a.map(&:code), :low_visible_text_ratio + end + def test_reports_low_unique_token_ratio_for_repetitive_content issues = Crawlscope::IssueCollection.new page = page_with(main: ("hotel location service " * 100).strip) diff --git a/test/crawlscope/links_rule_test.rb b/test/crawlscope/links_rule_test.rb index 6873ca3..78119ac 100644 --- a/test/crawlscope/links_rule_test.rb +++ b/test/crawlscope/links_rule_test.rb @@ -242,6 +242,54 @@ def test_reports_indexable_internal_pages_missing_from_sitemap assert_equal "https://example.com/hidden", issue.url end + def test_does_not_report_noindex_internal_pages_missing_from_sitemap + issues = Crawlscope::IssueCollection.new + resolver = lambda do |target_url| + { + crawled: false, + doc: Nokogiri::HTML("<head><meta name=\"robots\" content=\"noindex, follow\"></head>"), + error: nil, + final_url: target_url, + headers: {}, + html: true, + status: 200 + } + end + + Crawlscope::Rules::Links.new.call( + urls: ["https://example.com/guide"], + pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/hidden\">Hidden</a></main>")], + issues: issues, + context: context(resolver: resolver) + ) + + refute_includes issues.to_a.map(&:code), :indexable_page_missing_from_sitemap + end + + def test_does_not_report_x_robots_noindex_internal_pages_missing_from_sitemap + issues = Crawlscope::IssueCollection.new + resolver = lambda do |target_url| + { + crawled: false, + doc: Nokogiri::HTML("<main>Hidden</main>"), + error: nil, + final_url: target_url, + headers: {"X-Robots-Tag" => "noindex"}, + html: true, + status: 200 + } + end + + Crawlscope::Rules::Links.new.call( + urls: ["https://example.com/guide"], + pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/hidden\">Hidden</a></main>")], + issues: issues, + context: context(resolver: resolver) + ) + + refute_includes issues.to_a.map(&:code), :indexable_page_missing_from_sitemap + end + def test_reports_url_hygiene_issues issues = Crawlscope::IssueCollection.new long_path = "a" * 2_050 diff --git a/test/crawlscope/reporter_test.rb b/test/crawlscope/reporter_test.rb index 2fcf75c..6a5c3d1 100644 --- a/test/crawlscope/reporter_test.rb +++ b/test/crawlscope/reporter_test.rb @@ -23,11 +23,25 @@ def test_reports_ok_result refute_includes output, "Status: FAILED" end - def test_reports_failed_result_with_grouped_counts_and_offenses + def test_reports_failed_result_with_grouped_one_line_issues io = StringIO.new issues = Crawlscope::IssueCollection.new + 4.times do |index| + issues.add( + code: :low_dofollow_inlinks, + severity: :warning, + category: :links, + url: "https://example.com/page-#{index + 1}", + message: "dofollow inbound links 1 below 2", + details: { + dofollow_inbound_count: 1, + minimum: 2, + source_urls: ["https://example.com/source-#{index + 1}"] + } + ) + end issues.add(code: :missing_title, severity: :warning, category: :metadata, url: "https://example.com/a", message: "missing <title>", details: {}) - issues.add(code: :broken_internal_link, severity: :notice, category: :links, url: "https://example.com/b", message: "broken internal link", details: {}) + result = Crawlscope::Result.new( base_url: "https://example.com", sitemap_path: "/tmp/sitemap.xml", @@ -41,14 +55,45 @@ def test_reports_failed_result_with_grouped_counts_and_offenses output = io.string assert_includes output, "Status: FAILED" - assert_includes output, "Issues: 2" - assert_includes output, "Severity:" - assert_includes output, "notice: 1" - assert_includes output, "warning: 1" - assert_includes output, "Category:" - assert_includes output, "links: 1" - assert_includes output, "metadata: 1" - assert_includes output, " - [warning] missing_title https://example.com/a missing <title>" - assert_includes output, " - [notice] broken_internal_link https://example.com/b broken internal link" + assert_includes output, "Issues: 5 5 warnings" + assert_includes output, "Summary:" + assert_includes output, "links / low_dofollow_inlinks: 4" + assert_includes output, " - /page-1 inbound 1/2 sources: /source-1" + assert_includes output, " - /page-4 inbound 1/2 sources: /source-4" + assert_includes output, "metadata / missing_title: 1" + refute_includes output, "Severity:" + refute_includes output, "Category:" + refute_includes output, "... 1 more" + end + + def test_reports_source_details_on_one_line + io = StringIO.new + issues = Crawlscope::IssueCollection.new + 4.times do |index| + issues.add( + code: :indexable_page_missing_from_sitemap, + severity: :warning, + category: :sitemaps, + url: "https://example.com/overview-#{index + 1}", + message: "indexable internal page is missing from sitemap", + details: {source_url: "https://example.com/source-#{index + 1}"} + ) + end + + result = Crawlscope::Result.new( + base_url: "https://example.com", + sitemap_path: "/tmp/sitemap.xml", + urls: ["https://example.com"], + pages: [Object.new], + issues: issues + ) + + Crawlscope::Reporter.new(io: io).report(result) + + output = io.string + + assert_includes output, "sitemaps / indexable_page_missing_from_sitemap: 4" + assert_includes output, " - /overview-1 indexable internal page is missing from sitemap source: /source-1" + assert_includes output, " - /overview-4 indexable internal page is missing from sitemap source: /source-4" end end