Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,29 @@ crawlscope validate --url https://example.com --sitemap https://example.com/site

Child sitemap indexes are supported automatically.

Validation output is grouped for terminal scanning:

```text
Crawlscope validation
Base URL: https://example.com
Sitemap: https://example.com/sitemap.xml
URLs: 24
Pages: 24
Status: FAILED
Issues: 3 3 warnings

Summary:
links 2
metadata 1

links / low_dofollow_inlinks: 2
- /pricing inbound 1/2 sources: /
- /features inbound 1/2 sources: /

metadata / missing_title: 1
- /draft missing <title>
```

## Ruby Usage

```ruby
Expand Down
2 changes: 2 additions & 0 deletions lib/crawlscope/crawl.rb
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,10 @@ def resolved_page(normalized_url)
def resolution(page, normalized_url, crawled:)
{
crawled: crawled,
doc: page.doc,
error: page.error,
final_url: page.normalized_final_url || normalized_url,
headers: page.headers,
html: page.html?,
status: page.status
}
Expand Down
9 changes: 7 additions & 2 deletions lib/crawlscope/document_text.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
module Crawlscope
module DocumentText
REMOVED_SELECTORS = "script, style, noscript, template, svg"
CONTENT_RATIO_REMOVED_SELECTORS = "#{REMOVED_SELECTORS}, form"
TOKEN_PATTERN = /[[:alnum:]]+/

module_function
Expand All @@ -15,6 +16,10 @@ def html_for(doc, selector: "main")
root_for(doc, selector: selector)&.to_html.to_s
end

def content_ratio_html_for(doc, selector: "main")
root_for(doc, selector: selector, removed_selectors: CONTENT_RATIO_REMOVED_SELECTORS)&.to_html.to_s
end

def text_for(doc, selector: "main")
normalize(root_for(doc, selector: selector)&.text)
end
Expand All @@ -27,11 +32,11 @@ def normalize(text)
text.to_s.gsub(/\s+/, " ").strip
end

def root_for(doc, selector:)
def root_for(doc, selector:, removed_selectors: REMOVED_SELECTORS)
return unless doc

copy = doc.dup
copy.css(REMOVED_SELECTORS).remove
copy.css(removed_selectors).remove

root = selector.to_s.empty? ? nil : copy.at_css(selector)
root || copy.at_css("body") || copy
Expand Down
117 changes: 105 additions & 12 deletions lib/crawlscope/reporter.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# frozen_string_literal: true

require "uri"

module Crawlscope
class Reporter
def initialize(io:)
Expand All @@ -19,30 +21,121 @@ def report(result)
end

@io.puts("Status: FAILED")
@io.puts("Issues: #{result.issues.size}")
@io.puts("Issues: #{result.issues.size} #{severity_summary(result.issues)}")
@io.puts("")

report_grouped_issues("Severity", result.issues.by_severity)
report_summary(result.issues)
@io.puts("")
report_grouped_issues("Category", result.issues.by_category)
report_issue_groups(result.issues, base_url: result.base_url)
end

private

def report_grouped_issues(title, grouped_issues)
@io.puts("#{title}:")
def severity_summary(issues)
grouped = issues.by_severity
return "" if grouped.empty?

grouped
.sort_by { |severity, severity_issues| [-severity_issues.size, severity.to_s] }
.map { |severity, severity_issues| "#{severity_issues.size} #{pluralize(severity, severity_issues.size)}" }
.join(", ")
end

grouped_issues.sort_by { |name, _issues| name.to_s }.each do |name, issues|
@io.puts("#{name}: #{issues.size}")
issues.each do |issue|
@io.puts(" - #{offense(issue)}")
def report_summary(issues)
@io.puts("Summary:")

issues.by_category
.sort_by { |category, category_issues| [-category_issues.size, category.to_s] }
.each do |category, category_issues|
@io.puts(" #{category.to_s.ljust(16)} #{category_issues.size}")
end
end

def report_issue_groups(issues, base_url:)
grouped = issues.to_a.group_by { |issue| [issue.category, issue.code] }

grouped
.sort_by { |(category, code), grouped_issues| [-grouped_issues.size, category.to_s, code.to_s] }
.each do |(category, code), grouped_issues|
@io.puts("#{category} / #{code}: #{grouped_issues.size}")

grouped_issues.each do |issue|
@io.puts(" - #{compact_issue(issue, base_url: base_url)}")
end

@io.puts("")
end
end

def compact_issue(issue, base_url:)
parts = []
parts << relative_url(issue.url, base_url: base_url) if issue.url

detail = compact_detail(issue, base_url: base_url)
parts << detail unless detail.empty?

parts.compact.join(" ")
end

def compact_detail(issue, base_url:)
details = issue.details || {}
fragments = []

inbound = details[:dofollow_inbound_count] || details[:inbound_count]
fragments << "inbound #{inbound}/#{details[:minimum]}" if inbound && details[:minimum]

if details[:ratio] && details[:threshold]
fragments << "ratio #{format_number(details[:ratio])}/#{format_number(details[:threshold])}"
end

fragments << "count #{details[:count]}" if details[:count]
fragments << "length #{details[:length]}" if details[:length]
fragments << "status #{details[:status]}" if details[:status]
fragments << "final: #{relative_url(details[:final_url], base_url: base_url)}" if details[:final_url]
fragments << "sources: #{relative_urls(details[:source_urls], base_url: base_url).join(", ")}" if details[:source_urls]&.any?
fragments << "source: #{relative_url(details[:source_url], base_url: base_url)}" if details[:source_url]
fragments << "targets: #{relative_urls(details[:target_urls], base_url: base_url).join(", ")}" if details[:target_urls]&.any?

return issue.message if fragments.empty?

case issue.code
when :low_dofollow_inlinks, :low_inbound_anchor_links, :low_unique_token_ratio, :low_visible_text_ratio
fragments.join(" ")
else
([issue.message] + fragments).join(" ")
end
end

def offense(issue)
parts = ["[#{issue.severity}]", issue.code, issue.url, issue.message]
parts.compact.join(" ")
def relative_urls(urls, base_url:)
Array(urls).map { |url| relative_url(url, base_url: base_url) }
end

def relative_url(url, base_url:)
return url unless url && base_url

uri = URI.parse(url)
base_uri = URI.parse(base_url)

return url unless uri.host == base_uri.host && uri.scheme == base_uri.scheme && uri.port == base_uri.port

relative = uri.path.to_s.empty? ? "/" : uri.path
relative += "?#{uri.query}" if uri.query
relative += "##{uri.fragment}" if uri.fragment
relative
rescue URI::InvalidURIError
url
end

def format_number(value)
return format("%.2f", value) if value.is_a?(Float)

value.to_s
end

def pluralize(word, count)
return word.to_s if count == 1

"#{word}s"
end
end
end
2 changes: 1 addition & 1 deletion lib/crawlscope/rules/content_quality.rb
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def validate_unique_token_ratio(page, issues)
end

def validate_visible_text_ratio(page, issues)
html_bytes = DocumentText.html_for(page.doc).bytesize
html_bytes = DocumentText.content_ratio_html_for(page.doc).bytesize
return if html_bytes.zero?

visible_text = DocumentText.text_for(page.doc)
Expand Down
34 changes: 28 additions & 6 deletions lib/crawlscope/rules/indexability.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,31 @@ class Indexability
ROBOTS_META_SELECTOR = 'meta[name="robots"], meta[name="googlebot"]'
X_ROBOTS_TAG_HEADER = "x-robots-tag"

def self.noindex_header?(headers)
noindex?(header_value(headers, X_ROBOTS_TAG_HEADER))
end

def self.noindex_meta?(doc)
return false unless doc

doc.css(ROBOTS_META_SELECTOR).any? { |tag| noindex?(tag["content"].to_s) }
end

def self.header_value(headers, name)
headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s
end

def self.directives(value)
value
.split(",")
.map { |directive| directive.split(":", 2).last.to_s.strip }
.reject(&:empty?)
end

def self.noindex?(value)
directives(value).any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") }
end

attr_reader :code

def initialize
Expand All @@ -28,18 +53,15 @@ def normalized_sitemap_urls(urls)
end

def header_value(page, name)
page.headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s
self.class.header_value(page.headers, name)
end

def directives(value)
value
.split(",")
.map { |directive| directive.split(":", 2).last.to_s.strip }
.reject(&:empty?)
self.class.directives(value)
end

def noindex?(value)
directives(value).any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") }
self.class.noindex?(value)
end

def follow?(value)
Expand Down
6 changes: 6 additions & 0 deletions lib/crawlscope/rules/links.rb
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,11 @@ def html?
resolution && resolution[:html]
end

def noindex?
Crawlscope::Rules::Indexability.noindex_header?(resolution[:headers] || {}) ||
Crawlscope::Rules::Indexability.noindex_meta?(resolution[:doc])
end

def status
resolution && resolution[:status]
end
Expand Down Expand Up @@ -421,6 +426,7 @@ def validate_indexable_pages_missing_from_sitemap(urls, resolved_links, issues)

target = resolve_target(final_url)
next unless target.allowed?(@allowed_statuses) && target.html?
next if target.noindex?

reported_urls << final_url

Expand Down
18 changes: 18 additions & 0 deletions test/crawlscope/content_quality_rule_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,24 @@ def test_visible_text_ratio_ignores_markup_outside_main_content
refute_includes issues.to_a.map(&:code), :low_visible_text_ratio
end

def test_visible_text_ratio_ignores_form_payload_markup
issues = Crawlscope::IssueCollection.new
page = page_with(
main: <<~HTML
<p>#{Array.new(260) { |index| "word#{index}" }.join(" ")}</p>
<form>
<div data-select-autocomplete-options-value="#{"x" * 50_000}">
<input type="text" name="country">
</div>
</form>
HTML
)

Crawlscope::Rules::ContentQuality.new.call(urls: [page.url], pages: [page], issues: issues)

refute_includes issues.to_a.map(&:code), :low_visible_text_ratio
end

def test_reports_low_unique_token_ratio_for_repetitive_content
issues = Crawlscope::IssueCollection.new
page = page_with(main: ("hotel location service " * 100).strip)
Expand Down
48 changes: 48 additions & 0 deletions test/crawlscope/links_rule_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,54 @@ def test_reports_indexable_internal_pages_missing_from_sitemap
assert_equal "https://example.com/hidden", issue.url
end

def test_does_not_report_noindex_internal_pages_missing_from_sitemap
issues = Crawlscope::IssueCollection.new
resolver = lambda do |target_url|
{
crawled: false,
doc: Nokogiri::HTML("<head><meta name=\"robots\" content=\"noindex, follow\"></head>"),
error: nil,
final_url: target_url,
headers: {},
html: true,
status: 200
}
end

Crawlscope::Rules::Links.new.call(
urls: ["https://example.com/guide"],
pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/hidden\">Hidden</a></main>")],
issues: issues,
context: context(resolver: resolver)
)

refute_includes issues.to_a.map(&:code), :indexable_page_missing_from_sitemap
end

def test_does_not_report_x_robots_noindex_internal_pages_missing_from_sitemap
issues = Crawlscope::IssueCollection.new
resolver = lambda do |target_url|
{
crawled: false,
doc: Nokogiri::HTML("<main>Hidden</main>"),
error: nil,
final_url: target_url,
headers: {"X-Robots-Tag" => "noindex"},
html: true,
status: 200
}
end

Crawlscope::Rules::Links.new.call(
urls: ["https://example.com/guide"],
pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/hidden\">Hidden</a></main>")],
issues: issues,
context: context(resolver: resolver)
)

refute_includes issues.to_a.map(&:code), :indexable_page_missing_from_sitemap
end

def test_reports_url_hygiene_issues
issues = Crawlscope::IssueCollection.new
long_path = "a" * 2_050
Expand Down
Loading
Loading