diff --git a/README.md b/README.md
index bb6828b..b53a20d 100644
--- a/README.md
+++ b/README.md
@@ -161,12 +161,18 @@ The same validation surface is also available in the gem repository itself throu
```bash
bundle exec rake crawlscope:validate URL=https://example.com
+bundle exec rake 'crawlscope:validate[https://example.com]'
bundle exec rake crawlscope:validate:metadata URL=https://example.com
+bundle exec rake 'crawlscope:validate:metadata[https://example.com]'
bundle exec rake crawlscope:validate:ldjson URL=https://example.com/article
+bundle exec rake 'crawlscope:validate:ldjson[https://example.com/article]'
```
`crawlscope:validate` runs all default sitemap rules: indexability, metadata, structured data, uniqueness, content quality, and links. `URL` is the site base. Without `SITEMAP`, Crawlscope uses `/sitemap.xml`. With `SITEMAP`, Crawlscope uses `URL` as the site base and validates URLs from that sitemap. `SITEMAP` may be a full URL or a local file path.
+Plain `rake` does not pass `--url` style flags to tasks. Use `URL=...` or the
+task-argument form above instead.
+
`crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap. Without `URL`, it checks the configured base URL, falling back to `http://localhost:3000`.
### Structured Data URL Audit
diff --git a/Rakefile b/Rakefile
index eac8d3d..840ad74 100644
--- a/Rakefile
+++ b/Rakefile
@@ -183,35 +183,35 @@ namespace :release do
end
namespace :crawlscope do
- desc "Validate URLs with all default Crawlscope rules. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
- task :validate do
- Crawlscope::RakeTasks.validate
+ desc "Validate URLs with all default Crawlscope rules. Args: [url,sitemap,rules]. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
+ task :validate, [:url, :sitemap, :rules] do |_task, args|
+ Crawlscope::RakeTasks.validate(url: args[:url], sitemap_path: args[:sitemap], rule_names: args[:rules])
end
namespace :validate do
- desc "Directly validate JSON-LD on one or more URLs. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
- task :ldjson do
- Crawlscope::RakeTasks.ldjson
+ desc "Directly validate JSON-LD on one URL. Args: [url]. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
+ task :ldjson, [:url] do |_task, args|
+ Crawlscope::RakeTasks.ldjson(urls: args[:url])
end
- desc "Validate URLs with the metadata rule. ENV: URL, SITEMAP, JS=1"
- task :metadata do
- Crawlscope::RakeTasks.validate_rule("metadata")
+ desc "Validate URLs with the metadata rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
+ task :metadata, [:url, :sitemap] do |_task, args|
+ Crawlscope::RakeTasks.validate_rule("metadata", url: args[:url], sitemap_path: args[:sitemap])
end
- desc "Validate sitemap URLs with the structured_data rule. ENV: URL, SITEMAP, JS=1"
- task :structured_data do
- Crawlscope::RakeTasks.validate_rule("structured_data")
+ desc "Validate sitemap URLs with the structured_data rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
+ task :structured_data, [:url, :sitemap] do |_task, args|
+ Crawlscope::RakeTasks.validate_rule("structured_data", url: args[:url], sitemap_path: args[:sitemap])
end
- desc "Validate URLs with the uniqueness rule. ENV: URL, SITEMAP, JS=1"
- task :uniqueness do
- Crawlscope::RakeTasks.validate_rule("uniqueness")
+ desc "Validate URLs with the uniqueness rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
+ task :uniqueness, [:url, :sitemap] do |_task, args|
+ Crawlscope::RakeTasks.validate_rule("uniqueness", url: args[:url], sitemap_path: args[:sitemap])
end
- desc "Validate URLs with the links rule. ENV: URL, SITEMAP, JS=1"
- task :links do
- Crawlscope::RakeTasks.validate_rule("links")
+ desc "Validate URLs with the links rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
+ task :links, [:url, :sitemap] do |_task, args|
+ Crawlscope::RakeTasks.validate_rule("links", url: args[:url], sitemap_path: args[:sitemap])
end
end
end
diff --git a/docs/ahrefs-check-candidates.md b/docs/ahrefs-check-candidates.md
new file mode 100644
index 0000000..edab201
--- /dev/null
+++ b/docs/ahrefs-check-candidates.md
@@ -0,0 +1,236 @@
+# Ahrefs Issue Crosswalk
+
+This crosswalk compares the Ahrefs issue list against Crawlscope's current
+rules and identifies checks that fit the gem's sitemap-driven, internal-audit
+scope.
+
+## Current Coverage
+
+- `noindex_meta`, `noindex_header`: covers "Noindex page" via meta robots and
+ `X-Robots-Tag`.
+- `redirected_page`: covers crawled sitemap URLs that redirect.
+- `internal_link_redirects`: covers "Page has links to redirect".
+- `low_inbound_anchor_links`: partly covers "Page has only one dofollow
+ incoming internal link", but currently does not distinguish dofollow from
+ nofollow.
+- `title_too_long`: covers "Title too long".
+- `meta_description_too_short`, `meta_description_too_long`: covers meta
+ description length checks.
+- `missing_title`, `missing_meta_description`, `missing_h1`, `multiple_h1`,
+ `missing_canonical`, `canonical_mismatch`, and
+ `incomplete_open_graph_tags`: covers adjacent metadata checks not present in
+ the pasted Ahrefs list.
+- `structured_data_parse_error`, `structured_data_schema_error`: covers
+ schema.org-style validation errors.
+- `missing_structured_data`, `missing_job_posting`, `multiple_job_postings`:
+ covers structured-data presence and domain-specific JobPosting rules.
+- `duplicate_title`, `duplicate_meta_description`,
+ `duplicate_content_fingerprint`, `near_duplicate_content`: covers uniqueness
+ checks beyond the pasted Ahrefs list.
+- `thin_visible_text`, `low_visible_text_ratio`, `low_unique_token_ratio`:
+ covers content-quality checks beyond the pasted Ahrefs list.
+- `unexpected_status`, `fetch_failed`, `broken_internal_link`: covers status
+ and broken-link checks beyond the pasted Ahrefs list.
+
+## Candidate Checks To Add
+
+### High Fit
+
+1. `nofollow_meta`
+ - Ahrefs match: "Nofollow page".
+ - Detect `nofollow` in meta robots and `X-Robots-Tag`.
+ - This should share parsing with the existing noindex checks so
+ `noindex,nofollow`, `none`, and scoped directives are handled consistently.
+
+2. `noindex_follow_meta` / `noindex_follow_header`
+ - Ahrefs match: "Noindex follow page".
+ - Detect pages that explicitly combine `noindex` with `follow`.
+ - This is useful because current Crawlscope reports noindex but does not
+ classify whether internal links remain followable.
+
+3. `noindex_nofollow_meta` / `noindex_nofollow_header`
+ - Ahrefs match: "Noindex and nofollow page".
+ - Detect pages that block indexing and link following.
+ - Could also treat `none` as equivalent to `noindex,nofollow`.
+
+4. `canonical_no_internal_inlinks`
+ - Ahrefs match: "Canonical URL has no incoming internal links".
+ - For each sitemap URL, resolve its canonical target and count internal
+ links to the canonical URL/path.
+ - This is more precise than `low_inbound_anchor_links`, which counts links
+ to the crawled URL/final path rather than canonical target.
+
+5. `nofollow_internal_outlinks`
+ - Ahrefs match: "Page has nofollow outgoing internal links".
+ - Extend link extraction to retain `rel` attributes and flag internal
+ anchors with `rel~="nofollow"`.
+ - This also unlocks incoming nofollow/dofollow mix checks.
+
+6. `only_nofollow_internal_inlinks`
+ - Ahrefs match: "Page has nofollow incoming internal links only".
+ - Count incoming internal links by follow state.
+ - Report when a sitemap URL has at least one internal inlink but zero
+ dofollow internal inlinks.
+
+7. `mixed_follow_internal_inlinks`
+ - Ahrefs match: "Page has nofollow and dofollow incoming internal links".
+ - Count incoming internal links by follow state.
+ - Report when both counts are positive, with source samples for each class.
+
+8. `low_dofollow_inlinks`
+ - Ahrefs match: "Page has only one dofollow incoming internal link".
+ - Replace or supplement `low_inbound_anchor_links` with a dofollow-specific
+ count.
+ - Make the threshold configurable; Ahrefs surfaces "only one", but a host
+ app may want `minimum_dofollow_inlinks = 2`.
+
+9. `indexable_page_missing_from_sitemap`
+ - Ahrefs match: "Indexable page not in sitemap".
+ - Use discovered internal links to find crawlable, indexable HTML pages that
+ are not present in the sitemap.
+ - This requires Crawlscope to keep discovered internal targets, not only
+ sitemap URLs, so it is a larger links/crawl contract change.
+
+10. `sitemap_noindex_url`
+ - Ahrefs adjacent documented issue: "Noindex page in sitemap".
+ - Current noindex checks run on sitemap pages, but this issue name would
+ make the sitemap/indexability conflict explicit.
+ - It can be implemented as a second issue emitted from the indexability
+ rule, or as a dedicated sitemap consistency rule.
+
+11. `sitemap_redirect_url`
+ - Ahrefs match: "3XX redirect" and adjacent "3xx redirect in sitemap".
+ - Current `redirected_page` covers this behavior; adding this alias/code
+ would make reports match external audit tools more directly.
+
+12. `http_internal_link`
+ - Ahrefs adjacent issue: "HTTPS page has internal links to HTTP".
+ - Detect internal links from HTTPS pages to HTTP URLs on the same host.
+ - This is more actionable than relying on redirect detection after fetch.
+
+13. `canonical_points_to_redirect`
+ - Ahrefs adjacent issue: "Canonical points to redirect".
+ - Resolve canonical targets and flag canonical URLs that redirect.
+ - This fits the existing canonical metadata rule but needs target
+ resolution similar to the links rule.
+
+14. `canonical_points_to_error`
+ - Ahrefs adjacent issues: "Canonical points to 4XX" and "Canonical points
+ to 5XX".
+ - Resolve canonical targets and flag non-success responses.
+
+### Medium Fit
+
+15. `multiple_title_tags`
+ - Ahrefs documented issue: "Multiple title tags".
+ - Current metadata rule checks missing/long/repeated title, but not
+ multiple `
` elements.
+
+16. `multiple_meta_descriptions`
+ - Ahrefs documented issue: "Multiple meta description tags".
+ - Current metadata rule checks missing/short/long description, but not
+ duplicate description tags on the same page.
+
+17. `empty_h1`
+ - Ahrefs documented issue: "H1 tag missing or empty".
+ - Current `missing_h1` only checks the element count. An empty `
` should
+ be reported separately or treated as missing.
+
+18. `page_has_no_outgoing_links`
+ - Ahrefs documented issue: "Page has no outgoing links".
+ - Detect indexable HTML pages with zero meaningful outgoing anchors.
+ - The rule should ignore skipped Rails/CDN/mail/tel links consistently with
+ the current links rule.
+
+19. `orphan_page`
+ - Ahrefs documented issue: "Orphan page".
+ - Similar to `canonical_no_internal_inlinks`, but for page URL rather than
+ final canonical URL.
+ - This overlaps with a configurable `low_dofollow_inlinks` threshold of
+ one, so avoid double-reporting.
+
+20. `non_canonical_page_in_sitemap`
+ - Ahrefs documented issue: "Non-canonical page in sitemap".
+ - Current `canonical_mismatch` flags the page-level mismatch; this new code
+ would explicitly state the sitemap contract violation.
+
+21. `duplicate_pages_without_canonical`
+ - Ahrefs documented issue: "Duplicate pages without canonical".
+ - Current uniqueness rules find duplicate content/title/description. This
+ would connect duplicate clusters to canonical presence and consistency.
+
+22. `url_double_slash`
+ - Ahrefs documented issue: "Double slash in URL".
+ - Detect sitemap or internal-link URLs whose path contains accidental
+ double slashes.
+ - Low implementation cost, but lower impact than link/indexability checks.
+
+23. `url_too_long`
+ - Ahrefs Page Explorer exposes URL length.
+ - Useful as a notice-level metadata/url hygiene check.
+
+24. `structured_data_missing_type`
+ - Ahrefs structured-data docs call out missing `@type`.
+ - Current schema validation may catch this for registered schemas, but a
+ generic issue code would make raw schema.org failures easier to act on.
+
+25. `structured_data_invalid_type`
+ - Ahrefs structured-data docs call out invalid schema types.
+ - Could be implemented only if Crawlscope adopts or vendors a schema.org
+ vocabulary source; otherwise keep using configured JSON schemas.
+
+26. `structured_data_invalid_property`
+ - Ahrefs structured-data docs call out invalid schema properties.
+ - Same caveat as invalid type: this needs schema.org vocabulary validation,
+ not only local JSON schemas.
+
+### Lower Fit Or Requires External Data
+
+27. `noindex_page_became_indexable`
+ - Ahrefs match: "Noindex page became indexable".
+ - Requires historical crawl snapshots. Not a good fit until Crawlscope has
+ persistence or report comparison.
+
+28. `h1_changed`, `meta_description_changed`, `title_tag_changed`,
+ `serp_title_changed`
+ - Ahrefs match: changed-content issues.
+ - Requires prior crawl snapshots or external SERP data. Not a current
+ stateless gem fit.
+
+29. `page_and_serp_titles_do_not_match`
+ - Ahrefs match: SERP title mismatch.
+ - Requires search-result data. Out of scope for a deterministic sitemap
+ crawler unless a host app injects SERP observations.
+
+30. `pages_added_to_sitemaps`
+ - Ahrefs match: "Pages added to sitemaps".
+ - Requires comparing current sitemap URLs with a prior crawl.
+
+31. `pages_to_submit_to_indexnow`
+ - Ahrefs match: "Pages to submit to IndexNow".
+ - Requires change detection and an IndexNow integration. Better as a host
+ app workflow than a default Crawlscope rule.
+
+32. `google_rich_results_validation_error`
+ - Ahrefs match: "Structured data has Google rich results validation error".
+ - Crawlscope can validate local schema contracts today, but Google rich
+ result validation requires Google-specific rule coverage and may diverge
+ from schema.org validation. Add only if the gem owns those feature
+ schemas explicitly.
+
+## Recommended First Batch
+
+1. Add a shared robots directive parser and emit `nofollow_meta`,
+ `noindex_follow_*`, and `noindex_nofollow_*`.
+2. Extend link extraction with follow state and emit `nofollow_internal_outlinks`,
+ `only_nofollow_internal_inlinks`, `mixed_follow_internal_inlinks`, and
+ `low_dofollow_inlinks`.
+3. Add canonical target checks:
+ `canonical_no_internal_inlinks`, `canonical_points_to_redirect`, and
+ `canonical_points_to_error`.
+4. Add sitemap consistency aliases for already-observed conditions:
+ `sitemap_noindex_url`, `sitemap_redirect_url`, and
+ `non_canonical_page_in_sitemap`.
+5. Add simple metadata hygiene checks:
+ `multiple_title_tags`, `multiple_meta_descriptions`, and `empty_h1`.
+
diff --git a/lib/crawlscope/cli.rb b/lib/crawlscope/cli.rb
index 41c01bc..39f9ac3 100644
--- a/lib/crawlscope/cli.rb
+++ b/lib/crawlscope/cli.rb
@@ -37,11 +37,14 @@ def call
@err.puts(general_usage)
1
end
- rescue OptionParser::InvalidOption, OptionParser::MissingArgument, ConfigurationError, ValidationError, ArgumentError => error
+ rescue OptionParser::InvalidOption, OptionParser::MissingArgument, ConfigurationError, ArgumentError => error
@err.puts(error.message)
@err.puts("")
@err.puts(general_usage)
1
+ rescue ValidationError => error
+ @err.puts(error.message)
+ 1
end
private
diff --git a/lib/crawlscope/crawl.rb b/lib/crawlscope/crawl.rb
index f32fdef..c0b7374 100644
--- a/lib/crawlscope/crawl.rb
+++ b/lib/crawlscope/crawl.rb
@@ -83,6 +83,7 @@ def collect(pages, issues)
issues.add(code: :unexpected_status, severity: :error, category: :crawl, url: page.url, message: "HTTP #{page.status}", details: {status: page.status})
elsif redirected?(page)
issues.add(code: :redirected_page, severity: :warning, category: :crawl, url: page.url, message: "redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status})
+ issues.add(code: :sitemap_redirect_url, severity: :warning, category: :sitemaps, url: page.url, message: "sitemap URL redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status})
end
end
end
@@ -127,6 +128,7 @@ def resolution(page, normalized_url, crawled:)
crawled: crawled,
error: page.error,
final_url: page.normalized_final_url || normalized_url,
+ html: page.html?,
status: page.status
}
end
diff --git a/lib/crawlscope/rake_tasks.rb b/lib/crawlscope/rake_tasks.rb
index 5acdbae..e3e20be 100644
--- a/lib/crawlscope/rake_tasks.rb
+++ b/lib/crawlscope/rake_tasks.rb
@@ -4,25 +4,40 @@ module Crawlscope
module RakeTasks
module_function
- def validate
- run("validate")
+ def validate(url: nil, sitemap_path: nil, rule_names: nil)
+ run("validate", argv: validate_argv(url: url, sitemap_path: sitemap_path, rule_names: rule_names))
end
- def ldjson
- run("ldjson")
+ def ldjson(urls: nil)
+ run("ldjson", argv: ldjson_argv(urls: urls))
end
- def validate_rule(rule)
- original_rules = ENV["RULES"]
- ENV["RULES"] = rule
- validate
- ensure
- ENV["RULES"] = original_rules
+ def validate_rule(rule, url: nil, sitemap_path: nil)
+ validate(url: url, sitemap_path: sitemap_path, rule_names: rule)
end
- def run(command)
- status = Cli.start([command], out: $stdout, err: $stderr)
+ def run(command, argv: [])
+ status = Cli.start([command, *argv], out: $stdout, err: $stderr)
exit(status) unless status.zero?
end
+
+ def validate_argv(url:, sitemap_path:, rule_names:)
+ [
+ option_pair("--url", url),
+ option_pair("--sitemap", sitemap_path),
+ option_pair("--rules", rule_names)
+ ].compact.flatten
+ end
+
+ def ldjson_argv(urls:)
+ Array(urls).flat_map { |url| option_pair("--url", url) }.compact
+ end
+
+ def option_pair(name, value)
+ value = value.to_s.strip
+ return if value.empty?
+
+ [name, value]
+ end
end
end
diff --git a/lib/crawlscope/reporter.rb b/lib/crawlscope/reporter.rb
index 48dde93..d159aa9 100644
--- a/lib/crawlscope/reporter.rb
+++ b/lib/crawlscope/reporter.rb
@@ -20,14 +20,29 @@ def report(result)
@io.puts("Status: FAILED")
@io.puts("Issues: #{result.issues.size}")
+ @io.puts("")
- result.issues.by_severity.sort_by { |severity, _issues| severity.to_s }.each do |severity, issues|
- @io.puts("#{severity}: #{issues.size}")
- end
+ report_grouped_issues("Severity", result.issues.by_severity)
+ @io.puts("")
+ report_grouped_issues("Category", result.issues.by_category)
+ end
+
+ private
- result.issues.each do |issue|
- @io.puts("- [#{issue.severity}] #{issue.url} #{issue.message}")
+ def report_grouped_issues(title, grouped_issues)
+ @io.puts("#{title}:")
+
+ grouped_issues.sort_by { |name, _issues| name.to_s }.each do |name, issues|
+ @io.puts("#{name}: #{issues.size}")
+ issues.each do |issue|
+ @io.puts(" - #{offense(issue)}")
+ end
end
end
+
+ def offense(issue)
+ parts = ["[#{issue.severity}]", issue.code, issue.url, issue.message]
+ parts.compact.join(" ")
+ end
end
end
diff --git a/lib/crawlscope/rules/indexability.rb b/lib/crawlscope/rules/indexability.rb
index 114d030..06aae36 100644
--- a/lib/crawlscope/rules/indexability.rb
+++ b/lib/crawlscope/rules/indexability.rb
@@ -13,45 +13,110 @@ def initialize
end
def call(urls:, pages:, issues:, context: nil)
+ sitemap_urls = normalized_sitemap_urls(urls)
+
pages.each do |page|
- validate_meta_robots(page, issues) if page.html?
- validate_x_robots_tag(page, issues)
+ validate_meta_robots(page, issues, sitemap_urls) if page.html?
+ validate_x_robots_tag(page, issues, sitemap_urls)
end
end
private
+ def normalized_sitemap_urls(urls)
+ urls.map { |url| Url.normalize(url, base_url: url) }.compact
+ end
+
def header_value(page, name)
page.headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s
end
- def noindex?(value)
+ def directives(value)
value
.split(",")
.map { |directive| directive.split(":", 2).last.to_s.strip }
- .any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") }
+ .reject(&:empty?)
+ end
+
+ def noindex?(value)
+ directives(value).any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") }
+ end
+
+ def follow?(value)
+ directives(value).any? { |directive| directive.casecmp?("follow") }
end
- def validate_meta_robots(page, issues)
+ def nofollow?(value)
+ directives(value).any? { |directive| directive.casecmp?("nofollow") || directive.casecmp?("none") }
+ end
+
+ def validate_meta_robots(page, issues, sitemap_urls)
page.doc.css(ROBOTS_META_SELECTOR).each do |tag|
content = tag["content"].to_s
- next unless noindex?(content)
-
- issues.add(
- code: :noindex_meta,
- severity: :error,
- category: :indexability,
- url: page.url,
- message: "robots meta tag prevents indexing",
- details: {content: content, name: tag["name"].to_s}
- )
+
+ report_noindex_meta(page, issues, content, tag["name"].to_s, sitemap_urls) if noindex?(content)
+ report_nofollow_meta(page, issues, content, tag["name"].to_s) if nofollow?(content)
+ report_noindex_follow_meta(page, issues, content, tag["name"].to_s) if noindex?(content) && follow?(content)
+ report_noindex_nofollow_meta(page, issues, content, tag["name"].to_s) if noindex?(content) && nofollow?(content)
end
end
- def validate_x_robots_tag(page, issues)
+ def validate_x_robots_tag(page, issues, sitemap_urls)
content = header_value(page, X_ROBOTS_TAG_HEADER)
- return unless noindex?(content)
+ return if content.empty?
+
+ report_noindex_header(page, issues, content, sitemap_urls) if noindex?(content)
+ report_nofollow_header(page, issues, content) if nofollow?(content)
+ report_noindex_follow_header(page, issues, content) if noindex?(content) && follow?(content)
+ report_noindex_nofollow_header(page, issues, content) if noindex?(content) && nofollow?(content)
+ end
+ def report_noindex_meta(page, issues, content, name, sitemap_urls)
+ issues.add(
+ code: :noindex_meta,
+ severity: :error,
+ category: :indexability,
+ url: page.url,
+ message: "robots meta tag prevents indexing",
+ details: {content: content, name: name}
+ )
+ report_sitemap_noindex_url(page, issues, content, source: "meta", sitemap_urls: sitemap_urls)
+ end
+
+ def report_nofollow_meta(page, issues, content, name)
+ issues.add(
+ code: :nofollow_meta,
+ severity: :warning,
+ category: :indexability,
+ url: page.url,
+ message: "robots meta tag prevents following links",
+ details: {content: content, name: name}
+ )
+ end
+
+ def report_noindex_follow_meta(page, issues, content, name)
+ issues.add(
+ code: :noindex_follow_meta,
+ severity: :warning,
+ category: :indexability,
+ url: page.url,
+ message: "robots meta tag prevents indexing but allows following links",
+ details: {content: content, name: name}
+ )
+ end
+
+ def report_noindex_nofollow_meta(page, issues, content, name)
+ issues.add(
+ code: :noindex_nofollow_meta,
+ severity: :error,
+ category: :indexability,
+ url: page.url,
+ message: "robots meta tag prevents indexing and following links",
+ details: {content: content, name: name}
+ )
+ end
+
+ def report_noindex_header(page, issues, content, sitemap_urls)
issues.add(
code: :noindex_header,
severity: :error,
@@ -60,6 +125,54 @@ def validate_x_robots_tag(page, issues)
message: "X-Robots-Tag header prevents indexing",
details: {content: content}
)
+ report_sitemap_noindex_url(page, issues, content, source: "header", sitemap_urls: sitemap_urls)
+ end
+
+ def report_nofollow_header(page, issues, content)
+ issues.add(
+ code: :nofollow_header,
+ severity: :warning,
+ category: :indexability,
+ url: page.url,
+ message: "X-Robots-Tag header prevents following links",
+ details: {content: content}
+ )
+ end
+
+ def report_noindex_follow_header(page, issues, content)
+ issues.add(
+ code: :noindex_follow_header,
+ severity: :warning,
+ category: :indexability,
+ url: page.url,
+ message: "X-Robots-Tag header prevents indexing but allows following links",
+ details: {content: content}
+ )
+ end
+
+ def report_noindex_nofollow_header(page, issues, content)
+ issues.add(
+ code: :noindex_nofollow_header,
+ severity: :error,
+ category: :indexability,
+ url: page.url,
+ message: "X-Robots-Tag header prevents indexing and following links",
+ details: {content: content}
+ )
+ end
+
+ def report_sitemap_noindex_url(page, issues, content, source:, sitemap_urls:)
+ normalized_url = Url.normalize(page.url, base_url: page.url)
+ return unless sitemap_urls.include?(normalized_url)
+
+ issues.add(
+ code: :sitemap_noindex_url,
+ severity: :error,
+ category: :sitemaps,
+ url: page.url,
+ message: "sitemap URL is noindex",
+ details: {content: content, source: source}
+ )
end
end
end
diff --git a/lib/crawlscope/rules/links.rb b/lib/crawlscope/rules/links.rb
index a17915b..5cf4b27 100644
--- a/lib/crawlscope/rules/links.rb
+++ b/lib/crawlscope/rules/links.rb
@@ -10,6 +10,7 @@ class Links
LINK_SCHEMES_TO_SKIP = ["mailto:", "tel:", "javascript:", "data:"].freeze
MAX_SOURCES_IN_ERROR = 3
MIN_INBOUND_ANCHOR_LINKS = 1
+ MIN_DOFOLLOW_INBOUND_LINKS = 2
attr_reader :code
@@ -24,10 +25,14 @@ def call(urls:, pages:, issues:, context:)
@base_host = URI.parse(@base_url).host
links = extract_links(pages)
- return if links.empty?
-
+ validate_url_hygiene(urls, links, issues)
resolved_links = resolve_links(links, issues)
+ validate_nofollow_outgoing_links(links, issues)
+ validate_http_internal_links(links, issues)
+ validate_pages_with_no_outgoing_links(urls, pages, links, issues)
+ validate_indexable_pages_missing_from_sitemap(urls, resolved_links, issues)
validate_inbound_counts(urls, pages, resolved_links, issues)
+ validate_canonical_targets(urls, pages, resolved_links, issues)
end
private
@@ -64,6 +69,8 @@ def link_for(page:, source_path:, node:)
{
anchor_text: anchor_text,
+ http_internal_link: http_internal_link?(page.normalized_url, href),
+ nofollow: nofollow_link?(node),
source_path: source_path,
source_url: page.normalized_url,
target_path: target_path,
@@ -86,6 +93,19 @@ def normalize_anchor_text(text)
text.to_s.gsub(/\s+/, " ").strip
end
+ def nofollow_link?(node)
+ node["rel"].to_s.split(/\s+/).any? { |value| value.casecmp?("nofollow") }
+ end
+
+ def http_internal_link?(source_url, href)
+ source_uri = URI.parse(source_url.to_s)
+ target_uri = URI.parse(URI.join(source_url, href).to_s)
+
+ source_uri.scheme == "https" && target_uri.scheme == "http" && target_uri.host == @base_host
+ rescue URI::InvalidURIError
+ false
+ end
+
def normalize_internal_link(source_url, href)
absolute_url = URI.join(source_url, href).to_s
uri = URI.parse(absolute_url)
@@ -109,6 +129,36 @@ def report_broken_target(target_url, grouped_links, issues, status)
)
end
+ def validate_nofollow_outgoing_links(links, issues)
+ links.select { |link| link[:nofollow] }.group_by { |link| link[:source_url] }.each do |source_url, grouped_links|
+ target_urls = grouped_links.map { |link| link[:target_url] }.uniq.first(MAX_SOURCES_IN_ERROR)
+
+ issues.add(
+ code: :nofollow_internal_outlinks,
+ severity: :warning,
+ category: :links,
+ url: source_url,
+ message: "page has nofollow outgoing internal links",
+ details: {target_urls: target_urls}
+ )
+ end
+ end
+
+ def validate_http_internal_links(links, issues)
+ links.select { |link| link[:http_internal_link] }.group_by { |link| link[:source_url] }.each do |source_url, grouped_links|
+ target_urls = grouped_links.map { |link| link[:target_url] }.uniq.first(MAX_SOURCES_IN_ERROR)
+
+ issues.add(
+ code: :http_internal_link,
+ severity: :warning,
+ category: :links,
+ url: source_url,
+ message: "HTTPS page links to internal HTTP URL",
+ details: {target_urls: target_urls}
+ )
+ end
+ end
+
def report_unresolved_target(target_url, grouped_links, issues, resolution)
source_urls = grouped_links.map { |link| link[:source_url] }.uniq.first(MAX_SOURCES_IN_ERROR)
suffix = (resolution && resolution[:error]) ? " (#{resolution[:error]})" : ""
@@ -189,6 +239,10 @@ def ignored_error?
resolution && status.nil? && resolution[:crawled] && resolution[:error]
end
+ def html?
+ resolution && resolution[:html]
+ end
+
def status
resolution && resolution[:status]
end
@@ -221,6 +275,7 @@ def validate_inbound_counts(urls, pages, resolved_links, issues)
memo[path] = normalized_url
end
+ return if sitemap_paths.size < 2
html_paths = pages.each_with_object(Set.new) do |page, result|
next unless page.html?
@@ -235,7 +290,11 @@ def validate_inbound_counts(urls, pages, resolved_links, issues)
end
inbound_anchor_counts = Hash.new(0)
+ dofollow_inbound_counts = Hash.new(0)
+ nofollow_inbound_counts = Hash.new(0)
sample_sources_by_target = Hash.new { |hash, key| hash[key] = [] }
+ dofollow_sources_by_target = Hash.new { |hash, key| hash[key] = [] }
+ nofollow_sources_by_target = Hash.new { |hash, key| hash[key] = [] }
resolved_links.each do |link|
target_path = link[:final_path]
@@ -245,24 +304,268 @@ def validate_inbound_counts(urls, pages, resolved_links, issues)
inbound_anchor_counts[target_path] += 1
source_samples = sample_sources_by_target[target_path]
source_samples << link[:source_url] unless source_samples.include?(link[:source_url])
+
+ if link[:nofollow]
+ nofollow_inbound_counts[target_path] += 1
+ nofollow_sources = nofollow_sources_by_target[target_path]
+ nofollow_sources << link[:source_url] unless nofollow_sources.include?(link[:source_url])
+ else
+ dofollow_inbound_counts[target_path] += 1
+ dofollow_sources = dofollow_sources_by_target[target_path]
+ dofollow_sources << link[:source_url] unless dofollow_sources.include?(link[:source_url])
+ end
end
sitemap_paths.each do |path, target_url|
next unless html_paths.include?(path)
inbound_count = inbound_anchor_counts[path]
- next if inbound_count >= MIN_INBOUND_ANCHOR_LINKS
+ dofollow_count = dofollow_inbound_counts[path]
+ nofollow_count = nofollow_inbound_counts[path]
+
+ report_orphan_page(target_url, issues) if inbound_count.zero?
+
+ if inbound_count.positive? && inbound_count < MIN_INBOUND_ANCHOR_LINKS
+ source_samples = sample_sources_by_target[path].first(MAX_SOURCES_IN_ERROR)
+ source_info = source_samples.any? ? " (sources: #{source_samples.join(", ")})" : ""
+
+ issues.add(
+ code: :low_inbound_anchor_links,
+ severity: :warning,
+ category: :links,
+ url: target_url,
+ message: "inbound anchor links #{inbound_count} below #{MIN_INBOUND_ANCHOR_LINKS}#{source_info}",
+ details: {inbound_count: inbound_count, minimum: MIN_INBOUND_ANCHOR_LINKS, source_urls: source_samples}
+ )
+ end
- source_samples = sample_sources_by_target[path].first(MAX_SOURCES_IN_ERROR)
- source_info = source_samples.any? ? " (sources: #{source_samples.join(", ")})" : ""
+ report_low_dofollow_inlinks(target_url, path, dofollow_count, dofollow_sources_by_target, issues)
+ report_only_nofollow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources_by_target[path], issues)
+ report_mixed_follow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources_by_target[path], dofollow_sources_by_target[path], issues)
+ end
+ end
+
+ def validate_url_hygiene(urls, links, issues)
+ checked_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) }
+ checked_urls.concat(links.map { |link| link[:target_url] })
+
+ checked_urls.compact.uniq.each do |url|
+ report_url_double_slash(url, issues)
+ report_url_too_long(url, issues)
+ end
+ end
+
+ def report_url_double_slash(url, issues)
+ path = URI.parse(url).path.to_s
+ return unless path.match?(%r{//+})
+
+ issues.add(
+ code: :url_double_slash,
+ severity: :notice,
+ category: :url,
+ url: url,
+ message: "URL path contains duplicate slashes",
+ details: {path: path}
+ )
+ rescue URI::InvalidURIError
+ nil
+ end
+
+ def report_url_too_long(url, issues)
+ return unless url.length > 2_048
+
+ issues.add(
+ code: :url_too_long,
+ severity: :notice,
+ category: :url,
+ url: url,
+ message: "URL too long (#{url.length})",
+ details: {length: url.length, maximum: 2_048}
+ )
+ end
+
+ def validate_pages_with_no_outgoing_links(urls, pages, links, issues)
+ sitemap_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) }.compact.to_set
+ return if sitemap_urls.size < 2
+
+ source_paths_with_links = links.map { |link| link[:source_path] }.to_set
+
+ pages.each do |page|
+ next unless page.html?
+ next unless sitemap_urls.include?(page.normalized_url)
+
+ source_path = Url.path(page.normalized_url)
+ next unless crawlable_source_path?(source_path)
+ next if source_paths_with_links.include?(source_path)
issues.add(
- code: :low_inbound_anchor_links,
+ code: :page_has_no_outgoing_links,
severity: :warning,
category: :links,
- url: target_url,
- message: "inbound anchor links #{inbound_count} below #{MIN_INBOUND_ANCHOR_LINKS}#{source_info}",
- details: {inbound_count: inbound_count, minimum: MIN_INBOUND_ANCHOR_LINKS, source_urls: source_samples}
+ url: page.url,
+ message: "page has no outgoing internal links",
+ details: {}
+ )
+ end
+ end
+
+ def validate_indexable_pages_missing_from_sitemap(urls, resolved_links, issues)
+ sitemap_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) }.compact.to_set
+ reported_urls = Set.new
+
+ resolved_links.each do |link|
+ final_url = link[:final_url]
+ next if sitemap_urls.include?(final_url)
+ next if reported_urls.include?(final_url)
+ next unless crawlable_path?(link[:final_path])
+
+ target = resolve_target(final_url)
+ next unless target.allowed?(@allowed_statuses) && target.html?
+
+ reported_urls << final_url
+
+ issues.add(
+ code: :indexable_page_missing_from_sitemap,
+ severity: :warning,
+ category: :sitemaps,
+ url: final_url,
+ message: "indexable internal page is missing from sitemap",
+ details: {source_url: link[:source_url]}
+ )
+ end
+ end
+
+ def report_orphan_page(target_url, issues)
+ issues.add(
+ code: :orphan_page,
+ severity: :warning,
+ category: :links,
+ url: target_url,
+ message: "page has no incoming internal links",
+ details: {}
+ )
+ end
+
+ def report_low_dofollow_inlinks(target_url, path, dofollow_count, sources_by_target, issues)
+ return if dofollow_count.zero?
+ return if dofollow_count >= MIN_DOFOLLOW_INBOUND_LINKS
+
+ source_samples = sources_by_target[path].first(MAX_SOURCES_IN_ERROR)
+ source_info = source_samples.any? ? " (sources: #{source_samples.join(", ")})" : ""
+
+ issues.add(
+ code: :low_dofollow_inlinks,
+ severity: :warning,
+ category: :links,
+ url: target_url,
+ message: "dofollow inbound links #{dofollow_count} below #{MIN_DOFOLLOW_INBOUND_LINKS}#{source_info}",
+ details: {dofollow_inbound_count: dofollow_count, minimum: MIN_DOFOLLOW_INBOUND_LINKS, source_urls: source_samples}
+ )
+ end
+
+ def report_only_nofollow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources, issues)
+ return unless nofollow_count.positive? && dofollow_count.zero?
+
+ issues.add(
+ code: :only_nofollow_internal_inlinks,
+ severity: :warning,
+ category: :links,
+ url: target_url,
+ message: "page has nofollow incoming internal links only",
+ details: {nofollow_inbound_count: nofollow_count, source_urls: nofollow_sources.first(MAX_SOURCES_IN_ERROR)}
+ )
+ end
+
+ def report_mixed_follow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources, dofollow_sources, issues)
+ return unless nofollow_count.positive? && dofollow_count.positive?
+
+ issues.add(
+ code: :mixed_follow_internal_inlinks,
+ severity: :notice,
+ category: :links,
+ url: target_url,
+ message: "page has nofollow and dofollow incoming internal links",
+ details: {
+ dofollow_inbound_count: dofollow_count,
+ nofollow_inbound_count: nofollow_count,
+ dofollow_source_urls: dofollow_sources.first(MAX_SOURCES_IN_ERROR),
+ nofollow_source_urls: nofollow_sources.first(MAX_SOURCES_IN_ERROR)
+ }
+ )
+ end
+
+ def validate_canonical_targets(urls, pages, resolved_links, issues)
+ sitemap_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) }.compact
+ sitemap_pages = pages.select { |page| page.html? && sitemap_urls.include?(page.normalized_url) }
+ return if sitemap_pages.size < 2
+
+ dofollow_counts_by_path = dofollow_counts_by_final_path(resolved_links)
+
+ sitemap_pages.each do |page|
+ canonical_url = canonical_url_for(page)
+ next if canonical_url.nil?
+
+ target_uri = URI.parse(canonical_url)
+ next if target_uri.host != @base_host
+
+ canonical_path = Url.path(canonical_url)
+ if canonical_path && dofollow_counts_by_path[canonical_path].zero?
+ issues.add(
+ code: :canonical_no_internal_inlinks,
+ severity: :warning,
+ category: :links,
+ url: canonical_url,
+ message: "canonical URL has no incoming internal links",
+ details: {source_url: page.url}
+ )
+ end
+
+ validate_canonical_target_status(page, canonical_url, issues)
+ rescue URI::InvalidURIError
+ next
+ end
+ end
+
+ def dofollow_counts_by_final_path(resolved_links)
+ resolved_links.each_with_object(Hash.new(0)) do |link, counts|
+ next if link[:nofollow]
+ next if link[:source_path] == link[:final_path]
+
+ counts[link[:final_path]] += 1
+ end
+ end
+
+ def canonical_url_for(page)
+ canonical = page.doc.at_css('link[rel="canonical"]')&.[]("href").to_s.strip
+ return if canonical.empty?
+
+ Url.normalize(canonical, base_url: page.url)
+ end
+
+ def validate_canonical_target_status(page, canonical_url, issues)
+ target = resolve_target(canonical_url)
+
+ if target.unresolved? || target.ignored_error?
+ return
+ end
+
+ if target.redirect?
+ issues.add(
+ code: :canonical_points_to_redirect,
+ severity: :warning,
+ category: :metadata,
+ url: page.url,
+ message: "canonical points to redirect",
+ details: {canonical: canonical_url, final_url: target.final_url, status: target.status}
+ )
+ elsif !target.allowed?(@allowed_statuses)
+ issues.add(
+ code: :canonical_points_to_error,
+ severity: :warning,
+ category: :metadata,
+ url: page.url,
+ message: "canonical points to HTTP #{target.status}",
+ details: {canonical: canonical_url, status: target.status}
)
end
end
diff --git a/lib/crawlscope/rules/metadata.rb b/lib/crawlscope/rules/metadata.rb
index 9f261f2..883ca58 100644
--- a/lib/crawlscope/rules/metadata.rb
+++ b/lib/crawlscope/rules/metadata.rb
@@ -18,22 +18,41 @@ def initialize(site_name: nil)
end
def call(urls:, pages:, issues:, context: nil)
+ sitemap_urls = normalized_sitemap_urls(urls)
+
pages.each do |page|
next unless page.html?
validate_h1(page, issues)
validate_title(page, issues)
validate_description(page, issues)
- validate_canonical(page, issues)
+ validate_canonical(page, issues, sitemap_urls)
validate_open_graph(page, issues)
end
end
private
+ def normalized_sitemap_urls(urls)
+ urls.map { |url| Url.normalize(url, base_url: url) }.compact
+ end
+
def validate_h1(page, issues)
h1s = page.doc.css("h1")
- return if h1s.one?
+ empty_h1s = h1s.select { |node| node.text.to_s.strip.empty? }
+
+ if empty_h1s.any?
+ issues.add(
+ code: :empty_h1,
+ severity: :warning,
+ category: :metadata,
+ url: page.url,
+ message: "empty