diff --git a/README.md b/README.md index bb6828b..b53a20d 100644 --- a/README.md +++ b/README.md @@ -161,12 +161,18 @@ The same validation surface is also available in the gem repository itself throu ```bash bundle exec rake crawlscope:validate URL=https://example.com +bundle exec rake 'crawlscope:validate[https://example.com]' bundle exec rake crawlscope:validate:metadata URL=https://example.com +bundle exec rake 'crawlscope:validate:metadata[https://example.com]' bundle exec rake crawlscope:validate:ldjson URL=https://example.com/article +bundle exec rake 'crawlscope:validate:ldjson[https://example.com/article]' ``` `crawlscope:validate` runs all default sitemap rules: indexability, metadata, structured data, uniqueness, content quality, and links. `URL` is the site base. Without `SITEMAP`, Crawlscope uses `/sitemap.xml`. With `SITEMAP`, Crawlscope uses `URL` as the site base and validates URLs from that sitemap. `SITEMAP` may be a full URL or a local file path. +Plain `rake` does not pass `--url` style flags to tasks. Use `URL=...` or the +task-argument form above instead. + `crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap. Without `URL`, it checks the configured base URL, falling back to `http://localhost:3000`. ### Structured Data URL Audit diff --git a/Rakefile b/Rakefile index eac8d3d..840ad74 100644 --- a/Rakefile +++ b/Rakefile @@ -183,35 +183,35 @@ namespace :release do end namespace :crawlscope do - desc "Validate URLs with all default Crawlscope rules. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY" - task :validate do - Crawlscope::RakeTasks.validate + desc "Validate URLs with all default Crawlscope rules. Args: [url,sitemap,rules]. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY" + task :validate, [:url, :sitemap, :rules] do |_task, args| + Crawlscope::RakeTasks.validate(url: args[:url], sitemap_path: args[:sitemap], rule_names: args[:rules]) end namespace :validate do - desc "Directly validate JSON-LD on one or more URLs. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1" - task :ldjson do - Crawlscope::RakeTasks.ldjson + desc "Directly validate JSON-LD on one URL. Args: [url]. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1" + task :ldjson, [:url] do |_task, args| + Crawlscope::RakeTasks.ldjson(urls: args[:url]) end - desc "Validate URLs with the metadata rule. ENV: URL, SITEMAP, JS=1" - task :metadata do - Crawlscope::RakeTasks.validate_rule("metadata") + desc "Validate URLs with the metadata rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1" + task :metadata, [:url, :sitemap] do |_task, args| + Crawlscope::RakeTasks.validate_rule("metadata", url: args[:url], sitemap_path: args[:sitemap]) end - desc "Validate sitemap URLs with the structured_data rule. ENV: URL, SITEMAP, JS=1" - task :structured_data do - Crawlscope::RakeTasks.validate_rule("structured_data") + desc "Validate sitemap URLs with the structured_data rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1" + task :structured_data, [:url, :sitemap] do |_task, args| + Crawlscope::RakeTasks.validate_rule("structured_data", url: args[:url], sitemap_path: args[:sitemap]) end - desc "Validate URLs with the uniqueness rule. ENV: URL, SITEMAP, JS=1" - task :uniqueness do - Crawlscope::RakeTasks.validate_rule("uniqueness") + desc "Validate URLs with the uniqueness rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1" + task :uniqueness, [:url, :sitemap] do |_task, args| + Crawlscope::RakeTasks.validate_rule("uniqueness", url: args[:url], sitemap_path: args[:sitemap]) end - desc "Validate URLs with the links rule. ENV: URL, SITEMAP, JS=1" - task :links do - Crawlscope::RakeTasks.validate_rule("links") + desc "Validate URLs with the links rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1" + task :links, [:url, :sitemap] do |_task, args| + Crawlscope::RakeTasks.validate_rule("links", url: args[:url], sitemap_path: args[:sitemap]) end end end diff --git a/docs/ahrefs-check-candidates.md b/docs/ahrefs-check-candidates.md new file mode 100644 index 0000000..edab201 --- /dev/null +++ b/docs/ahrefs-check-candidates.md @@ -0,0 +1,236 @@ +# Ahrefs Issue Crosswalk + +This crosswalk compares the Ahrefs issue list against Crawlscope's current +rules and identifies checks that fit the gem's sitemap-driven, internal-audit +scope. + +## Current Coverage + +- `noindex_meta`, `noindex_header`: covers "Noindex page" via meta robots and + `X-Robots-Tag`. +- `redirected_page`: covers crawled sitemap URLs that redirect. +- `internal_link_redirects`: covers "Page has links to redirect". +- `low_inbound_anchor_links`: partly covers "Page has only one dofollow + incoming internal link", but currently does not distinguish dofollow from + nofollow. +- `title_too_long`: covers "Title too long". +- `meta_description_too_short`, `meta_description_too_long`: covers meta + description length checks. +- `missing_title`, `missing_meta_description`, `missing_h1`, `multiple_h1`, + `missing_canonical`, `canonical_mismatch`, and + `incomplete_open_graph_tags`: covers adjacent metadata checks not present in + the pasted Ahrefs list. +- `structured_data_parse_error`, `structured_data_schema_error`: covers + schema.org-style validation errors. +- `missing_structured_data`, `missing_job_posting`, `multiple_job_postings`: + covers structured-data presence and domain-specific JobPosting rules. +- `duplicate_title`, `duplicate_meta_description`, + `duplicate_content_fingerprint`, `near_duplicate_content`: covers uniqueness + checks beyond the pasted Ahrefs list. +- `thin_visible_text`, `low_visible_text_ratio`, `low_unique_token_ratio`: + covers content-quality checks beyond the pasted Ahrefs list. +- `unexpected_status`, `fetch_failed`, `broken_internal_link`: covers status + and broken-link checks beyond the pasted Ahrefs list. + +## Candidate Checks To Add + +### High Fit + +1. `nofollow_meta` + - Ahrefs match: "Nofollow page". + - Detect `nofollow` in meta robots and `X-Robots-Tag`. + - This should share parsing with the existing noindex checks so + `noindex,nofollow`, `none`, and scoped directives are handled consistently. + +2. `noindex_follow_meta` / `noindex_follow_header` + - Ahrefs match: "Noindex follow page". + - Detect pages that explicitly combine `noindex` with `follow`. + - This is useful because current Crawlscope reports noindex but does not + classify whether internal links remain followable. + +3. `noindex_nofollow_meta` / `noindex_nofollow_header` + - Ahrefs match: "Noindex and nofollow page". + - Detect pages that block indexing and link following. + - Could also treat `none` as equivalent to `noindex,nofollow`. + +4. `canonical_no_internal_inlinks` + - Ahrefs match: "Canonical URL has no incoming internal links". + - For each sitemap URL, resolve its canonical target and count internal + links to the canonical URL/path. + - This is more precise than `low_inbound_anchor_links`, which counts links + to the crawled URL/final path rather than canonical target. + +5. `nofollow_internal_outlinks` + - Ahrefs match: "Page has nofollow outgoing internal links". + - Extend link extraction to retain `rel` attributes and flag internal + anchors with `rel~="nofollow"`. + - This also unlocks incoming nofollow/dofollow mix checks. + +6. `only_nofollow_internal_inlinks` + - Ahrefs match: "Page has nofollow incoming internal links only". + - Count incoming internal links by follow state. + - Report when a sitemap URL has at least one internal inlink but zero + dofollow internal inlinks. + +7. `mixed_follow_internal_inlinks` + - Ahrefs match: "Page has nofollow and dofollow incoming internal links". + - Count incoming internal links by follow state. + - Report when both counts are positive, with source samples for each class. + +8. `low_dofollow_inlinks` + - Ahrefs match: "Page has only one dofollow incoming internal link". + - Replace or supplement `low_inbound_anchor_links` with a dofollow-specific + count. + - Make the threshold configurable; Ahrefs surfaces "only one", but a host + app may want `minimum_dofollow_inlinks = 2`. + +9. `indexable_page_missing_from_sitemap` + - Ahrefs match: "Indexable page not in sitemap". + - Use discovered internal links to find crawlable, indexable HTML pages that + are not present in the sitemap. + - This requires Crawlscope to keep discovered internal targets, not only + sitemap URLs, so it is a larger links/crawl contract change. + +10. `sitemap_noindex_url` + - Ahrefs adjacent documented issue: "Noindex page in sitemap". + - Current noindex checks run on sitemap pages, but this issue name would + make the sitemap/indexability conflict explicit. + - It can be implemented as a second issue emitted from the indexability + rule, or as a dedicated sitemap consistency rule. + +11. `sitemap_redirect_url` + - Ahrefs match: "3XX redirect" and adjacent "3xx redirect in sitemap". + - Current `redirected_page` covers this behavior; adding this alias/code + would make reports match external audit tools more directly. + +12. `http_internal_link` + - Ahrefs adjacent issue: "HTTPS page has internal links to HTTP". + - Detect internal links from HTTPS pages to HTTP URLs on the same host. + - This is more actionable than relying on redirect detection after fetch. + +13. `canonical_points_to_redirect` + - Ahrefs adjacent issue: "Canonical points to redirect". + - Resolve canonical targets and flag canonical URLs that redirect. + - This fits the existing canonical metadata rule but needs target + resolution similar to the links rule. + +14. `canonical_points_to_error` + - Ahrefs adjacent issues: "Canonical points to 4XX" and "Canonical points + to 5XX". + - Resolve canonical targets and flag non-success responses. + +### Medium Fit + +15. `multiple_title_tags` + - Ahrefs documented issue: "Multiple title tags". + - Current metadata rule checks missing/long/repeated title, but not + multiple `` elements. + +16. `multiple_meta_descriptions` + - Ahrefs documented issue: "Multiple meta description tags". + - Current metadata rule checks missing/short/long description, but not + duplicate description tags on the same page. + +17. `empty_h1` + - Ahrefs documented issue: "H1 tag missing or empty". + - Current `missing_h1` only checks the element count. An empty `<h1>` should + be reported separately or treated as missing. + +18. `page_has_no_outgoing_links` + - Ahrefs documented issue: "Page has no outgoing links". + - Detect indexable HTML pages with zero meaningful outgoing anchors. + - The rule should ignore skipped Rails/CDN/mail/tel links consistently with + the current links rule. + +19. `orphan_page` + - Ahrefs documented issue: "Orphan page". + - Similar to `canonical_no_internal_inlinks`, but for page URL rather than + final canonical URL. + - This overlaps with a configurable `low_dofollow_inlinks` threshold of + one, so avoid double-reporting. + +20. `non_canonical_page_in_sitemap` + - Ahrefs documented issue: "Non-canonical page in sitemap". + - Current `canonical_mismatch` flags the page-level mismatch; this new code + would explicitly state the sitemap contract violation. + +21. `duplicate_pages_without_canonical` + - Ahrefs documented issue: "Duplicate pages without canonical". + - Current uniqueness rules find duplicate content/title/description. This + would connect duplicate clusters to canonical presence and consistency. + +22. `url_double_slash` + - Ahrefs documented issue: "Double slash in URL". + - Detect sitemap or internal-link URLs whose path contains accidental + double slashes. + - Low implementation cost, but lower impact than link/indexability checks. + +23. `url_too_long` + - Ahrefs Page Explorer exposes URL length. + - Useful as a notice-level metadata/url hygiene check. + +24. `structured_data_missing_type` + - Ahrefs structured-data docs call out missing `@type`. + - Current schema validation may catch this for registered schemas, but a + generic issue code would make raw schema.org failures easier to act on. + +25. `structured_data_invalid_type` + - Ahrefs structured-data docs call out invalid schema types. + - Could be implemented only if Crawlscope adopts or vendors a schema.org + vocabulary source; otherwise keep using configured JSON schemas. + +26. `structured_data_invalid_property` + - Ahrefs structured-data docs call out invalid schema properties. + - Same caveat as invalid type: this needs schema.org vocabulary validation, + not only local JSON schemas. + +### Lower Fit Or Requires External Data + +27. `noindex_page_became_indexable` + - Ahrefs match: "Noindex page became indexable". + - Requires historical crawl snapshots. Not a good fit until Crawlscope has + persistence or report comparison. + +28. `h1_changed`, `meta_description_changed`, `title_tag_changed`, + `serp_title_changed` + - Ahrefs match: changed-content issues. + - Requires prior crawl snapshots or external SERP data. Not a current + stateless gem fit. + +29. `page_and_serp_titles_do_not_match` + - Ahrefs match: SERP title mismatch. + - Requires search-result data. Out of scope for a deterministic sitemap + crawler unless a host app injects SERP observations. + +30. `pages_added_to_sitemaps` + - Ahrefs match: "Pages added to sitemaps". + - Requires comparing current sitemap URLs with a prior crawl. + +31. `pages_to_submit_to_indexnow` + - Ahrefs match: "Pages to submit to IndexNow". + - Requires change detection and an IndexNow integration. Better as a host + app workflow than a default Crawlscope rule. + +32. `google_rich_results_validation_error` + - Ahrefs match: "Structured data has Google rich results validation error". + - Crawlscope can validate local schema contracts today, but Google rich + result validation requires Google-specific rule coverage and may diverge + from schema.org validation. Add only if the gem owns those feature + schemas explicitly. + +## Recommended First Batch + +1. Add a shared robots directive parser and emit `nofollow_meta`, + `noindex_follow_*`, and `noindex_nofollow_*`. +2. Extend link extraction with follow state and emit `nofollow_internal_outlinks`, + `only_nofollow_internal_inlinks`, `mixed_follow_internal_inlinks`, and + `low_dofollow_inlinks`. +3. Add canonical target checks: + `canonical_no_internal_inlinks`, `canonical_points_to_redirect`, and + `canonical_points_to_error`. +4. Add sitemap consistency aliases for already-observed conditions: + `sitemap_noindex_url`, `sitemap_redirect_url`, and + `non_canonical_page_in_sitemap`. +5. Add simple metadata hygiene checks: + `multiple_title_tags`, `multiple_meta_descriptions`, and `empty_h1`. + diff --git a/lib/crawlscope/cli.rb b/lib/crawlscope/cli.rb index 41c01bc..39f9ac3 100644 --- a/lib/crawlscope/cli.rb +++ b/lib/crawlscope/cli.rb @@ -37,11 +37,14 @@ def call @err.puts(general_usage) 1 end - rescue OptionParser::InvalidOption, OptionParser::MissingArgument, ConfigurationError, ValidationError, ArgumentError => error + rescue OptionParser::InvalidOption, OptionParser::MissingArgument, ConfigurationError, ArgumentError => error @err.puts(error.message) @err.puts("") @err.puts(general_usage) 1 + rescue ValidationError => error + @err.puts(error.message) + 1 end private diff --git a/lib/crawlscope/crawl.rb b/lib/crawlscope/crawl.rb index f32fdef..c0b7374 100644 --- a/lib/crawlscope/crawl.rb +++ b/lib/crawlscope/crawl.rb @@ -83,6 +83,7 @@ def collect(pages, issues) issues.add(code: :unexpected_status, severity: :error, category: :crawl, url: page.url, message: "HTTP #{page.status}", details: {status: page.status}) elsif redirected?(page) issues.add(code: :redirected_page, severity: :warning, category: :crawl, url: page.url, message: "redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status}) + issues.add(code: :sitemap_redirect_url, severity: :warning, category: :sitemaps, url: page.url, message: "sitemap URL redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status}) end end end @@ -127,6 +128,7 @@ def resolution(page, normalized_url, crawled:) crawled: crawled, error: page.error, final_url: page.normalized_final_url || normalized_url, + html: page.html?, status: page.status } end diff --git a/lib/crawlscope/rake_tasks.rb b/lib/crawlscope/rake_tasks.rb index 5acdbae..e3e20be 100644 --- a/lib/crawlscope/rake_tasks.rb +++ b/lib/crawlscope/rake_tasks.rb @@ -4,25 +4,40 @@ module Crawlscope module RakeTasks module_function - def validate - run("validate") + def validate(url: nil, sitemap_path: nil, rule_names: nil) + run("validate", argv: validate_argv(url: url, sitemap_path: sitemap_path, rule_names: rule_names)) end - def ldjson - run("ldjson") + def ldjson(urls: nil) + run("ldjson", argv: ldjson_argv(urls: urls)) end - def validate_rule(rule) - original_rules = ENV["RULES"] - ENV["RULES"] = rule - validate - ensure - ENV["RULES"] = original_rules + def validate_rule(rule, url: nil, sitemap_path: nil) + validate(url: url, sitemap_path: sitemap_path, rule_names: rule) end - def run(command) - status = Cli.start([command], out: $stdout, err: $stderr) + def run(command, argv: []) + status = Cli.start([command, *argv], out: $stdout, err: $stderr) exit(status) unless status.zero? end + + def validate_argv(url:, sitemap_path:, rule_names:) + [ + option_pair("--url", url), + option_pair("--sitemap", sitemap_path), + option_pair("--rules", rule_names) + ].compact.flatten + end + + def ldjson_argv(urls:) + Array(urls).flat_map { |url| option_pair("--url", url) }.compact + end + + def option_pair(name, value) + value = value.to_s.strip + return if value.empty? + + [name, value] + end end end diff --git a/lib/crawlscope/reporter.rb b/lib/crawlscope/reporter.rb index 48dde93..d159aa9 100644 --- a/lib/crawlscope/reporter.rb +++ b/lib/crawlscope/reporter.rb @@ -20,14 +20,29 @@ def report(result) @io.puts("Status: FAILED") @io.puts("Issues: #{result.issues.size}") + @io.puts("") - result.issues.by_severity.sort_by { |severity, _issues| severity.to_s }.each do |severity, issues| - @io.puts("#{severity}: #{issues.size}") - end + report_grouped_issues("Severity", result.issues.by_severity) + @io.puts("") + report_grouped_issues("Category", result.issues.by_category) + end + + private - result.issues.each do |issue| - @io.puts("- [#{issue.severity}] #{issue.url} #{issue.message}") + def report_grouped_issues(title, grouped_issues) + @io.puts("#{title}:") + + grouped_issues.sort_by { |name, _issues| name.to_s }.each do |name, issues| + @io.puts("#{name}: #{issues.size}") + issues.each do |issue| + @io.puts(" - #{offense(issue)}") + end end end + + def offense(issue) + parts = ["[#{issue.severity}]", issue.code, issue.url, issue.message] + parts.compact.join(" ") + end end end diff --git a/lib/crawlscope/rules/indexability.rb b/lib/crawlscope/rules/indexability.rb index 114d030..06aae36 100644 --- a/lib/crawlscope/rules/indexability.rb +++ b/lib/crawlscope/rules/indexability.rb @@ -13,45 +13,110 @@ def initialize end def call(urls:, pages:, issues:, context: nil) + sitemap_urls = normalized_sitemap_urls(urls) + pages.each do |page| - validate_meta_robots(page, issues) if page.html? - validate_x_robots_tag(page, issues) + validate_meta_robots(page, issues, sitemap_urls) if page.html? + validate_x_robots_tag(page, issues, sitemap_urls) end end private + def normalized_sitemap_urls(urls) + urls.map { |url| Url.normalize(url, base_url: url) }.compact + end + def header_value(page, name) page.headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s end - def noindex?(value) + def directives(value) value .split(",") .map { |directive| directive.split(":", 2).last.to_s.strip } - .any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") } + .reject(&:empty?) + end + + def noindex?(value) + directives(value).any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") } + end + + def follow?(value) + directives(value).any? { |directive| directive.casecmp?("follow") } end - def validate_meta_robots(page, issues) + def nofollow?(value) + directives(value).any? { |directive| directive.casecmp?("nofollow") || directive.casecmp?("none") } + end + + def validate_meta_robots(page, issues, sitemap_urls) page.doc.css(ROBOTS_META_SELECTOR).each do |tag| content = tag["content"].to_s - next unless noindex?(content) - - issues.add( - code: :noindex_meta, - severity: :error, - category: :indexability, - url: page.url, - message: "robots meta tag prevents indexing", - details: {content: content, name: tag["name"].to_s} - ) + + report_noindex_meta(page, issues, content, tag["name"].to_s, sitemap_urls) if noindex?(content) + report_nofollow_meta(page, issues, content, tag["name"].to_s) if nofollow?(content) + report_noindex_follow_meta(page, issues, content, tag["name"].to_s) if noindex?(content) && follow?(content) + report_noindex_nofollow_meta(page, issues, content, tag["name"].to_s) if noindex?(content) && nofollow?(content) end end - def validate_x_robots_tag(page, issues) + def validate_x_robots_tag(page, issues, sitemap_urls) content = header_value(page, X_ROBOTS_TAG_HEADER) - return unless noindex?(content) + return if content.empty? + + report_noindex_header(page, issues, content, sitemap_urls) if noindex?(content) + report_nofollow_header(page, issues, content) if nofollow?(content) + report_noindex_follow_header(page, issues, content) if noindex?(content) && follow?(content) + report_noindex_nofollow_header(page, issues, content) if noindex?(content) && nofollow?(content) + end + def report_noindex_meta(page, issues, content, name, sitemap_urls) + issues.add( + code: :noindex_meta, + severity: :error, + category: :indexability, + url: page.url, + message: "robots meta tag prevents indexing", + details: {content: content, name: name} + ) + report_sitemap_noindex_url(page, issues, content, source: "meta", sitemap_urls: sitemap_urls) + end + + def report_nofollow_meta(page, issues, content, name) + issues.add( + code: :nofollow_meta, + severity: :warning, + category: :indexability, + url: page.url, + message: "robots meta tag prevents following links", + details: {content: content, name: name} + ) + end + + def report_noindex_follow_meta(page, issues, content, name) + issues.add( + code: :noindex_follow_meta, + severity: :warning, + category: :indexability, + url: page.url, + message: "robots meta tag prevents indexing but allows following links", + details: {content: content, name: name} + ) + end + + def report_noindex_nofollow_meta(page, issues, content, name) + issues.add( + code: :noindex_nofollow_meta, + severity: :error, + category: :indexability, + url: page.url, + message: "robots meta tag prevents indexing and following links", + details: {content: content, name: name} + ) + end + + def report_noindex_header(page, issues, content, sitemap_urls) issues.add( code: :noindex_header, severity: :error, @@ -60,6 +125,54 @@ def validate_x_robots_tag(page, issues) message: "X-Robots-Tag header prevents indexing", details: {content: content} ) + report_sitemap_noindex_url(page, issues, content, source: "header", sitemap_urls: sitemap_urls) + end + + def report_nofollow_header(page, issues, content) + issues.add( + code: :nofollow_header, + severity: :warning, + category: :indexability, + url: page.url, + message: "X-Robots-Tag header prevents following links", + details: {content: content} + ) + end + + def report_noindex_follow_header(page, issues, content) + issues.add( + code: :noindex_follow_header, + severity: :warning, + category: :indexability, + url: page.url, + message: "X-Robots-Tag header prevents indexing but allows following links", + details: {content: content} + ) + end + + def report_noindex_nofollow_header(page, issues, content) + issues.add( + code: :noindex_nofollow_header, + severity: :error, + category: :indexability, + url: page.url, + message: "X-Robots-Tag header prevents indexing and following links", + details: {content: content} + ) + end + + def report_sitemap_noindex_url(page, issues, content, source:, sitemap_urls:) + normalized_url = Url.normalize(page.url, base_url: page.url) + return unless sitemap_urls.include?(normalized_url) + + issues.add( + code: :sitemap_noindex_url, + severity: :error, + category: :sitemaps, + url: page.url, + message: "sitemap URL is noindex", + details: {content: content, source: source} + ) end end end diff --git a/lib/crawlscope/rules/links.rb b/lib/crawlscope/rules/links.rb index a17915b..5cf4b27 100644 --- a/lib/crawlscope/rules/links.rb +++ b/lib/crawlscope/rules/links.rb @@ -10,6 +10,7 @@ class Links LINK_SCHEMES_TO_SKIP = ["mailto:", "tel:", "javascript:", "data:"].freeze MAX_SOURCES_IN_ERROR = 3 MIN_INBOUND_ANCHOR_LINKS = 1 + MIN_DOFOLLOW_INBOUND_LINKS = 2 attr_reader :code @@ -24,10 +25,14 @@ def call(urls:, pages:, issues:, context:) @base_host = URI.parse(@base_url).host links = extract_links(pages) - return if links.empty? - + validate_url_hygiene(urls, links, issues) resolved_links = resolve_links(links, issues) + validate_nofollow_outgoing_links(links, issues) + validate_http_internal_links(links, issues) + validate_pages_with_no_outgoing_links(urls, pages, links, issues) + validate_indexable_pages_missing_from_sitemap(urls, resolved_links, issues) validate_inbound_counts(urls, pages, resolved_links, issues) + validate_canonical_targets(urls, pages, resolved_links, issues) end private @@ -64,6 +69,8 @@ def link_for(page:, source_path:, node:) { anchor_text: anchor_text, + http_internal_link: http_internal_link?(page.normalized_url, href), + nofollow: nofollow_link?(node), source_path: source_path, source_url: page.normalized_url, target_path: target_path, @@ -86,6 +93,19 @@ def normalize_anchor_text(text) text.to_s.gsub(/\s+/, " ").strip end + def nofollow_link?(node) + node["rel"].to_s.split(/\s+/).any? { |value| value.casecmp?("nofollow") } + end + + def http_internal_link?(source_url, href) + source_uri = URI.parse(source_url.to_s) + target_uri = URI.parse(URI.join(source_url, href).to_s) + + source_uri.scheme == "https" && target_uri.scheme == "http" && target_uri.host == @base_host + rescue URI::InvalidURIError + false + end + def normalize_internal_link(source_url, href) absolute_url = URI.join(source_url, href).to_s uri = URI.parse(absolute_url) @@ -109,6 +129,36 @@ def report_broken_target(target_url, grouped_links, issues, status) ) end + def validate_nofollow_outgoing_links(links, issues) + links.select { |link| link[:nofollow] }.group_by { |link| link[:source_url] }.each do |source_url, grouped_links| + target_urls = grouped_links.map { |link| link[:target_url] }.uniq.first(MAX_SOURCES_IN_ERROR) + + issues.add( + code: :nofollow_internal_outlinks, + severity: :warning, + category: :links, + url: source_url, + message: "page has nofollow outgoing internal links", + details: {target_urls: target_urls} + ) + end + end + + def validate_http_internal_links(links, issues) + links.select { |link| link[:http_internal_link] }.group_by { |link| link[:source_url] }.each do |source_url, grouped_links| + target_urls = grouped_links.map { |link| link[:target_url] }.uniq.first(MAX_SOURCES_IN_ERROR) + + issues.add( + code: :http_internal_link, + severity: :warning, + category: :links, + url: source_url, + message: "HTTPS page links to internal HTTP URL", + details: {target_urls: target_urls} + ) + end + end + def report_unresolved_target(target_url, grouped_links, issues, resolution) source_urls = grouped_links.map { |link| link[:source_url] }.uniq.first(MAX_SOURCES_IN_ERROR) suffix = (resolution && resolution[:error]) ? " (#{resolution[:error]})" : "" @@ -189,6 +239,10 @@ def ignored_error? resolution && status.nil? && resolution[:crawled] && resolution[:error] end + def html? + resolution && resolution[:html] + end + def status resolution && resolution[:status] end @@ -221,6 +275,7 @@ def validate_inbound_counts(urls, pages, resolved_links, issues) memo[path] = normalized_url end + return if sitemap_paths.size < 2 html_paths = pages.each_with_object(Set.new) do |page, result| next unless page.html? @@ -235,7 +290,11 @@ def validate_inbound_counts(urls, pages, resolved_links, issues) end inbound_anchor_counts = Hash.new(0) + dofollow_inbound_counts = Hash.new(0) + nofollow_inbound_counts = Hash.new(0) sample_sources_by_target = Hash.new { |hash, key| hash[key] = [] } + dofollow_sources_by_target = Hash.new { |hash, key| hash[key] = [] } + nofollow_sources_by_target = Hash.new { |hash, key| hash[key] = [] } resolved_links.each do |link| target_path = link[:final_path] @@ -245,24 +304,268 @@ def validate_inbound_counts(urls, pages, resolved_links, issues) inbound_anchor_counts[target_path] += 1 source_samples = sample_sources_by_target[target_path] source_samples << link[:source_url] unless source_samples.include?(link[:source_url]) + + if link[:nofollow] + nofollow_inbound_counts[target_path] += 1 + nofollow_sources = nofollow_sources_by_target[target_path] + nofollow_sources << link[:source_url] unless nofollow_sources.include?(link[:source_url]) + else + dofollow_inbound_counts[target_path] += 1 + dofollow_sources = dofollow_sources_by_target[target_path] + dofollow_sources << link[:source_url] unless dofollow_sources.include?(link[:source_url]) + end end sitemap_paths.each do |path, target_url| next unless html_paths.include?(path) inbound_count = inbound_anchor_counts[path] - next if inbound_count >= MIN_INBOUND_ANCHOR_LINKS + dofollow_count = dofollow_inbound_counts[path] + nofollow_count = nofollow_inbound_counts[path] + + report_orphan_page(target_url, issues) if inbound_count.zero? + + if inbound_count.positive? && inbound_count < MIN_INBOUND_ANCHOR_LINKS + source_samples = sample_sources_by_target[path].first(MAX_SOURCES_IN_ERROR) + source_info = source_samples.any? ? " (sources: #{source_samples.join(", ")})" : "" + + issues.add( + code: :low_inbound_anchor_links, + severity: :warning, + category: :links, + url: target_url, + message: "inbound anchor links #{inbound_count} below #{MIN_INBOUND_ANCHOR_LINKS}#{source_info}", + details: {inbound_count: inbound_count, minimum: MIN_INBOUND_ANCHOR_LINKS, source_urls: source_samples} + ) + end - source_samples = sample_sources_by_target[path].first(MAX_SOURCES_IN_ERROR) - source_info = source_samples.any? ? " (sources: #{source_samples.join(", ")})" : "" + report_low_dofollow_inlinks(target_url, path, dofollow_count, dofollow_sources_by_target, issues) + report_only_nofollow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources_by_target[path], issues) + report_mixed_follow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources_by_target[path], dofollow_sources_by_target[path], issues) + end + end + + def validate_url_hygiene(urls, links, issues) + checked_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) } + checked_urls.concat(links.map { |link| link[:target_url] }) + + checked_urls.compact.uniq.each do |url| + report_url_double_slash(url, issues) + report_url_too_long(url, issues) + end + end + + def report_url_double_slash(url, issues) + path = URI.parse(url).path.to_s + return unless path.match?(%r{//+}) + + issues.add( + code: :url_double_slash, + severity: :notice, + category: :url, + url: url, + message: "URL path contains duplicate slashes", + details: {path: path} + ) + rescue URI::InvalidURIError + nil + end + + def report_url_too_long(url, issues) + return unless url.length > 2_048 + + issues.add( + code: :url_too_long, + severity: :notice, + category: :url, + url: url, + message: "URL too long (#{url.length})", + details: {length: url.length, maximum: 2_048} + ) + end + + def validate_pages_with_no_outgoing_links(urls, pages, links, issues) + sitemap_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) }.compact.to_set + return if sitemap_urls.size < 2 + + source_paths_with_links = links.map { |link| link[:source_path] }.to_set + + pages.each do |page| + next unless page.html? + next unless sitemap_urls.include?(page.normalized_url) + + source_path = Url.path(page.normalized_url) + next unless crawlable_source_path?(source_path) + next if source_paths_with_links.include?(source_path) issues.add( - code: :low_inbound_anchor_links, + code: :page_has_no_outgoing_links, severity: :warning, category: :links, - url: target_url, - message: "inbound anchor links #{inbound_count} below #{MIN_INBOUND_ANCHOR_LINKS}#{source_info}", - details: {inbound_count: inbound_count, minimum: MIN_INBOUND_ANCHOR_LINKS, source_urls: source_samples} + url: page.url, + message: "page has no outgoing internal links", + details: {} + ) + end + end + + def validate_indexable_pages_missing_from_sitemap(urls, resolved_links, issues) + sitemap_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) }.compact.to_set + reported_urls = Set.new + + resolved_links.each do |link| + final_url = link[:final_url] + next if sitemap_urls.include?(final_url) + next if reported_urls.include?(final_url) + next unless crawlable_path?(link[:final_path]) + + target = resolve_target(final_url) + next unless target.allowed?(@allowed_statuses) && target.html? + + reported_urls << final_url + + issues.add( + code: :indexable_page_missing_from_sitemap, + severity: :warning, + category: :sitemaps, + url: final_url, + message: "indexable internal page is missing from sitemap", + details: {source_url: link[:source_url]} + ) + end + end + + def report_orphan_page(target_url, issues) + issues.add( + code: :orphan_page, + severity: :warning, + category: :links, + url: target_url, + message: "page has no incoming internal links", + details: {} + ) + end + + def report_low_dofollow_inlinks(target_url, path, dofollow_count, sources_by_target, issues) + return if dofollow_count.zero? + return if dofollow_count >= MIN_DOFOLLOW_INBOUND_LINKS + + source_samples = sources_by_target[path].first(MAX_SOURCES_IN_ERROR) + source_info = source_samples.any? ? " (sources: #{source_samples.join(", ")})" : "" + + issues.add( + code: :low_dofollow_inlinks, + severity: :warning, + category: :links, + url: target_url, + message: "dofollow inbound links #{dofollow_count} below #{MIN_DOFOLLOW_INBOUND_LINKS}#{source_info}", + details: {dofollow_inbound_count: dofollow_count, minimum: MIN_DOFOLLOW_INBOUND_LINKS, source_urls: source_samples} + ) + end + + def report_only_nofollow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources, issues) + return unless nofollow_count.positive? && dofollow_count.zero? + + issues.add( + code: :only_nofollow_internal_inlinks, + severity: :warning, + category: :links, + url: target_url, + message: "page has nofollow incoming internal links only", + details: {nofollow_inbound_count: nofollow_count, source_urls: nofollow_sources.first(MAX_SOURCES_IN_ERROR)} + ) + end + + def report_mixed_follow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources, dofollow_sources, issues) + return unless nofollow_count.positive? && dofollow_count.positive? + + issues.add( + code: :mixed_follow_internal_inlinks, + severity: :notice, + category: :links, + url: target_url, + message: "page has nofollow and dofollow incoming internal links", + details: { + dofollow_inbound_count: dofollow_count, + nofollow_inbound_count: nofollow_count, + dofollow_source_urls: dofollow_sources.first(MAX_SOURCES_IN_ERROR), + nofollow_source_urls: nofollow_sources.first(MAX_SOURCES_IN_ERROR) + } + ) + end + + def validate_canonical_targets(urls, pages, resolved_links, issues) + sitemap_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) }.compact + sitemap_pages = pages.select { |page| page.html? && sitemap_urls.include?(page.normalized_url) } + return if sitemap_pages.size < 2 + + dofollow_counts_by_path = dofollow_counts_by_final_path(resolved_links) + + sitemap_pages.each do |page| + canonical_url = canonical_url_for(page) + next if canonical_url.nil? + + target_uri = URI.parse(canonical_url) + next if target_uri.host != @base_host + + canonical_path = Url.path(canonical_url) + if canonical_path && dofollow_counts_by_path[canonical_path].zero? + issues.add( + code: :canonical_no_internal_inlinks, + severity: :warning, + category: :links, + url: canonical_url, + message: "canonical URL has no incoming internal links", + details: {source_url: page.url} + ) + end + + validate_canonical_target_status(page, canonical_url, issues) + rescue URI::InvalidURIError + next + end + end + + def dofollow_counts_by_final_path(resolved_links) + resolved_links.each_with_object(Hash.new(0)) do |link, counts| + next if link[:nofollow] + next if link[:source_path] == link[:final_path] + + counts[link[:final_path]] += 1 + end + end + + def canonical_url_for(page) + canonical = page.doc.at_css('link[rel="canonical"]')&.[]("href").to_s.strip + return if canonical.empty? + + Url.normalize(canonical, base_url: page.url) + end + + def validate_canonical_target_status(page, canonical_url, issues) + target = resolve_target(canonical_url) + + if target.unresolved? || target.ignored_error? + return + end + + if target.redirect? + issues.add( + code: :canonical_points_to_redirect, + severity: :warning, + category: :metadata, + url: page.url, + message: "canonical points to redirect", + details: {canonical: canonical_url, final_url: target.final_url, status: target.status} + ) + elsif !target.allowed?(@allowed_statuses) + issues.add( + code: :canonical_points_to_error, + severity: :warning, + category: :metadata, + url: page.url, + message: "canonical points to HTTP #{target.status}", + details: {canonical: canonical_url, status: target.status} ) end end diff --git a/lib/crawlscope/rules/metadata.rb b/lib/crawlscope/rules/metadata.rb index 9f261f2..883ca58 100644 --- a/lib/crawlscope/rules/metadata.rb +++ b/lib/crawlscope/rules/metadata.rb @@ -18,22 +18,41 @@ def initialize(site_name: nil) end def call(urls:, pages:, issues:, context: nil) + sitemap_urls = normalized_sitemap_urls(urls) + pages.each do |page| next unless page.html? validate_h1(page, issues) validate_title(page, issues) validate_description(page, issues) - validate_canonical(page, issues) + validate_canonical(page, issues, sitemap_urls) validate_open_graph(page, issues) end end private + def normalized_sitemap_urls(urls) + urls.map { |url| Url.normalize(url, base_url: url) }.compact + end + def validate_h1(page, issues) h1s = page.doc.css("h1") - return if h1s.one? + empty_h1s = h1s.select { |node| node.text.to_s.strip.empty? } + + if empty_h1s.any? + issues.add( + code: :empty_h1, + severity: :warning, + category: :metadata, + url: page.url, + message: "empty <h1>", + details: {count: empty_h1s.size} + ) + end + + return if h1s.one? && empty_h1s.empty? if h1s.empty? issues.add( @@ -57,7 +76,19 @@ def validate_h1(page, issues) end def validate_title(page, issues) - title = page.doc.at_css("title")&.text.to_s.strip + titles = page.doc.css("head > title") + title = titles.first&.text.to_s.strip + + if titles.size > 1 + issues.add( + code: :multiple_title_tags, + severity: :warning, + category: :metadata, + url: page.url, + message: "multiple <title> tags (#{titles.size})", + details: {count: titles.size} + ) + end if title.empty? issues.add(code: :missing_title, severity: :warning, category: :metadata, url: page.url, message: "missing <title>", details: {}) @@ -69,7 +100,19 @@ def validate_title(page, issues) end def validate_description(page, issues) - description = page.doc.at_css('meta[name="description"]')&.[]("content").to_s.strip + descriptions = page.doc.css('head > meta[name="description"]') + description = descriptions.first&.[]("content").to_s.strip + + if descriptions.size > 1 + issues.add( + code: :multiple_meta_descriptions, + severity: :warning, + category: :metadata, + url: page.url, + message: "multiple meta description tags (#{descriptions.size})", + details: {count: descriptions.size} + ) + end if description.empty? issues.add(code: :missing_meta_description, severity: :warning, category: :metadata, url: page.url, message: "missing meta description", details: {}) @@ -80,7 +123,7 @@ def validate_description(page, issues) end end - def validate_canonical(page, issues) + def validate_canonical(page, issues, sitemap_urls) canonical = page.doc.at_css('link[rel="canonical"]')&.[]("href").to_s.strip if canonical.empty? @@ -92,13 +135,25 @@ def validate_canonical(page, issues) normalized_page_url = Url.normalize(page.url, base_url: page.url) return if canonical_matches_page?(normalized_canonical, normalized_page_url) + details = {canonical: canonical} issues.add( code: :canonical_mismatch, severity: :warning, category: :metadata, url: page.url, message: "canonical mismatch (#{canonical})", - details: {canonical: canonical} + details: details + ) + + return unless sitemap_urls.include?(normalized_page_url) + + issues.add( + code: :non_canonical_page_in_sitemap, + severity: :warning, + category: :sitemaps, + url: page.url, + message: "non-canonical page is included in sitemap", + details: details ) end diff --git a/lib/crawlscope/rules/structured_data.rb b/lib/crawlscope/rules/structured_data.rb index 1d19719..e04cf9b 100644 --- a/lib/crawlscope/rules/structured_data.rb +++ b/lib/crawlscope/rules/structured_data.rb @@ -55,6 +55,8 @@ def validate_page(page, issues, schema_registry) next end + validate_type_presence(page, source, data, issues) + errors = schema_registry.validate(data) next if errors.empty? @@ -96,6 +98,35 @@ def validate_job_posting_count(page, items, issues) end end + def validate_type_presence(page, source, data, issues) + missing_paths = missing_type_paths(data) + return if missing_paths.empty? + + issues.add( + code: :structured_data_missing_type, + severity: :warning, + category: :structured_data, + url: page.url, + message: "#{source} structured data missing @type", + details: {paths: missing_paths, source: source} + ) + end + + def missing_type_paths(data, path = "$") + return [] unless data.is_a?(Hash) + + paths = [] + paths << path if data["@type"].to_s.strip.empty? + + if data["@graph"].is_a?(Array) + data["@graph"].each_with_index do |entry, index| + paths.concat(missing_type_paths(entry, "#{path}.@graph[#{index}]")) + end + end + + paths + end + def structured_data_types(data) return [] unless data.is_a?(Hash) diff --git a/lib/crawlscope/rules/uniqueness.rb b/lib/crawlscope/rules/uniqueness.rb index 0231398..95519f9 100644 --- a/lib/crawlscope/rules/uniqueness.rb +++ b/lib/crawlscope/rules/uniqueness.rb @@ -58,6 +58,7 @@ def summary_for(page) { content_fingerprint_digest: content_fingerprint_digest(page.doc), + canonical: page.doc.at_css('link[rel="canonical"]')&.[]("href").to_s.strip, description: page.doc.at_css('meta[name="description"]')&.[]("content").to_s.strip, shingles: shingles_for(tokens), title: page.doc.at_css("title")&.text.to_s.strip, @@ -98,6 +99,27 @@ def validate_duplicates(page_summaries, issues) details: {urls: urls} ) end + + duplicate_content_clusters_without_canonical(page_summaries).each do |urls| + issues.add( + code: :duplicate_pages_without_canonical, + severity: :warning, + category: :uniqueness, + url: nil, + message: "duplicate pages without canonical => #{urls.join(", ")}", + details: {urls: urls} + ) + end + end + + def duplicate_content_clusters_without_canonical(page_summaries) + page_summaries + .select { |page| !page[:content_fingerprint_digest].nil? } + .group_by { |page| page[:content_fingerprint_digest] } + .values + .select { |pages| pages.size > 1 } + .select { |pages| pages.any? { |page| page[:canonical].to_s.empty? } } + .map { |pages| pages.map { |page| page[:url] } } end def shingles_for(tokens) diff --git a/lib/crawlscope/sitemap.rb b/lib/crawlscope/sitemap.rb index 38e86da..b2b1eef 100644 --- a/lib/crawlscope/sitemap.rb +++ b/lib/crawlscope/sitemap.rb @@ -25,6 +25,9 @@ def collect_urls(source, base_url:, visited:) visited.add(source) document = Nokogiri::XML(read(source)) root_name = document.root&.name + unless %w[sitemapindex urlset].include?(root_name) + raise ValidationError, "Sitemap #{source} has unexpected root #{root_name.inspect}" + end if root_name == "sitemapindex" document.xpath("//xmlns:sitemap/xmlns:loc", SITEMAP_NAMESPACE).flat_map do |node| @@ -40,7 +43,12 @@ def collect_urls(source, base_url:, visited:) def read(source) if Url.remote?(source) - connection.get(source).body + response = connection.get(source) + unless response.status.to_i.between?(200, 299) + raise ValidationError, "Sitemap #{source} returned HTTP #{response.status}" + end + + response.body else File.read(source) end diff --git a/lib/tasks/crawlscope_tasks.rake b/lib/tasks/crawlscope_tasks.rake index 046475f..80ad80c 100644 --- a/lib/tasks/crawlscope_tasks.rake +++ b/lib/tasks/crawlscope_tasks.rake @@ -1,43 +1,43 @@ namespace :crawlscope do - desc "Validate URLs with all default Crawlscope rules. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY" - task validate: :environment do - Crawlscope::RakeTasks.validate + desc "Validate URLs with all default Crawlscope rules. Args: [url,sitemap,rules]. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY" + task :validate, [:url, :sitemap, :rules] => :environment do |_task, args| + Crawlscope::RakeTasks.validate(url: args[:url], sitemap_path: args[:sitemap], rule_names: args[:rules]) end namespace :validate do - desc "Directly validate JSON-LD on one or more URLs. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1" - task ldjson: :environment do - Crawlscope::RakeTasks.ldjson + desc "Directly validate JSON-LD on one URL. Args: [url]. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1" + task :ldjson, [:url] => :environment do |_task, args| + Crawlscope::RakeTasks.ldjson(urls: args[:url]) end - desc "Validate URLs with the indexability rule. ENV: URL, SITEMAP, JS=1" - task indexability: :environment do - Crawlscope::RakeTasks.validate_rule("indexability") + desc "Validate URLs with the indexability rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1" + task :indexability, [:url, :sitemap] => :environment do |_task, args| + Crawlscope::RakeTasks.validate_rule("indexability", url: args[:url], sitemap_path: args[:sitemap]) end - desc "Validate URLs with the metadata rule. ENV: URL, SITEMAP, JS=1" - task metadata: :environment do - Crawlscope::RakeTasks.validate_rule("metadata") + desc "Validate URLs with the metadata rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1" + task :metadata, [:url, :sitemap] => :environment do |_task, args| + Crawlscope::RakeTasks.validate_rule("metadata", url: args[:url], sitemap_path: args[:sitemap]) end - desc "Validate sitemap URLs with the structured_data rule. ENV: URL, SITEMAP, JS=1" - task structured_data: :environment do - Crawlscope::RakeTasks.validate_rule("structured_data") + desc "Validate sitemap URLs with the structured_data rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1" + task :structured_data, [:url, :sitemap] => :environment do |_task, args| + Crawlscope::RakeTasks.validate_rule("structured_data", url: args[:url], sitemap_path: args[:sitemap]) end - desc "Validate URLs with the uniqueness rule. ENV: URL, SITEMAP, JS=1" - task uniqueness: :environment do - Crawlscope::RakeTasks.validate_rule("uniqueness") + desc "Validate URLs with the uniqueness rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1" + task :uniqueness, [:url, :sitemap] => :environment do |_task, args| + Crawlscope::RakeTasks.validate_rule("uniqueness", url: args[:url], sitemap_path: args[:sitemap]) end - desc "Validate URLs with the content_quality rule. ENV: URL, SITEMAP, JS=1" - task content_quality: :environment do - Crawlscope::RakeTasks.validate_rule("content_quality") + desc "Validate URLs with the content_quality rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1" + task :content_quality, [:url, :sitemap] => :environment do |_task, args| + Crawlscope::RakeTasks.validate_rule("content_quality", url: args[:url], sitemap_path: args[:sitemap]) end - desc "Validate URLs with the links rule. ENV: URL, SITEMAP, JS=1" - task links: :environment do - Crawlscope::RakeTasks.validate_rule("links") + desc "Validate URLs with the links rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1" + task :links, [:url, :sitemap] => :environment do |_task, args| + Crawlscope::RakeTasks.validate_rule("links", url: args[:url], sitemap_path: args[:sitemap]) end end end diff --git a/test/crawlscope/cli_test.rb b/test/crawlscope/cli_test.rb index 6f876d2..b520755 100644 --- a/test/crawlscope/cli_test.rb +++ b/test/crawlscope/cli_test.rb @@ -265,6 +265,7 @@ def test_validation_errors_return_failed_status_without_reraising assert_equal 1, status assert_includes err.string, "No URLs found in sitemap" + refute_includes err.string, "Usage:" end private diff --git a/test/crawlscope/crawl_test.rb b/test/crawlscope/crawl_test.rb index 4b21191..6940107 100644 --- a/test/crawlscope/crawl_test.rb +++ b/test/crawlscope/crawl_test.rb @@ -188,4 +188,30 @@ def fetch(url) assert_equal ["https://example.com/pricing"], fake_browser.urls assert fake_browser.closed end + + def test_reports_sitemap_redirect_url + File.write( + @sitemap_path, + <<~XML + <?xml version="1.0" encoding="UTF-8"?> + <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> + <url><loc>https://example.com/old</loc></url> + </urlset> + XML + ) + + stub_request(:get, "https://example.com/old") + .to_return(status: 301, headers: {"Location" => "https://example.com/new"}, body: "") + stub_request(:get, "https://example.com/new") + .to_return(status: 200, headers: {"Content-Type" => "text/html"}, body: "<html><body>Moved</body></html>") + + result = Crawlscope::Crawl.new( + base_url: "https://example.com", + sitemap_path: @sitemap_path, + rules: [], + schema_registry: Crawlscope::SchemaRegistry.default + ).call + + assert_includes result.issues.to_a.map(&:code), :sitemap_redirect_url + end end diff --git a/test/crawlscope/indexability_rule_test.rb b/test/crawlscope/indexability_rule_test.rb index c5353a0..7174060 100644 --- a/test/crawlscope/indexability_rule_test.rb +++ b/test/crawlscope/indexability_rule_test.rb @@ -20,6 +20,39 @@ def test_reports_meta_noindex assert_equal :noindex_meta, issue.code assert_equal :error, issue.severity assert_equal "noindex, follow", issue.details[:content] + + codes = issues.to_a.map(&:code) + assert_includes codes, :noindex_follow_meta + assert_includes codes, :sitemap_noindex_url + end + + def test_reports_meta_nofollow + issues = Crawlscope::IssueCollection.new + page = page_with( + body: <<~HTML + <html> + <head><meta name="robots" content="nofollow"></head> + <body><main>Visible content</main></body> + </html> + HTML + ) + + Crawlscope::Rules::Indexability.new.call(urls: [page.url], pages: [page], issues: issues) + + assert_equal [:nofollow_meta], issues.to_a.map(&:code) + end + + def test_reports_noindex_nofollow_header + issues = Crawlscope::IssueCollection.new + page = page_with(headers: {"X-Robots-Tag" => "googlebot: noindex, nofollow"}) + + Crawlscope::Rules::Indexability.new.call(urls: [page.url], pages: [page], issues: issues) + + codes = issues.to_a.map(&:code) + assert_includes codes, :noindex_header + assert_includes codes, :nofollow_header + assert_includes codes, :noindex_nofollow_header + assert_includes codes, :sitemap_noindex_url end def test_reports_x_robots_tag_noindex diff --git a/test/crawlscope/links_rule_test.rb b/test/crawlscope/links_rule_test.rb index 8bb8617..6873ca3 100644 --- a/test/crawlscope/links_rule_test.rb +++ b/test/crawlscope/links_rule_test.rb @@ -41,7 +41,7 @@ def test_reports_broken_internal_links context: context ) - assert_equal [:broken_internal_link], issues.to_a.map(&:code) + assert_includes issues.to_a.map(&:code), :broken_internal_link assert_includes issues.to_a.first.message, "HTTP 404" end @@ -114,8 +114,151 @@ def test_reports_low_inbound_anchor_links context: context ) - assert_equal [:low_inbound_anchor_links], issues.to_a.map(&:code) - assert_equal "https://example.com/guide", issues.to_a.first.url + orphan_issue = issues.to_a.find { |item| item.code == :orphan_page } + assert orphan_issue + assert_includes issues.to_a.map(&:code), :low_dofollow_inlinks + assert_equal "https://example.com/guide", orphan_issue.url + end + + def test_reports_pages_with_no_outgoing_internal_links + issues = Crawlscope::IssueCollection.new + + Crawlscope::Rules::Links.new.call( + urls: ["https://example.com/guide", "https://example.com/pricing"], + pages: [ + page(url: "https://example.com/guide", body: "<main><a href=\"/pricing\">Pricing</a></main>"), + page(url: "https://example.com/pricing", body: "<main><p>Pricing</p></main>") + ], + issues: issues, + context: context + ) + + issue = issues.to_a.find { |item| item.code == :page_has_no_outgoing_links } + assert issue + assert_equal "https://example.com/pricing", issue.url + end + + def test_reports_nofollow_outlinks_and_inlink_follow_mix + issues = Crawlscope::IssueCollection.new + + Crawlscope::Rules::Links.new.call( + urls: ["https://example.com/guide", "https://example.com/pricing", "https://example.com/about"], + pages: [ + page(url: "https://example.com/guide", body: "<main><a href=\"/pricing\" rel=\"nofollow\">Pricing</a><a href=\"/about\">About</a></main>"), + page(url: "https://example.com/about", body: "<main><a href=\"/pricing\">Pricing</a></main>"), + page(url: "https://example.com/pricing", body: "<main><p>Pricing</p></main>") + ], + issues: issues, + context: context(resolver: ->(target_url) { {crawled: true, error: nil, final_url: target_url, status: 200} }) + ) + + codes = issues.to_a.map(&:code) + assert_includes codes, :nofollow_internal_outlinks + assert_includes codes, :mixed_follow_internal_inlinks + end + + def test_reports_only_nofollow_internal_inlinks + issues = Crawlscope::IssueCollection.new + + Crawlscope::Rules::Links.new.call( + urls: ["https://example.com/guide", "https://example.com/pricing"], + pages: [ + page(url: "https://example.com/guide", body: "<main><a href=\"/pricing\" rel=\"nofollow\">Pricing</a></main>"), + page(url: "https://example.com/pricing", body: "<main><p>Pricing</p></main>") + ], + issues: issues, + context: context(resolver: ->(target_url) { {crawled: true, error: nil, final_url: target_url, status: 200} }) + ) + + assert_includes issues.to_a.map(&:code), :only_nofollow_internal_inlinks + end + + def test_reports_https_pages_linking_to_internal_http_urls + issues = Crawlscope::IssueCollection.new + + Crawlscope::Rules::Links.new.call( + urls: ["https://example.com/guide"], + pages: [page(url: "https://example.com/guide", body: "<main><a href=\"http://example.com/pricing\">Pricing</a></main>")], + issues: issues, + context: context(resolver: ->(target_url) { {crawled: true, error: nil, final_url: target_url, status: 200} }) + ) + + assert_includes issues.to_a.map(&:code), :http_internal_link + end + + def test_reports_canonical_target_link_issues + issues = Crawlscope::IssueCollection.new + resolver = lambda do |target_url| + redirects = target_url == "https://example.com/canonical-about" + status = redirects ? 301 : 200 + final_url = redirects ? "https://example.com/about" : target_url + {crawled: false, error: nil, final_url: final_url, status: status} + end + + Crawlscope::Rules::Links.new.call( + urls: ["https://example.com/guide", "https://example.com/about"], + pages: [ + page(url: "https://example.com/guide", body: "<main><a href=\"/about\">About</a></main>"), + page( + url: "https://example.com/about", + body: <<~HTML + <html> + <head><link rel="canonical" href="https://example.com/canonical-about"></head> + <body><main><p>About</p></main></body> + </html> + HTML + ) + ], + issues: issues, + context: context(resolver: resolver) + ) + + codes = issues.to_a.map(&:code) + assert_includes codes, :canonical_no_internal_inlinks + assert_includes codes, :canonical_points_to_redirect + end + + def test_reports_indexable_internal_pages_missing_from_sitemap + issues = Crawlscope::IssueCollection.new + resolver = lambda do |target_url| + { + crawled: false, + error: nil, + final_url: target_url, + html: true, + status: 200 + } + end + + Crawlscope::Rules::Links.new.call( + urls: ["https://example.com/guide"], + pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/hidden\">Hidden</a></main>")], + issues: issues, + context: context(resolver: resolver) + ) + + issue = issues.to_a.find { |item| item.code == :indexable_page_missing_from_sitemap } + assert issue + assert_equal "https://example.com/hidden", issue.url + end + + def test_reports_url_hygiene_issues + issues = Crawlscope::IssueCollection.new + long_path = "a" * 2_050 + + Crawlscope::Rules::Links.new.call( + urls: ["https://example.com//bad", "https://example.com/#{long_path}"], + pages: [ + page(url: "https://example.com//bad", body: "<main><a href=\"/ok\">OK</a></main>"), + page(url: "https://example.com/#{long_path}", body: "<main><a href=\"/ok\">OK</a></main>") + ], + issues: issues, + context: context(resolver: ->(target_url) { {crawled: false, error: nil, final_url: target_url, html: true, status: 200} }) + ) + + codes = issues.to_a.map(&:code) + assert_includes codes, :url_double_slash + assert_includes codes, :url_too_long end def test_counts_root_page_links_as_inbound_links @@ -217,6 +360,7 @@ def resolve_target(target_url) crawled: true, error: nil, final_url: target_url, + html: true, status: 200 } when "https://example.com/missing" @@ -224,6 +368,7 @@ def resolve_target(target_url) crawled: false, error: nil, final_url: target_url, + html: false, status: 404 } end diff --git a/test/crawlscope/metadata_rule_test.rb b/test/crawlscope/metadata_rule_test.rb index 694404f..6715c4f 100644 --- a/test/crawlscope/metadata_rule_test.rb +++ b/test/crawlscope/metadata_rule_test.rb @@ -48,6 +48,42 @@ def test_allows_localhost_page_with_matching_production_canonical_path refute_includes issues.to_a.map(&:code), :canonical_mismatch end + def test_reports_multiple_title_multiple_descriptions_empty_h1_and_sitemap_canonical_mismatch + issues = Crawlscope::IssueCollection.new + invalid_page = page( + body: <<~HTML + <html> + <head> + <title>About + Duplicate About + + + + + + + + + +

+ + HTML + ) + + Crawlscope::Rules::Metadata.new.call( + urls: [invalid_page.url], + pages: [invalid_page], + issues: issues + ) + + codes = issues.to_a.map(&:code) + assert_includes codes, :multiple_title_tags + assert_includes codes, :multiple_meta_descriptions + assert_includes codes, :empty_h1 + assert_includes codes, :canonical_mismatch + assert_includes codes, :non_canonical_page_in_sitemap + end + private def page(url: "https://example.com/about", body: nil) diff --git a/test/crawlscope/rake_tasks_test.rb b/test/crawlscope/rake_tasks_test.rb new file mode 100644 index 0000000..e33d829 --- /dev/null +++ b/test/crawlscope/rake_tasks_test.rb @@ -0,0 +1,70 @@ +# frozen_string_literal: true + +require "test_helper" + +class CrawlscopeRakeTasksTest < Minitest::Test + def setup + @original_start = Crawlscope::Cli.method(:start) + end + + def teardown + singleton_class = class << Crawlscope::Cli; self; end + original_start = @original_start + singleton_class.define_method(:start) do |*args, **kwargs| + original_start.call(*args, **kwargs) + end + end + + def test_validate_passes_rake_arguments_to_cli + calls = capture_cli_calls + + Crawlscope::RakeTasks.validate( + url: "http://localhost:3001", + sitemap_path: "http://localhost:3001/sitemap.xml", + rule_names: "metadata,links" + ) + + assert_equal( + ["validate", "--url", "http://localhost:3001", "--sitemap", "http://localhost:3001/sitemap.xml", "--rules", "metadata,links"], + calls.fetch(0).fetch(:argv) + ) + end + + def test_validate_rule_passes_rule_and_rake_arguments_to_cli + calls = capture_cli_calls + + Crawlscope::RakeTasks.validate_rule( + "metadata", + url: "http://localhost:3001", + sitemap_path: "http://localhost:3001/sitemap.xml" + ) + + assert_equal( + ["validate", "--url", "http://localhost:3001", "--sitemap", "http://localhost:3001/sitemap.xml", "--rules", "metadata"], + calls.fetch(0).fetch(:argv) + ) + end + + def test_ldjson_passes_rake_url_argument_to_cli + calls = capture_cli_calls + + Crawlscope::RakeTasks.ldjson(urls: "http://localhost:3001/article") + + assert_equal( + ["ldjson", "--url", "http://localhost:3001/article"], + calls.fetch(0).fetch(:argv) + ) + end + + private + + def capture_cli_calls + calls = [] + singleton_class = class << Crawlscope::Cli; self; end + singleton_class.define_method(:start) do |argv, **kwargs| + calls << {argv: argv, kwargs: kwargs} + 0 + end + calls + end +end diff --git a/test/crawlscope/reporter_test.rb b/test/crawlscope/reporter_test.rb index 9b2d8b2..2fcf75c 100644 --- a/test/crawlscope/reporter_test.rb +++ b/test/crawlscope/reporter_test.rb @@ -23,7 +23,7 @@ def test_reports_ok_result refute_includes output, "Status: FAILED" end - def test_reports_failed_result_with_severity_counts + def test_reports_failed_result_with_grouped_counts_and_offenses io = StringIO.new issues = Crawlscope::IssueCollection.new issues.add(code: :missing_title, severity: :warning, category: :metadata, url: "https://example.com/a", message: "missing ", details: {}) @@ -42,9 +42,13 @@ def test_reports_failed_result_with_severity_counts assert_includes output, "Status: FAILED" assert_includes output, "Issues: 2" + assert_includes output, "Severity:" assert_includes output, "notice: 1" assert_includes output, "warning: 1" - assert_includes output, "- [warning] https://example.com/a missing <title>" - assert_includes output, "- [notice] https://example.com/b broken internal link" + assert_includes output, "Category:" + assert_includes output, "links: 1" + assert_includes output, "metadata: 1" + assert_includes output, " - [warning] missing_title https://example.com/a missing <title>" + assert_includes output, " - [notice] broken_internal_link https://example.com/b broken internal link" end end diff --git a/test/crawlscope/sitemap_test.rb b/test/crawlscope/sitemap_test.rb index e277b1d..33c8151 100644 --- a/test/crawlscope/sitemap_test.rb +++ b/test/crawlscope/sitemap_test.rb @@ -49,6 +49,30 @@ def test_parses_remote_sitemap_index_with_child_sitemap assert_equal ["https://www.example.com/features/reviews"], parser.urls(base_url: "https://www.example.com") end + def test_remote_sitemap_http_error_is_explicit + stub_request(:get, "https://www.example.com/sitemap.xml") + .to_return(status: 500, body: "<html><body>Error</body></html>") + + parser = Crawlscope::Sitemap.new(path: "https://www.example.com/sitemap.xml") + + error = assert_raises(Crawlscope::ValidationError) do + parser.urls(base_url: "https://www.example.com") + end + assert_equal "Sitemap https://www.example.com/sitemap.xml returned HTTP 500", error.message + end + + def test_invalid_sitemap_root_is_explicit + stub_request(:get, "https://www.example.com/sitemap.xml") + .to_return(status: 200, body: "<html><body>Error</body></html>") + + parser = Crawlscope::Sitemap.new(path: "https://www.example.com/sitemap.xml") + + error = assert_raises(Crawlscope::ValidationError) do + parser.urls(base_url: "https://www.example.com") + end + assert_equal 'Sitemap https://www.example.com/sitemap.xml has unexpected root "html"', error.message + end + def test_rebases_remote_sitemap_index_children_to_base_url stub_request(:get, "http://localhost:3000/sitemap.xml") .to_return( diff --git a/test/crawlscope/structured_data_rule_test.rb b/test/crawlscope/structured_data_rule_test.rb index dd9e10c..fc5959c 100644 --- a/test/crawlscope/structured_data_rule_test.rb +++ b/test/crawlscope/structured_data_rule_test.rb @@ -79,6 +79,62 @@ def test_reports_missing_structured_data_for_html_pages assert_equal ["json-ld", "microdata"], issues.to_a.first.details[:expected_sources] end + def test_reports_structured_data_missing_type + issues = Crawlscope::IssueCollection.new + rule = Crawlscope::Rules::StructuredData.new + page = page( + url: "https://example.com/articles/test", + body: <<~HTML + <html> + <head> + <script type="application/ld+json"> + {"@context":"https://schema.org","headline":"Untyped article"} + </script> + </head> + <body><h1>Article</h1></body> + </html> + HTML + ) + + rule.call( + urls: [page.url], + pages: [page], + issues: issues, + context: {schema_registry: Crawlscope::SchemaRegistry.default} + ) + + assert_includes issues.to_a.map(&:code), :structured_data_missing_type + end + + def test_reports_graph_entries_missing_type + issues = Crawlscope::IssueCollection.new + rule = Crawlscope::Rules::StructuredData.new + page = page( + url: "https://example.com/articles/test", + body: <<~HTML + <html> + <head> + <script type="application/ld+json"> + {"@context":"https://schema.org","@type":"WebPage","@graph":[{"name":"Untyped node"}]} + </script> + </head> + <body><h1>Article</h1></body> + </html> + HTML + ) + + rule.call( + urls: [page.url], + pages: [page], + issues: issues, + context: {schema_registry: Crawlscope::SchemaRegistry.default} + ) + + issue = issues.to_a.find { |item| item.code == :structured_data_missing_type } + assert issue + assert_equal ["$.@graph[0]"], issue.details[:paths] + end + def test_validates_job_posting_markup issues = Crawlscope::IssueCollection.new rule = Crawlscope::Rules::StructuredData.new diff --git a/test/crawlscope/uniqueness_rule_test.rb b/test/crawlscope/uniqueness_rule_test.rb index c8f958d..c770353 100644 --- a/test/crawlscope/uniqueness_rule_test.rb +++ b/test/crawlscope/uniqueness_rule_test.rb @@ -13,7 +13,20 @@ def test_reports_duplicate_title_description_and_content rule.call(urls: pages.map(&:url), pages: pages, issues: issues, context: {}) - assert_equal %i[duplicate_content_fingerprint duplicate_meta_description duplicate_title].sort, issues.to_a.map(&:code).sort + assert_equal %i[duplicate_content_fingerprint duplicate_meta_description duplicate_pages_without_canonical duplicate_title].sort, issues.to_a.map(&:code).sort + end + + def test_allows_duplicate_pages_when_canonicals_are_present + issues = Crawlscope::IssueCollection.new + rule = Crawlscope::Rules::Uniqueness.new + pages = [ + page(url: "https://example.com/a", canonical: "https://example.com/a"), + page(url: "https://example.com/b", canonical: "https://example.com/a") + ] + + rule.call(urls: pages.map(&:url), pages: pages, issues: issues, context: {}) + + refute_includes issues.to_a.map(&:code), :duplicate_pages_without_canonical end def test_reports_near_duplicate_content @@ -59,13 +72,15 @@ def near_duplicate_content(adjective) TEXT end - def page(url:, content: nil) + def page(url:, content: nil, canonical: nil) repeated_text = content || ("Useful content " * 30).strip + canonical_tag = canonical ? %(<link rel="canonical" href="#{canonical}">) : "" body = <<~HTML <html> <head> <title>Example Title + #{canonical_tag}
#{repeated_text}