diff --git a/README.md b/README.md
index bb6828b..b53a20d 100644
--- a/README.md
+++ b/README.md
@@ -161,12 +161,18 @@ The same validation surface is also available in the gem repository itself throu
 
 ```bash
 bundle exec rake crawlscope:validate URL=https://example.com
+bundle exec rake 'crawlscope:validate[https://example.com]'
 bundle exec rake crawlscope:validate:metadata URL=https://example.com
+bundle exec rake 'crawlscope:validate:metadata[https://example.com]'
 bundle exec rake crawlscope:validate:ldjson URL=https://example.com/article
+bundle exec rake 'crawlscope:validate:ldjson[https://example.com/article]'
 ```
 
 `crawlscope:validate` runs all default sitemap rules: indexability, metadata, structured data, uniqueness, content quality, and links. `URL` is the site base. Without `SITEMAP`, Crawlscope uses `/sitemap.xml`. With `SITEMAP`, Crawlscope uses `URL` as the site base and validates URLs from that sitemap. `SITEMAP` may be a full URL or a local file path.
 
+Plain `rake` does not pass `--url` style flags to tasks. Use `URL=...` or the
+task-argument form above instead.
+
 `crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap. Without `URL`, it checks the configured base URL, falling back to `http://localhost:3000`.
 
 ### Structured Data URL Audit
diff --git a/Rakefile b/Rakefile
index eac8d3d..840ad74 100644
--- a/Rakefile
+++ b/Rakefile
@@ -183,35 +183,35 @@ namespace :release do
 end
 
 namespace :crawlscope do
-  desc "Validate URLs with all default Crawlscope rules. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
-  task :validate do
-    Crawlscope::RakeTasks.validate
+  desc "Validate URLs with all default Crawlscope rules. Args: [url,sitemap,rules]. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
+  task :validate, [:url, :sitemap, :rules] do |_task, args|
+    Crawlscope::RakeTasks.validate(url: args[:url], sitemap_path: args[:sitemap], rule_names: args[:rules])
   end
 
   namespace :validate do
-    desc "Directly validate JSON-LD on one or more URLs. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
-    task :ldjson do
-      Crawlscope::RakeTasks.ldjson
+    desc "Directly validate JSON-LD on one URL. Args: [url]. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
+    task :ldjson, [:url] do |_task, args|
+      Crawlscope::RakeTasks.ldjson(urls: args[:url])
     end
 
-    desc "Validate URLs with the metadata rule. ENV: URL, SITEMAP, JS=1"
-    task :metadata do
-      Crawlscope::RakeTasks.validate_rule("metadata")
+    desc "Validate URLs with the metadata rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
+    task :metadata, [:url, :sitemap] do |_task, args|
+      Crawlscope::RakeTasks.validate_rule("metadata", url: args[:url], sitemap_path: args[:sitemap])
     end
 
-    desc "Validate sitemap URLs with the structured_data rule. ENV: URL, SITEMAP, JS=1"
-    task :structured_data do
-      Crawlscope::RakeTasks.validate_rule("structured_data")
+    desc "Validate sitemap URLs with the structured_data rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
+    task :structured_data, [:url, :sitemap] do |_task, args|
+      Crawlscope::RakeTasks.validate_rule("structured_data", url: args[:url], sitemap_path: args[:sitemap])
     end
 
-    desc "Validate URLs with the uniqueness rule. ENV: URL, SITEMAP, JS=1"
-    task :uniqueness do
-      Crawlscope::RakeTasks.validate_rule("uniqueness")
+    desc "Validate URLs with the uniqueness rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
+    task :uniqueness, [:url, :sitemap] do |_task, args|
+      Crawlscope::RakeTasks.validate_rule("uniqueness", url: args[:url], sitemap_path: args[:sitemap])
     end
 
-    desc "Validate URLs with the links rule. ENV: URL, SITEMAP, JS=1"
-    task :links do
-      Crawlscope::RakeTasks.validate_rule("links")
+    desc "Validate URLs with the links rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
+    task :links, [:url, :sitemap] do |_task, args|
+      Crawlscope::RakeTasks.validate_rule("links", url: args[:url], sitemap_path: args[:sitemap])
     end
   end
 end
diff --git a/docs/ahrefs-check-candidates.md b/docs/ahrefs-check-candidates.md
new file mode 100644
index 0000000..edab201
--- /dev/null
+++ b/docs/ahrefs-check-candidates.md
@@ -0,0 +1,236 @@
+# Ahrefs Issue Crosswalk
+
+This crosswalk compares the Ahrefs issue list against Crawlscope's current
+rules and identifies checks that fit the gem's sitemap-driven, internal-audit
+scope.
+
+## Current Coverage
+
+- `noindex_meta`, `noindex_header`: covers "Noindex page" via meta robots and
+  `X-Robots-Tag`.
+- `redirected_page`: covers crawled sitemap URLs that redirect.
+- `internal_link_redirects`: covers "Page has links to redirect".
+- `low_inbound_anchor_links`: partly covers "Page has only one dofollow
+  incoming internal link", but currently does not distinguish dofollow from
+  nofollow.
+- `title_too_long`: covers "Title too long".
+- `meta_description_too_short`, `meta_description_too_long`: covers meta
+  description length checks.
+- `missing_title`, `missing_meta_description`, `missing_h1`, `multiple_h1`,
+  `missing_canonical`, `canonical_mismatch`, and
+  `incomplete_open_graph_tags`: covers adjacent metadata checks not present in
+  the pasted Ahrefs list.
+- `structured_data_parse_error`, `structured_data_schema_error`: covers
+  schema.org-style validation errors.
+- `missing_structured_data`, `missing_job_posting`, `multiple_job_postings`:
+  covers structured-data presence and domain-specific JobPosting rules.
+- `duplicate_title`, `duplicate_meta_description`,
+  `duplicate_content_fingerprint`, `near_duplicate_content`: covers uniqueness
+  checks beyond the pasted Ahrefs list.
+- `thin_visible_text`, `low_visible_text_ratio`, `low_unique_token_ratio`:
+  covers content-quality checks beyond the pasted Ahrefs list.
+- `unexpected_status`, `fetch_failed`, `broken_internal_link`: covers status
+  and broken-link checks beyond the pasted Ahrefs list.
+
+## Candidate Checks To Add
+
+### High Fit
+
+1. `nofollow_meta`
+   - Ahrefs match: "Nofollow page".
+   - Detect `nofollow` in meta robots and `X-Robots-Tag`.
+   - This should share parsing with the existing noindex checks so
+     `noindex,nofollow`, `none`, and scoped directives are handled consistently.
+
+2. `noindex_follow_meta` / `noindex_follow_header`
+   - Ahrefs match: "Noindex follow page".
+   - Detect pages that explicitly combine `noindex` with `follow`.
+   - This is useful because current Crawlscope reports noindex but does not
+     classify whether internal links remain followable.
+
+3. `noindex_nofollow_meta` / `noindex_nofollow_header`
+   - Ahrefs match: "Noindex and nofollow page".
+   - Detect pages that block indexing and link following.
+   - Could also treat `none` as equivalent to `noindex,nofollow`.
+
+4. `canonical_no_internal_inlinks`
+   - Ahrefs match: "Canonical URL has no incoming internal links".
+   - For each sitemap URL, resolve its canonical target and count internal
+     links to the canonical URL/path.
+   - This is more precise than `low_inbound_anchor_links`, which counts links
+     to the crawled URL/final path rather than canonical target.
+
+5. `nofollow_internal_outlinks`
+   - Ahrefs match: "Page has nofollow outgoing internal links".
+   - Extend link extraction to retain `rel` attributes and flag internal
+     anchors with `rel~="nofollow"`.
+   - This also unlocks incoming nofollow/dofollow mix checks.
+
+6. `only_nofollow_internal_inlinks`
+   - Ahrefs match: "Page has nofollow incoming internal links only".
+   - Count incoming internal links by follow state.
+   - Report when a sitemap URL has at least one internal inlink but zero
+     dofollow internal inlinks.
+
+7. `mixed_follow_internal_inlinks`
+   - Ahrefs match: "Page has nofollow and dofollow incoming internal links".
+   - Count incoming internal links by follow state.
+   - Report when both counts are positive, with source samples for each class.
+
+8. `low_dofollow_inlinks`
+   - Ahrefs match: "Page has only one dofollow incoming internal link".
+   - Replace or supplement `low_inbound_anchor_links` with a dofollow-specific
+     count.
+   - Make the threshold configurable; Ahrefs surfaces "only one", but a host
+     app may want `minimum_dofollow_inlinks = 2`.
+
+9. `indexable_page_missing_from_sitemap`
+   - Ahrefs match: "Indexable page not in sitemap".
+   - Use discovered internal links to find crawlable, indexable HTML pages that
+     are not present in the sitemap.
+   - This requires Crawlscope to keep discovered internal targets, not only
+     sitemap URLs, so it is a larger links/crawl contract change.
+
+10. `sitemap_noindex_url`
+    - Ahrefs adjacent documented issue: "Noindex page in sitemap".
+    - Current noindex checks run on sitemap pages, but this issue name would
+      make the sitemap/indexability conflict explicit.
+    - It can be implemented as a second issue emitted from the indexability
+      rule, or as a dedicated sitemap consistency rule.
+
+11. `sitemap_redirect_url`
+    - Ahrefs match: "3XX redirect" and adjacent "3xx redirect in sitemap".
+    - Current `redirected_page` covers this behavior; adding this alias/code
+      would make reports match external audit tools more directly.
+
+12. `http_internal_link`
+    - Ahrefs adjacent issue: "HTTPS page has internal links to HTTP".
+    - Detect internal links from HTTPS pages to HTTP URLs on the same host.
+    - This is more actionable than relying on redirect detection after fetch.
+
+13. `canonical_points_to_redirect`
+    - Ahrefs adjacent issue: "Canonical points to redirect".
+    - Resolve canonical targets and flag canonical URLs that redirect.
+    - This fits the existing canonical metadata rule but needs target
+      resolution similar to the links rule.
+
+14. `canonical_points_to_error`
+    - Ahrefs adjacent issues: "Canonical points to 4XX" and "Canonical points
+      to 5XX".
+    - Resolve canonical targets and flag non-success responses.
+
+### Medium Fit
+
+15. `multiple_title_tags`
+    - Ahrefs documented issue: "Multiple title tags".
+    - Current metadata rule checks missing/long/repeated title, but not
+      multiple `<title>` elements.
+
+16. `multiple_meta_descriptions`
+    - Ahrefs documented issue: "Multiple meta description tags".
+    - Current metadata rule checks missing/short/long description, but not
+      duplicate description tags on the same page.
+
+17. `empty_h1`
+    - Ahrefs documented issue: "H1 tag missing or empty".
+    - Current `missing_h1` only checks the element count. An empty `<h1>` should
+      be reported separately or treated as missing.
+
+18. `page_has_no_outgoing_links`
+    - Ahrefs documented issue: "Page has no outgoing links".
+    - Detect indexable HTML pages with zero meaningful outgoing anchors.
+    - The rule should ignore skipped Rails/CDN/mail/tel links consistently with
+      the current links rule.
+
+19. `orphan_page`
+    - Ahrefs documented issue: "Orphan page".
+    - Similar to `canonical_no_internal_inlinks`, but for page URL rather than
+      final canonical URL.
+    - This overlaps with a configurable `low_dofollow_inlinks` threshold of
+      one, so avoid double-reporting.
+
+20. `non_canonical_page_in_sitemap`
+    - Ahrefs documented issue: "Non-canonical page in sitemap".
+    - Current `canonical_mismatch` flags the page-level mismatch; this new code
+      would explicitly state the sitemap contract violation.
+
+21. `duplicate_pages_without_canonical`
+    - Ahrefs documented issue: "Duplicate pages without canonical".
+    - Current uniqueness rules find duplicate content/title/description. This
+      would connect duplicate clusters to canonical presence and consistency.
+
+22. `url_double_slash`
+    - Ahrefs documented issue: "Double slash in URL".
+    - Detect sitemap or internal-link URLs whose path contains accidental
+      double slashes.
+    - Low implementation cost, but lower impact than link/indexability checks.
+
+23. `url_too_long`
+    - Ahrefs Page Explorer exposes URL length.
+    - Useful as a notice-level metadata/url hygiene check.
+
+24. `structured_data_missing_type`
+    - Ahrefs structured-data docs call out missing `@type`.
+    - Current schema validation may catch this for registered schemas, but a
+      generic issue code would make raw schema.org failures easier to act on.
+
+25. `structured_data_invalid_type`
+    - Ahrefs structured-data docs call out invalid schema types.
+    - Could be implemented only if Crawlscope adopts or vendors a schema.org
+      vocabulary source; otherwise keep using configured JSON schemas.
+
+26. `structured_data_invalid_property`
+    - Ahrefs structured-data docs call out invalid schema properties.
+    - Same caveat as invalid type: this needs schema.org vocabulary validation,
+      not only local JSON schemas.
+
+### Lower Fit Or Requires External Data
+
+27. `noindex_page_became_indexable`
+    - Ahrefs match: "Noindex page became indexable".
+    - Requires historical crawl snapshots. Not a good fit until Crawlscope has
+      persistence or report comparison.
+
+28. `h1_changed`, `meta_description_changed`, `title_tag_changed`,
+    `serp_title_changed`
+    - Ahrefs match: changed-content issues.
+    - Requires prior crawl snapshots or external SERP data. Not a current
+      stateless gem fit.
+
+29. `page_and_serp_titles_do_not_match`
+    - Ahrefs match: SERP title mismatch.
+    - Requires search-result data. Out of scope for a deterministic sitemap
+      crawler unless a host app injects SERP observations.
+
+30. `pages_added_to_sitemaps`
+    - Ahrefs match: "Pages added to sitemaps".
+    - Requires comparing current sitemap URLs with a prior crawl.
+
+31. `pages_to_submit_to_indexnow`
+    - Ahrefs match: "Pages to submit to IndexNow".
+    - Requires change detection and an IndexNow integration. Better as a host
+      app workflow than a default Crawlscope rule.
+
+32. `google_rich_results_validation_error`
+    - Ahrefs match: "Structured data has Google rich results validation error".
+    - Crawlscope can validate local schema contracts today, but Google rich
+      result validation requires Google-specific rule coverage and may diverge
+      from schema.org validation. Add only if the gem owns those feature
+      schemas explicitly.
+
+## Recommended First Batch
+
+1. Add a shared robots directive parser and emit `nofollow_meta`,
+   `noindex_follow_*`, and `noindex_nofollow_*`.
+2. Extend link extraction with follow state and emit `nofollow_internal_outlinks`,
+   `only_nofollow_internal_inlinks`, `mixed_follow_internal_inlinks`, and
+   `low_dofollow_inlinks`.
+3. Add canonical target checks:
+   `canonical_no_internal_inlinks`, `canonical_points_to_redirect`, and
+   `canonical_points_to_error`.
+4. Add sitemap consistency aliases for already-observed conditions:
+   `sitemap_noindex_url`, `sitemap_redirect_url`, and
+   `non_canonical_page_in_sitemap`.
+5. Add simple metadata hygiene checks:
+   `multiple_title_tags`, `multiple_meta_descriptions`, and `empty_h1`.
+
diff --git a/lib/crawlscope/cli.rb b/lib/crawlscope/cli.rb
index 41c01bc..39f9ac3 100644
--- a/lib/crawlscope/cli.rb
+++ b/lib/crawlscope/cli.rb
@@ -37,11 +37,14 @@ def call
         @err.puts(general_usage)
         1
       end
-    rescue OptionParser::InvalidOption, OptionParser::MissingArgument, ConfigurationError, ValidationError, ArgumentError => error
+    rescue OptionParser::InvalidOption, OptionParser::MissingArgument, ConfigurationError, ArgumentError => error
       @err.puts(error.message)
       @err.puts("")
       @err.puts(general_usage)
       1
+    rescue ValidationError => error
+      @err.puts(error.message)
+      1
     end
 
     private
diff --git a/lib/crawlscope/crawl.rb b/lib/crawlscope/crawl.rb
index f32fdef..c0b7374 100644
--- a/lib/crawlscope/crawl.rb
+++ b/lib/crawlscope/crawl.rb
@@ -83,6 +83,7 @@ def collect(pages, issues)
           issues.add(code: :unexpected_status, severity: :error, category: :crawl, url: page.url, message: "HTTP #{page.status}", details: {status: page.status})
         elsif redirected?(page)
           issues.add(code: :redirected_page, severity: :warning, category: :crawl, url: page.url, message: "redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status})
+          issues.add(code: :sitemap_redirect_url, severity: :warning, category: :sitemaps, url: page.url, message: "sitemap URL redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status})
         end
       end
     end
@@ -127,6 +128,7 @@ def resolution(page, normalized_url, crawled:)
         crawled: crawled,
         error: page.error,
         final_url: page.normalized_final_url || normalized_url,
+        html: page.html?,
         status: page.status
       }
     end
diff --git a/lib/crawlscope/rake_tasks.rb b/lib/crawlscope/rake_tasks.rb
index 5acdbae..e3e20be 100644
--- a/lib/crawlscope/rake_tasks.rb
+++ b/lib/crawlscope/rake_tasks.rb
@@ -4,25 +4,40 @@ module Crawlscope
   module RakeTasks
     module_function
 
-    def validate
-      run("validate")
+    def validate(url: nil, sitemap_path: nil, rule_names: nil)
+      run("validate", argv: validate_argv(url: url, sitemap_path: sitemap_path, rule_names: rule_names))
     end
 
-    def ldjson
-      run("ldjson")
+    def ldjson(urls: nil)
+      run("ldjson", argv: ldjson_argv(urls: urls))
     end
 
-    def validate_rule(rule)
-      original_rules = ENV["RULES"]
-      ENV["RULES"] = rule
-      validate
-    ensure
-      ENV["RULES"] = original_rules
+    def validate_rule(rule, url: nil, sitemap_path: nil)
+      validate(url: url, sitemap_path: sitemap_path, rule_names: rule)
     end
 
-    def run(command)
-      status = Cli.start([command], out: $stdout, err: $stderr)
+    def run(command, argv: [])
+      status = Cli.start([command, *argv], out: $stdout, err: $stderr)
       exit(status) unless status.zero?
     end
+
+    def validate_argv(url:, sitemap_path:, rule_names:)
+      [
+        option_pair("--url", url),
+        option_pair("--sitemap", sitemap_path),
+        option_pair("--rules", rule_names)
+      ].compact.flatten
+    end
+
+    def ldjson_argv(urls:)
+      Array(urls).flat_map { |url| option_pair("--url", url) }.compact
+    end
+
+    def option_pair(name, value)
+      value = value.to_s.strip
+      return if value.empty?
+
+      [name, value]
+    end
   end
 end
diff --git a/lib/crawlscope/reporter.rb b/lib/crawlscope/reporter.rb
index 48dde93..d159aa9 100644
--- a/lib/crawlscope/reporter.rb
+++ b/lib/crawlscope/reporter.rb
@@ -20,14 +20,29 @@ def report(result)
 
       @io.puts("Status: FAILED")
       @io.puts("Issues: #{result.issues.size}")
+      @io.puts("")
 
-      result.issues.by_severity.sort_by { |severity, _issues| severity.to_s }.each do |severity, issues|
-        @io.puts("#{severity}: #{issues.size}")
-      end
+      report_grouped_issues("Severity", result.issues.by_severity)
+      @io.puts("")
+      report_grouped_issues("Category", result.issues.by_category)
+    end
+
+    private
 
-      result.issues.each do |issue|
-        @io.puts("- [#{issue.severity}] #{issue.url} #{issue.message}")
+    def report_grouped_issues(title, grouped_issues)
+      @io.puts("#{title}:")
+
+      grouped_issues.sort_by { |name, _issues| name.to_s }.each do |name, issues|
+        @io.puts("#{name}: #{issues.size}")
+        issues.each do |issue|
+          @io.puts("  - #{offense(issue)}")
+        end
       end
     end
+
+    def offense(issue)
+      parts = ["[#{issue.severity}]", issue.code, issue.url, issue.message]
+      parts.compact.join(" ")
+    end
   end
 end
diff --git a/lib/crawlscope/rules/indexability.rb b/lib/crawlscope/rules/indexability.rb
index 114d030..06aae36 100644
--- a/lib/crawlscope/rules/indexability.rb
+++ b/lib/crawlscope/rules/indexability.rb
@@ -13,45 +13,110 @@ def initialize
       end
 
       def call(urls:, pages:, issues:, context: nil)
+        sitemap_urls = normalized_sitemap_urls(urls)
+
         pages.each do |page|
-          validate_meta_robots(page, issues) if page.html?
-          validate_x_robots_tag(page, issues)
+          validate_meta_robots(page, issues, sitemap_urls) if page.html?
+          validate_x_robots_tag(page, issues, sitemap_urls)
         end
       end
 
       private
 
+      def normalized_sitemap_urls(urls)
+        urls.map { |url| Url.normalize(url, base_url: url) }.compact
+      end
+
       def header_value(page, name)
         page.headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s
       end
 
-      def noindex?(value)
+      def directives(value)
         value
           .split(",")
           .map { |directive| directive.split(":", 2).last.to_s.strip }
-          .any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") }
+          .reject(&:empty?)
+      end
+
+      def noindex?(value)
+        directives(value).any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") }
+      end
+
+      def follow?(value)
+        directives(value).any? { |directive| directive.casecmp?("follow") }
       end
 
-      def validate_meta_robots(page, issues)
+      def nofollow?(value)
+        directives(value).any? { |directive| directive.casecmp?("nofollow") || directive.casecmp?("none") }
+      end
+
+      def validate_meta_robots(page, issues, sitemap_urls)
         page.doc.css(ROBOTS_META_SELECTOR).each do |tag|
           content = tag["content"].to_s
-          next unless noindex?(content)
-
-          issues.add(
-            code: :noindex_meta,
-            severity: :error,
-            category: :indexability,
-            url: page.url,
-            message: "robots meta tag prevents indexing",
-            details: {content: content, name: tag["name"].to_s}
-          )
+
+          report_noindex_meta(page, issues, content, tag["name"].to_s, sitemap_urls) if noindex?(content)
+          report_nofollow_meta(page, issues, content, tag["name"].to_s) if nofollow?(content)
+          report_noindex_follow_meta(page, issues, content, tag["name"].to_s) if noindex?(content) && follow?(content)
+          report_noindex_nofollow_meta(page, issues, content, tag["name"].to_s) if noindex?(content) && nofollow?(content)
         end
       end
 
-      def validate_x_robots_tag(page, issues)
+      def validate_x_robots_tag(page, issues, sitemap_urls)
         content = header_value(page, X_ROBOTS_TAG_HEADER)
-        return unless noindex?(content)
+        return if content.empty?
+
+        report_noindex_header(page, issues, content, sitemap_urls) if noindex?(content)
+        report_nofollow_header(page, issues, content) if nofollow?(content)
+        report_noindex_follow_header(page, issues, content) if noindex?(content) && follow?(content)
+        report_noindex_nofollow_header(page, issues, content) if noindex?(content) && nofollow?(content)
+      end
 
+      def report_noindex_meta(page, issues, content, name, sitemap_urls)
+        issues.add(
+          code: :noindex_meta,
+          severity: :error,
+          category: :indexability,
+          url: page.url,
+          message: "robots meta tag prevents indexing",
+          details: {content: content, name: name}
+        )
+        report_sitemap_noindex_url(page, issues, content, source: "meta", sitemap_urls: sitemap_urls)
+      end
+
+      def report_nofollow_meta(page, issues, content, name)
+        issues.add(
+          code: :nofollow_meta,
+          severity: :warning,
+          category: :indexability,
+          url: page.url,
+          message: "robots meta tag prevents following links",
+          details: {content: content, name: name}
+        )
+      end
+
+      def report_noindex_follow_meta(page, issues, content, name)
+        issues.add(
+          code: :noindex_follow_meta,
+          severity: :warning,
+          category: :indexability,
+          url: page.url,
+          message: "robots meta tag prevents indexing but allows following links",
+          details: {content: content, name: name}
+        )
+      end
+
+      def report_noindex_nofollow_meta(page, issues, content, name)
+        issues.add(
+          code: :noindex_nofollow_meta,
+          severity: :error,
+          category: :indexability,
+          url: page.url,
+          message: "robots meta tag prevents indexing and following links",
+          details: {content: content, name: name}
+        )
+      end
+
+      def report_noindex_header(page, issues, content, sitemap_urls)
         issues.add(
           code: :noindex_header,
           severity: :error,
@@ -60,6 +125,54 @@ def validate_x_robots_tag(page, issues)
           message: "X-Robots-Tag header prevents indexing",
           details: {content: content}
         )
+        report_sitemap_noindex_url(page, issues, content, source: "header", sitemap_urls: sitemap_urls)
+      end
+
+      def report_nofollow_header(page, issues, content)
+        issues.add(
+          code: :nofollow_header,
+          severity: :warning,
+          category: :indexability,
+          url: page.url,
+          message: "X-Robots-Tag header prevents following links",
+          details: {content: content}
+        )
+      end
+
+      def report_noindex_follow_header(page, issues, content)
+        issues.add(
+          code: :noindex_follow_header,
+          severity: :warning,
+          category: :indexability,
+          url: page.url,
+          message: "X-Robots-Tag header prevents indexing but allows following links",
+          details: {content: content}
+        )
+      end
+
+      def report_noindex_nofollow_header(page, issues, content)
+        issues.add(
+          code: :noindex_nofollow_header,
+          severity: :error,
+          category: :indexability,
+          url: page.url,
+          message: "X-Robots-Tag header prevents indexing and following links",
+          details: {content: content}
+        )
+      end
+
+      def report_sitemap_noindex_url(page, issues, content, source:, sitemap_urls:)
+        normalized_url = Url.normalize(page.url, base_url: page.url)
+        return unless sitemap_urls.include?(normalized_url)
+
+        issues.add(
+          code: :sitemap_noindex_url,
+          severity: :error,
+          category: :sitemaps,
+          url: page.url,
+          message: "sitemap URL is noindex",
+          details: {content: content, source: source}
+        )
       end
     end
   end
diff --git a/lib/crawlscope/rules/links.rb b/lib/crawlscope/rules/links.rb
index a17915b..5cf4b27 100644
--- a/lib/crawlscope/rules/links.rb
+++ b/lib/crawlscope/rules/links.rb
@@ -10,6 +10,7 @@ class Links
       LINK_SCHEMES_TO_SKIP = ["mailto:", "tel:", "javascript:", "data:"].freeze
       MAX_SOURCES_IN_ERROR = 3
       MIN_INBOUND_ANCHOR_LINKS = 1
+      MIN_DOFOLLOW_INBOUND_LINKS = 2
 
       attr_reader :code
 
@@ -24,10 +25,14 @@ def call(urls:, pages:, issues:, context:)
         @base_host = URI.parse(@base_url).host
 
         links = extract_links(pages)
-        return if links.empty?
-
+        validate_url_hygiene(urls, links, issues)
         resolved_links = resolve_links(links, issues)
+        validate_nofollow_outgoing_links(links, issues)
+        validate_http_internal_links(links, issues)
+        validate_pages_with_no_outgoing_links(urls, pages, links, issues)
+        validate_indexable_pages_missing_from_sitemap(urls, resolved_links, issues)
         validate_inbound_counts(urls, pages, resolved_links, issues)
+        validate_canonical_targets(urls, pages, resolved_links, issues)
       end
 
       private
@@ -64,6 +69,8 @@ def link_for(page:, source_path:, node:)
 
         {
           anchor_text: anchor_text,
+          http_internal_link: http_internal_link?(page.normalized_url, href),
+          nofollow: nofollow_link?(node),
           source_path: source_path,
           source_url: page.normalized_url,
           target_path: target_path,
@@ -86,6 +93,19 @@ def normalize_anchor_text(text)
         text.to_s.gsub(/\s+/, " ").strip
       end
 
+      def nofollow_link?(node)
+        node["rel"].to_s.split(/\s+/).any? { |value| value.casecmp?("nofollow") }
+      end
+
+      def http_internal_link?(source_url, href)
+        source_uri = URI.parse(source_url.to_s)
+        target_uri = URI.parse(URI.join(source_url, href).to_s)
+
+        source_uri.scheme == "https" && target_uri.scheme == "http" && target_uri.host == @base_host
+      rescue URI::InvalidURIError
+        false
+      end
+
       def normalize_internal_link(source_url, href)
         absolute_url = URI.join(source_url, href).to_s
         uri = URI.parse(absolute_url)
@@ -109,6 +129,36 @@ def report_broken_target(target_url, grouped_links, issues, status)
         )
       end
 
+      def validate_nofollow_outgoing_links(links, issues)
+        links.select { |link| link[:nofollow] }.group_by { |link| link[:source_url] }.each do |source_url, grouped_links|
+          target_urls = grouped_links.map { |link| link[:target_url] }.uniq.first(MAX_SOURCES_IN_ERROR)
+
+          issues.add(
+            code: :nofollow_internal_outlinks,
+            severity: :warning,
+            category: :links,
+            url: source_url,
+            message: "page has nofollow outgoing internal links",
+            details: {target_urls: target_urls}
+          )
+        end
+      end
+
+      def validate_http_internal_links(links, issues)
+        links.select { |link| link[:http_internal_link] }.group_by { |link| link[:source_url] }.each do |source_url, grouped_links|
+          target_urls = grouped_links.map { |link| link[:target_url] }.uniq.first(MAX_SOURCES_IN_ERROR)
+
+          issues.add(
+            code: :http_internal_link,
+            severity: :warning,
+            category: :links,
+            url: source_url,
+            message: "HTTPS page links to internal HTTP URL",
+            details: {target_urls: target_urls}
+          )
+        end
+      end
+
       def report_unresolved_target(target_url, grouped_links, issues, resolution)
         source_urls = grouped_links.map { |link| link[:source_url] }.uniq.first(MAX_SOURCES_IN_ERROR)
         suffix = (resolution && resolution[:error]) ? " (#{resolution[:error]})" : ""
@@ -189,6 +239,10 @@ def ignored_error?
           resolution && status.nil? && resolution[:crawled] && resolution[:error]
         end
 
+        def html?
+          resolution && resolution[:html]
+        end
+
         def status
           resolution && resolution[:status]
         end
@@ -221,6 +275,7 @@ def validate_inbound_counts(urls, pages, resolved_links, issues)
 
           memo[path] = normalized_url
         end
+        return if sitemap_paths.size < 2
 
         html_paths = pages.each_with_object(Set.new) do |page, result|
           next unless page.html?
@@ -235,7 +290,11 @@ def validate_inbound_counts(urls, pages, resolved_links, issues)
         end
 
         inbound_anchor_counts = Hash.new(0)
+        dofollow_inbound_counts = Hash.new(0)
+        nofollow_inbound_counts = Hash.new(0)
         sample_sources_by_target = Hash.new { |hash, key| hash[key] = [] }
+        dofollow_sources_by_target = Hash.new { |hash, key| hash[key] = [] }
+        nofollow_sources_by_target = Hash.new { |hash, key| hash[key] = [] }
 
         resolved_links.each do |link|
           target_path = link[:final_path]
@@ -245,24 +304,268 @@ def validate_inbound_counts(urls, pages, resolved_links, issues)
           inbound_anchor_counts[target_path] += 1
           source_samples = sample_sources_by_target[target_path]
           source_samples << link[:source_url] unless source_samples.include?(link[:source_url])
+
+          if link[:nofollow]
+            nofollow_inbound_counts[target_path] += 1
+            nofollow_sources = nofollow_sources_by_target[target_path]
+            nofollow_sources << link[:source_url] unless nofollow_sources.include?(link[:source_url])
+          else
+            dofollow_inbound_counts[target_path] += 1
+            dofollow_sources = dofollow_sources_by_target[target_path]
+            dofollow_sources << link[:source_url] unless dofollow_sources.include?(link[:source_url])
+          end
         end
 
         sitemap_paths.each do |path, target_url|
           next unless html_paths.include?(path)
 
           inbound_count = inbound_anchor_counts[path]
-          next if inbound_count >= MIN_INBOUND_ANCHOR_LINKS
+          dofollow_count = dofollow_inbound_counts[path]
+          nofollow_count = nofollow_inbound_counts[path]
+
+          report_orphan_page(target_url, issues) if inbound_count.zero?
+
+          if inbound_count.positive? && inbound_count < MIN_INBOUND_ANCHOR_LINKS
+            source_samples = sample_sources_by_target[path].first(MAX_SOURCES_IN_ERROR)
+            source_info = source_samples.any? ? " (sources: #{source_samples.join(", ")})" : ""
+
+            issues.add(
+              code: :low_inbound_anchor_links,
+              severity: :warning,
+              category: :links,
+              url: target_url,
+              message: "inbound anchor links #{inbound_count} below #{MIN_INBOUND_ANCHOR_LINKS}#{source_info}",
+              details: {inbound_count: inbound_count, minimum: MIN_INBOUND_ANCHOR_LINKS, source_urls: source_samples}
+            )
+          end
 
-          source_samples = sample_sources_by_target[path].first(MAX_SOURCES_IN_ERROR)
-          source_info = source_samples.any? ? " (sources: #{source_samples.join(", ")})" : ""
+          report_low_dofollow_inlinks(target_url, path, dofollow_count, dofollow_sources_by_target, issues)
+          report_only_nofollow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources_by_target[path], issues)
+          report_mixed_follow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources_by_target[path], dofollow_sources_by_target[path], issues)
+        end
+      end
+
+      def validate_url_hygiene(urls, links, issues)
+        checked_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) }
+        checked_urls.concat(links.map { |link| link[:target_url] })
+
+        checked_urls.compact.uniq.each do |url|
+          report_url_double_slash(url, issues)
+          report_url_too_long(url, issues)
+        end
+      end
+
+      def report_url_double_slash(url, issues)
+        path = URI.parse(url).path.to_s
+        return unless path.match?(%r{//+})
+
+        issues.add(
+          code: :url_double_slash,
+          severity: :notice,
+          category: :url,
+          url: url,
+          message: "URL path contains duplicate slashes",
+          details: {path: path}
+        )
+      rescue URI::InvalidURIError
+        nil
+      end
+
+      def report_url_too_long(url, issues)
+        return unless url.length > 2_048
+
+        issues.add(
+          code: :url_too_long,
+          severity: :notice,
+          category: :url,
+          url: url,
+          message: "URL too long (#{url.length})",
+          details: {length: url.length, maximum: 2_048}
+        )
+      end
+
+      def validate_pages_with_no_outgoing_links(urls, pages, links, issues)
+        sitemap_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) }.compact.to_set
+        return if sitemap_urls.size < 2
+
+        source_paths_with_links = links.map { |link| link[:source_path] }.to_set
+
+        pages.each do |page|
+          next unless page.html?
+          next unless sitemap_urls.include?(page.normalized_url)
+
+          source_path = Url.path(page.normalized_url)
+          next unless crawlable_source_path?(source_path)
+          next if source_paths_with_links.include?(source_path)
 
           issues.add(
-            code: :low_inbound_anchor_links,
+            code: :page_has_no_outgoing_links,
             severity: :warning,
             category: :links,
-            url: target_url,
-            message: "inbound anchor links #{inbound_count} below #{MIN_INBOUND_ANCHOR_LINKS}#{source_info}",
-            details: {inbound_count: inbound_count, minimum: MIN_INBOUND_ANCHOR_LINKS, source_urls: source_samples}
+            url: page.url,
+            message: "page has no outgoing internal links",
+            details: {}
+          )
+        end
+      end
+
+      def validate_indexable_pages_missing_from_sitemap(urls, resolved_links, issues)
+        sitemap_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) }.compact.to_set
+        reported_urls = Set.new
+
+        resolved_links.each do |link|
+          final_url = link[:final_url]
+          next if sitemap_urls.include?(final_url)
+          next if reported_urls.include?(final_url)
+          next unless crawlable_path?(link[:final_path])
+
+          target = resolve_target(final_url)
+          next unless target.allowed?(@allowed_statuses) && target.html?
+
+          reported_urls << final_url
+
+          issues.add(
+            code: :indexable_page_missing_from_sitemap,
+            severity: :warning,
+            category: :sitemaps,
+            url: final_url,
+            message: "indexable internal page is missing from sitemap",
+            details: {source_url: link[:source_url]}
+          )
+        end
+      end
+
+      def report_orphan_page(target_url, issues)
+        issues.add(
+          code: :orphan_page,
+          severity: :warning,
+          category: :links,
+          url: target_url,
+          message: "page has no incoming internal links",
+          details: {}
+        )
+      end
+
+      def report_low_dofollow_inlinks(target_url, path, dofollow_count, sources_by_target, issues)
+        return if dofollow_count.zero?
+        return if dofollow_count >= MIN_DOFOLLOW_INBOUND_LINKS
+
+        source_samples = sources_by_target[path].first(MAX_SOURCES_IN_ERROR)
+        source_info = source_samples.any? ? " (sources: #{source_samples.join(", ")})" : ""
+
+        issues.add(
+          code: :low_dofollow_inlinks,
+          severity: :warning,
+          category: :links,
+          url: target_url,
+          message: "dofollow inbound links #{dofollow_count} below #{MIN_DOFOLLOW_INBOUND_LINKS}#{source_info}",
+          details: {dofollow_inbound_count: dofollow_count, minimum: MIN_DOFOLLOW_INBOUND_LINKS, source_urls: source_samples}
+        )
+      end
+
+      def report_only_nofollow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources, issues)
+        return unless nofollow_count.positive? && dofollow_count.zero?
+
+        issues.add(
+          code: :only_nofollow_internal_inlinks,
+          severity: :warning,
+          category: :links,
+          url: target_url,
+          message: "page has nofollow incoming internal links only",
+          details: {nofollow_inbound_count: nofollow_count, source_urls: nofollow_sources.first(MAX_SOURCES_IN_ERROR)}
+        )
+      end
+
+      def report_mixed_follow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources, dofollow_sources, issues)
+        return unless nofollow_count.positive? && dofollow_count.positive?
+
+        issues.add(
+          code: :mixed_follow_internal_inlinks,
+          severity: :notice,
+          category: :links,
+          url: target_url,
+          message: "page has nofollow and dofollow incoming internal links",
+          details: {
+            dofollow_inbound_count: dofollow_count,
+            nofollow_inbound_count: nofollow_count,
+            dofollow_source_urls: dofollow_sources.first(MAX_SOURCES_IN_ERROR),
+            nofollow_source_urls: nofollow_sources.first(MAX_SOURCES_IN_ERROR)
+          }
+        )
+      end
+
+      def validate_canonical_targets(urls, pages, resolved_links, issues)
+        sitemap_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) }.compact
+        sitemap_pages = pages.select { |page| page.html? && sitemap_urls.include?(page.normalized_url) }
+        return if sitemap_pages.size < 2
+
+        dofollow_counts_by_path = dofollow_counts_by_final_path(resolved_links)
+
+        sitemap_pages.each do |page|
+          canonical_url = canonical_url_for(page)
+          next if canonical_url.nil?
+
+          target_uri = URI.parse(canonical_url)
+          next if target_uri.host != @base_host
+
+          canonical_path = Url.path(canonical_url)
+          if canonical_path && dofollow_counts_by_path[canonical_path].zero?
+            issues.add(
+              code: :canonical_no_internal_inlinks,
+              severity: :warning,
+              category: :links,
+              url: canonical_url,
+              message: "canonical URL has no incoming internal links",
+              details: {source_url: page.url}
+            )
+          end
+
+          validate_canonical_target_status(page, canonical_url, issues)
+        rescue URI::InvalidURIError
+          next
+        end
+      end
+
+      def dofollow_counts_by_final_path(resolved_links)
+        resolved_links.each_with_object(Hash.new(0)) do |link, counts|
+          next if link[:nofollow]
+          next if link[:source_path] == link[:final_path]
+
+          counts[link[:final_path]] += 1
+        end
+      end
+
+      def canonical_url_for(page)
+        canonical = page.doc.at_css('link[rel="canonical"]')&.[]("href").to_s.strip
+        return if canonical.empty?
+
+        Url.normalize(canonical, base_url: page.url)
+      end
+
+      def validate_canonical_target_status(page, canonical_url, issues)
+        target = resolve_target(canonical_url)
+
+        if target.unresolved? || target.ignored_error?
+          return
+        end
+
+        if target.redirect?
+          issues.add(
+            code: :canonical_points_to_redirect,
+            severity: :warning,
+            category: :metadata,
+            url: page.url,
+            message: "canonical points to redirect",
+            details: {canonical: canonical_url, final_url: target.final_url, status: target.status}
+          )
+        elsif !target.allowed?(@allowed_statuses)
+          issues.add(
+            code: :canonical_points_to_error,
+            severity: :warning,
+            category: :metadata,
+            url: page.url,
+            message: "canonical points to HTTP #{target.status}",
+            details: {canonical: canonical_url, status: target.status}
           )
         end
       end
diff --git a/lib/crawlscope/rules/metadata.rb b/lib/crawlscope/rules/metadata.rb
index 9f261f2..883ca58 100644
--- a/lib/crawlscope/rules/metadata.rb
+++ b/lib/crawlscope/rules/metadata.rb
@@ -18,22 +18,41 @@ def initialize(site_name: nil)
       end
 
       def call(urls:, pages:, issues:, context: nil)
+        sitemap_urls = normalized_sitemap_urls(urls)
+
         pages.each do |page|
           next unless page.html?
 
           validate_h1(page, issues)
           validate_title(page, issues)
           validate_description(page, issues)
-          validate_canonical(page, issues)
+          validate_canonical(page, issues, sitemap_urls)
           validate_open_graph(page, issues)
         end
       end
 
       private
 
+      def normalized_sitemap_urls(urls)
+        urls.map { |url| Url.normalize(url, base_url: url) }.compact
+      end
+
       def validate_h1(page, issues)
         h1s = page.doc.css("h1")
-        return if h1s.one?
+        empty_h1s = h1s.select { |node| node.text.to_s.strip.empty? }
+
+        if empty_h1s.any?
+          issues.add(
+            code: :empty_h1,
+            severity: :warning,
+            category: :metadata,
+            url: page.url,
+            message: "empty <h1>",
+            details: {count: empty_h1s.size}
+          )
+        end
+
+        return if h1s.one? && empty_h1s.empty?
 
         if h1s.empty?
           issues.add(
@@ -57,7 +76,19 @@ def validate_h1(page, issues)
       end
 
       def validate_title(page, issues)
-        title = page.doc.at_css("title")&.text.to_s.strip
+        titles = page.doc.css("head > title")
+        title = titles.first&.text.to_s.strip
+
+        if titles.size > 1
+          issues.add(
+            code: :multiple_title_tags,
+            severity: :warning,
+            category: :metadata,
+            url: page.url,
+            message: "multiple <title> tags (#{titles.size})",
+            details: {count: titles.size}
+          )
+        end
 
         if title.empty?
           issues.add(code: :missing_title, severity: :warning, category: :metadata, url: page.url, message: "missing <title>", details: {})
@@ -69,7 +100,19 @@ def validate_title(page, issues)
       end
 
       def validate_description(page, issues)
-        description = page.doc.at_css('meta[name="description"]')&.[]("content").to_s.strip
+        descriptions = page.doc.css('head > meta[name="description"]')
+        description = descriptions.first&.[]("content").to_s.strip
+
+        if descriptions.size > 1
+          issues.add(
+            code: :multiple_meta_descriptions,
+            severity: :warning,
+            category: :metadata,
+            url: page.url,
+            message: "multiple meta description tags (#{descriptions.size})",
+            details: {count: descriptions.size}
+          )
+        end
 
         if description.empty?
           issues.add(code: :missing_meta_description, severity: :warning, category: :metadata, url: page.url, message: "missing meta description", details: {})
@@ -80,7 +123,7 @@ def validate_description(page, issues)
         end
       end
 
-      def validate_canonical(page, issues)
+      def validate_canonical(page, issues, sitemap_urls)
         canonical = page.doc.at_css('link[rel="canonical"]')&.[]("href").to_s.strip
 
         if canonical.empty?
@@ -92,13 +135,25 @@ def validate_canonical(page, issues)
         normalized_page_url = Url.normalize(page.url, base_url: page.url)
         return if canonical_matches_page?(normalized_canonical, normalized_page_url)
 
+        details = {canonical: canonical}
         issues.add(
           code: :canonical_mismatch,
           severity: :warning,
           category: :metadata,
           url: page.url,
           message: "canonical mismatch (#{canonical})",
-          details: {canonical: canonical}
+          details: details
+        )
+
+        return unless sitemap_urls.include?(normalized_page_url)
+
+        issues.add(
+          code: :non_canonical_page_in_sitemap,
+          severity: :warning,
+          category: :sitemaps,
+          url: page.url,
+          message: "non-canonical page is included in sitemap",
+          details: details
         )
       end
 
diff --git a/lib/crawlscope/rules/structured_data.rb b/lib/crawlscope/rules/structured_data.rb
index 1d19719..e04cf9b 100644
--- a/lib/crawlscope/rules/structured_data.rb
+++ b/lib/crawlscope/rules/structured_data.rb
@@ -55,6 +55,8 @@ def validate_page(page, issues, schema_registry)
             next
           end
 
+          validate_type_presence(page, source, data, issues)
+
           errors = schema_registry.validate(data)
           next if errors.empty?
 
@@ -96,6 +98,35 @@ def validate_job_posting_count(page, items, issues)
         end
       end
 
+      def validate_type_presence(page, source, data, issues)
+        missing_paths = missing_type_paths(data)
+        return if missing_paths.empty?
+
+        issues.add(
+          code: :structured_data_missing_type,
+          severity: :warning,
+          category: :structured_data,
+          url: page.url,
+          message: "#{source} structured data missing @type",
+          details: {paths: missing_paths, source: source}
+        )
+      end
+
+      def missing_type_paths(data, path = "$")
+        return [] unless data.is_a?(Hash)
+
+        paths = []
+        paths << path if data["@type"].to_s.strip.empty?
+
+        if data["@graph"].is_a?(Array)
+          data["@graph"].each_with_index do |entry, index|
+            paths.concat(missing_type_paths(entry, "#{path}.@graph[#{index}]"))
+          end
+        end
+
+        paths
+      end
+
       def structured_data_types(data)
         return [] unless data.is_a?(Hash)
 
diff --git a/lib/crawlscope/rules/uniqueness.rb b/lib/crawlscope/rules/uniqueness.rb
index 0231398..95519f9 100644
--- a/lib/crawlscope/rules/uniqueness.rb
+++ b/lib/crawlscope/rules/uniqueness.rb
@@ -58,6 +58,7 @@ def summary_for(page)
 
         {
           content_fingerprint_digest: content_fingerprint_digest(page.doc),
+          canonical: page.doc.at_css('link[rel="canonical"]')&.[]("href").to_s.strip,
           description: page.doc.at_css('meta[name="description"]')&.[]("content").to_s.strip,
           shingles: shingles_for(tokens),
           title: page.doc.at_css("title")&.text.to_s.strip,
@@ -98,6 +99,27 @@ def validate_duplicates(page_summaries, issues)
             details: {urls: urls}
           )
         end
+
+        duplicate_content_clusters_without_canonical(page_summaries).each do |urls|
+          issues.add(
+            code: :duplicate_pages_without_canonical,
+            severity: :warning,
+            category: :uniqueness,
+            url: nil,
+            message: "duplicate pages without canonical => #{urls.join(", ")}",
+            details: {urls: urls}
+          )
+        end
+      end
+
+      def duplicate_content_clusters_without_canonical(page_summaries)
+        page_summaries
+          .select { |page| !page[:content_fingerprint_digest].nil? }
+          .group_by { |page| page[:content_fingerprint_digest] }
+          .values
+          .select { |pages| pages.size > 1 }
+          .select { |pages| pages.any? { |page| page[:canonical].to_s.empty? } }
+          .map { |pages| pages.map { |page| page[:url] } }
       end
 
       def shingles_for(tokens)
diff --git a/lib/crawlscope/sitemap.rb b/lib/crawlscope/sitemap.rb
index 38e86da..b2b1eef 100644
--- a/lib/crawlscope/sitemap.rb
+++ b/lib/crawlscope/sitemap.rb
@@ -25,6 +25,9 @@ def collect_urls(source, base_url:, visited:)
       visited.add(source)
       document = Nokogiri::XML(read(source))
       root_name = document.root&.name
+      unless %w[sitemapindex urlset].include?(root_name)
+        raise ValidationError, "Sitemap #{source} has unexpected root #{root_name.inspect}"
+      end
 
       if root_name == "sitemapindex"
         document.xpath("//xmlns:sitemap/xmlns:loc", SITEMAP_NAMESPACE).flat_map do |node|
@@ -40,7 +43,12 @@ def collect_urls(source, base_url:, visited:)
 
     def read(source)
       if Url.remote?(source)
-        connection.get(source).body
+        response = connection.get(source)
+        unless response.status.to_i.between?(200, 299)
+          raise ValidationError, "Sitemap #{source} returned HTTP #{response.status}"
+        end
+
+        response.body
       else
         File.read(source)
       end
diff --git a/lib/tasks/crawlscope_tasks.rake b/lib/tasks/crawlscope_tasks.rake
index 046475f..80ad80c 100644
--- a/lib/tasks/crawlscope_tasks.rake
+++ b/lib/tasks/crawlscope_tasks.rake
@@ -1,43 +1,43 @@
 namespace :crawlscope do
-  desc "Validate URLs with all default Crawlscope rules. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
-  task validate: :environment do
-    Crawlscope::RakeTasks.validate
+  desc "Validate URLs with all default Crawlscope rules. Args: [url,sitemap,rules]. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
+  task :validate, [:url, :sitemap, :rules] => :environment do |_task, args|
+    Crawlscope::RakeTasks.validate(url: args[:url], sitemap_path: args[:sitemap], rule_names: args[:rules])
   end
 
   namespace :validate do
-    desc "Directly validate JSON-LD on one or more URLs. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
-    task ldjson: :environment do
-      Crawlscope::RakeTasks.ldjson
+    desc "Directly validate JSON-LD on one URL. Args: [url]. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
+    task :ldjson, [:url] => :environment do |_task, args|
+      Crawlscope::RakeTasks.ldjson(urls: args[:url])
     end
 
-    desc "Validate URLs with the indexability rule. ENV: URL, SITEMAP, JS=1"
-    task indexability: :environment do
-      Crawlscope::RakeTasks.validate_rule("indexability")
+    desc "Validate URLs with the indexability rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
+    task :indexability, [:url, :sitemap] => :environment do |_task, args|
+      Crawlscope::RakeTasks.validate_rule("indexability", url: args[:url], sitemap_path: args[:sitemap])
     end
 
-    desc "Validate URLs with the metadata rule. ENV: URL, SITEMAP, JS=1"
-    task metadata: :environment do
-      Crawlscope::RakeTasks.validate_rule("metadata")
+    desc "Validate URLs with the metadata rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
+    task :metadata, [:url, :sitemap] => :environment do |_task, args|
+      Crawlscope::RakeTasks.validate_rule("metadata", url: args[:url], sitemap_path: args[:sitemap])
     end
 
-    desc "Validate sitemap URLs with the structured_data rule. ENV: URL, SITEMAP, JS=1"
-    task structured_data: :environment do
-      Crawlscope::RakeTasks.validate_rule("structured_data")
+    desc "Validate sitemap URLs with the structured_data rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
+    task :structured_data, [:url, :sitemap] => :environment do |_task, args|
+      Crawlscope::RakeTasks.validate_rule("structured_data", url: args[:url], sitemap_path: args[:sitemap])
     end
 
-    desc "Validate URLs with the uniqueness rule. ENV: URL, SITEMAP, JS=1"
-    task uniqueness: :environment do
-      Crawlscope::RakeTasks.validate_rule("uniqueness")
+    desc "Validate URLs with the uniqueness rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
+    task :uniqueness, [:url, :sitemap] => :environment do |_task, args|
+      Crawlscope::RakeTasks.validate_rule("uniqueness", url: args[:url], sitemap_path: args[:sitemap])
     end
 
-    desc "Validate URLs with the content_quality rule. ENV: URL, SITEMAP, JS=1"
-    task content_quality: :environment do
-      Crawlscope::RakeTasks.validate_rule("content_quality")
+    desc "Validate URLs with the content_quality rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
+    task :content_quality, [:url, :sitemap] => :environment do |_task, args|
+      Crawlscope::RakeTasks.validate_rule("content_quality", url: args[:url], sitemap_path: args[:sitemap])
     end
 
-    desc "Validate URLs with the links rule. ENV: URL, SITEMAP, JS=1"
-    task links: :environment do
-      Crawlscope::RakeTasks.validate_rule("links")
+    desc "Validate URLs with the links rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
+    task :links, [:url, :sitemap] => :environment do |_task, args|
+      Crawlscope::RakeTasks.validate_rule("links", url: args[:url], sitemap_path: args[:sitemap])
     end
   end
 end
diff --git a/test/crawlscope/cli_test.rb b/test/crawlscope/cli_test.rb
index 6f876d2..b520755 100644
--- a/test/crawlscope/cli_test.rb
+++ b/test/crawlscope/cli_test.rb
@@ -265,6 +265,7 @@ def test_validation_errors_return_failed_status_without_reraising
 
     assert_equal 1, status
     assert_includes err.string, "No URLs found in sitemap"
+    refute_includes err.string, "Usage:"
   end
 
   private
diff --git a/test/crawlscope/crawl_test.rb b/test/crawlscope/crawl_test.rb
index 4b21191..6940107 100644
--- a/test/crawlscope/crawl_test.rb
+++ b/test/crawlscope/crawl_test.rb
@@ -188,4 +188,30 @@ def fetch(url)
     assert_equal ["https://example.com/pricing"], fake_browser.urls
     assert fake_browser.closed
   end
+
+  def test_reports_sitemap_redirect_url
+    File.write(
+      @sitemap_path,
+      <<~XML
+        <?xml version="1.0" encoding="UTF-8"?>
+        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+          <url><loc>https://example.com/old</loc></url>
+        </urlset>
+      XML
+    )
+
+    stub_request(:get, "https://example.com/old")
+      .to_return(status: 301, headers: {"Location" => "https://example.com/new"}, body: "")
+    stub_request(:get, "https://example.com/new")
+      .to_return(status: 200, headers: {"Content-Type" => "text/html"}, body: "<html><body>Moved</body></html>")
+
+    result = Crawlscope::Crawl.new(
+      base_url: "https://example.com",
+      sitemap_path: @sitemap_path,
+      rules: [],
+      schema_registry: Crawlscope::SchemaRegistry.default
+    ).call
+
+    assert_includes result.issues.to_a.map(&:code), :sitemap_redirect_url
+  end
 end
diff --git a/test/crawlscope/indexability_rule_test.rb b/test/crawlscope/indexability_rule_test.rb
index c5353a0..7174060 100644
--- a/test/crawlscope/indexability_rule_test.rb
+++ b/test/crawlscope/indexability_rule_test.rb
@@ -20,6 +20,39 @@ def test_reports_meta_noindex
     assert_equal :noindex_meta, issue.code
     assert_equal :error, issue.severity
     assert_equal "noindex, follow", issue.details[:content]
+
+    codes = issues.to_a.map(&:code)
+    assert_includes codes, :noindex_follow_meta
+    assert_includes codes, :sitemap_noindex_url
+  end
+
+  def test_reports_meta_nofollow
+    issues = Crawlscope::IssueCollection.new
+    page = page_with(
+      body: <<~HTML
+        <html>
+          <head><meta name="robots" content="nofollow"></head>
+          <body><main>Visible content</main></body>
+        </html>
+      HTML
+    )
+
+    Crawlscope::Rules::Indexability.new.call(urls: [page.url], pages: [page], issues: issues)
+
+    assert_equal [:nofollow_meta], issues.to_a.map(&:code)
+  end
+
+  def test_reports_noindex_nofollow_header
+    issues = Crawlscope::IssueCollection.new
+    page = page_with(headers: {"X-Robots-Tag" => "googlebot: noindex, nofollow"})
+
+    Crawlscope::Rules::Indexability.new.call(urls: [page.url], pages: [page], issues: issues)
+
+    codes = issues.to_a.map(&:code)
+    assert_includes codes, :noindex_header
+    assert_includes codes, :nofollow_header
+    assert_includes codes, :noindex_nofollow_header
+    assert_includes codes, :sitemap_noindex_url
   end
 
   def test_reports_x_robots_tag_noindex
diff --git a/test/crawlscope/links_rule_test.rb b/test/crawlscope/links_rule_test.rb
index 8bb8617..6873ca3 100644
--- a/test/crawlscope/links_rule_test.rb
+++ b/test/crawlscope/links_rule_test.rb
@@ -41,7 +41,7 @@ def test_reports_broken_internal_links
       context: context
     )
 
-    assert_equal [:broken_internal_link], issues.to_a.map(&:code)
+    assert_includes issues.to_a.map(&:code), :broken_internal_link
     assert_includes issues.to_a.first.message, "HTTP 404"
   end
 
@@ -114,8 +114,151 @@ def test_reports_low_inbound_anchor_links
       context: context
     )
 
-    assert_equal [:low_inbound_anchor_links], issues.to_a.map(&:code)
-    assert_equal "https://example.com/guide", issues.to_a.first.url
+    orphan_issue = issues.to_a.find { |item| item.code == :orphan_page }
+    assert orphan_issue
+    assert_includes issues.to_a.map(&:code), :low_dofollow_inlinks
+    assert_equal "https://example.com/guide", orphan_issue.url
+  end
+
+  def test_reports_pages_with_no_outgoing_internal_links
+    issues = Crawlscope::IssueCollection.new
+
+    Crawlscope::Rules::Links.new.call(
+      urls: ["https://example.com/guide", "https://example.com/pricing"],
+      pages: [
+        page(url: "https://example.com/guide", body: "<main><a href=\"/pricing\">Pricing</a></main>"),
+        page(url: "https://example.com/pricing", body: "<main><p>Pricing</p></main>")
+      ],
+      issues: issues,
+      context: context
+    )
+
+    issue = issues.to_a.find { |item| item.code == :page_has_no_outgoing_links }
+    assert issue
+    assert_equal "https://example.com/pricing", issue.url
+  end
+
+  def test_reports_nofollow_outlinks_and_inlink_follow_mix
+    issues = Crawlscope::IssueCollection.new
+
+    Crawlscope::Rules::Links.new.call(
+      urls: ["https://example.com/guide", "https://example.com/pricing", "https://example.com/about"],
+      pages: [
+        page(url: "https://example.com/guide", body: "<main><a href=\"/pricing\" rel=\"nofollow\">Pricing</a><a href=\"/about\">About</a></main>"),
+        page(url: "https://example.com/about", body: "<main><a href=\"/pricing\">Pricing</a></main>"),
+        page(url: "https://example.com/pricing", body: "<main><p>Pricing</p></main>")
+      ],
+      issues: issues,
+      context: context(resolver: ->(target_url) { {crawled: true, error: nil, final_url: target_url, status: 200} })
+    )
+
+    codes = issues.to_a.map(&:code)
+    assert_includes codes, :nofollow_internal_outlinks
+    assert_includes codes, :mixed_follow_internal_inlinks
+  end
+
+  def test_reports_only_nofollow_internal_inlinks
+    issues = Crawlscope::IssueCollection.new
+
+    Crawlscope::Rules::Links.new.call(
+      urls: ["https://example.com/guide", "https://example.com/pricing"],
+      pages: [
+        page(url: "https://example.com/guide", body: "<main><a href=\"/pricing\" rel=\"nofollow\">Pricing</a></main>"),
+        page(url: "https://example.com/pricing", body: "<main><p>Pricing</p></main>")
+      ],
+      issues: issues,
+      context: context(resolver: ->(target_url) { {crawled: true, error: nil, final_url: target_url, status: 200} })
+    )
+
+    assert_includes issues.to_a.map(&:code), :only_nofollow_internal_inlinks
+  end
+
+  def test_reports_https_pages_linking_to_internal_http_urls
+    issues = Crawlscope::IssueCollection.new
+
+    Crawlscope::Rules::Links.new.call(
+      urls: ["https://example.com/guide"],
+      pages: [page(url: "https://example.com/guide", body: "<main><a href=\"http://example.com/pricing\">Pricing</a></main>")],
+      issues: issues,
+      context: context(resolver: ->(target_url) { {crawled: true, error: nil, final_url: target_url, status: 200} })
+    )
+
+    assert_includes issues.to_a.map(&:code), :http_internal_link
+  end
+
+  def test_reports_canonical_target_link_issues
+    issues = Crawlscope::IssueCollection.new
+    resolver = lambda do |target_url|
+      redirects = target_url == "https://example.com/canonical-about"
+      status = redirects ? 301 : 200
+      final_url = redirects ? "https://example.com/about" : target_url
+      {crawled: false, error: nil, final_url: final_url, status: status}
+    end
+
+    Crawlscope::Rules::Links.new.call(
+      urls: ["https://example.com/guide", "https://example.com/about"],
+      pages: [
+        page(url: "https://example.com/guide", body: "<main><a href=\"/about\">About</a></main>"),
+        page(
+          url: "https://example.com/about",
+          body: <<~HTML
+            <html>
+              <head><link rel="canonical" href="https://example.com/canonical-about"></head>
+              <body><main><p>About</p></main></body>
+            </html>
+          HTML
+        )
+      ],
+      issues: issues,
+      context: context(resolver: resolver)
+    )
+
+    codes = issues.to_a.map(&:code)
+    assert_includes codes, :canonical_no_internal_inlinks
+    assert_includes codes, :canonical_points_to_redirect
+  end
+
+  def test_reports_indexable_internal_pages_missing_from_sitemap
+    issues = Crawlscope::IssueCollection.new
+    resolver = lambda do |target_url|
+      {
+        crawled: false,
+        error: nil,
+        final_url: target_url,
+        html: true,
+        status: 200
+      }
+    end
+
+    Crawlscope::Rules::Links.new.call(
+      urls: ["https://example.com/guide"],
+      pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/hidden\">Hidden</a></main>")],
+      issues: issues,
+      context: context(resolver: resolver)
+    )
+
+    issue = issues.to_a.find { |item| item.code == :indexable_page_missing_from_sitemap }
+    assert issue
+    assert_equal "https://example.com/hidden", issue.url
+  end
+
+  def test_reports_url_hygiene_issues
+    issues = Crawlscope::IssueCollection.new
+    long_path = "a" * 2_050
+
+    Crawlscope::Rules::Links.new.call(
+      urls: ["https://example.com//bad", "https://example.com/#{long_path}"],
+      pages: [
+        page(url: "https://example.com//bad", body: "<main><a href=\"/ok\">OK</a></main>"),
+        page(url: "https://example.com/#{long_path}", body: "<main><a href=\"/ok\">OK</a></main>")
+      ],
+      issues: issues,
+      context: context(resolver: ->(target_url) { {crawled: false, error: nil, final_url: target_url, html: true, status: 200} })
+    )
+
+    codes = issues.to_a.map(&:code)
+    assert_includes codes, :url_double_slash
+    assert_includes codes, :url_too_long
   end
 
   def test_counts_root_page_links_as_inbound_links
@@ -217,6 +360,7 @@ def resolve_target(target_url)
         crawled: true,
         error: nil,
         final_url: target_url,
+        html: true,
         status: 200
       }
     when "https://example.com/missing"
@@ -224,6 +368,7 @@ def resolve_target(target_url)
         crawled: false,
         error: nil,
         final_url: target_url,
+        html: false,
         status: 404
       }
     end
diff --git a/test/crawlscope/metadata_rule_test.rb b/test/crawlscope/metadata_rule_test.rb
index 694404f..6715c4f 100644
--- a/test/crawlscope/metadata_rule_test.rb
+++ b/test/crawlscope/metadata_rule_test.rb
@@ -48,6 +48,42 @@ def test_allows_localhost_page_with_matching_production_canonical_path
     refute_includes issues.to_a.map(&:code), :canonical_mismatch
   end
 
+  def test_reports_multiple_title_multiple_descriptions_empty_h1_and_sitemap_canonical_mismatch
+    issues = Crawlscope::IssueCollection.new
+    invalid_page = page(
+      body: <<~HTML
+        <html>
+          <head>
+            <title>About</title>
+            <title>Duplicate About</title>
+            <meta name="description" content="A clear description that is long enough for search snippets, local validation checks, and realistic production metadata audits.">
+            <meta name="description" content="Duplicate description">
+            <link rel="canonical" href="https://example.com/canonical-about">
+            <meta property="og:title" content="About">
+            <meta property="og:description" content="About page">
+            <meta property="og:url" content="https://example.com/about">
+            <meta property="og:type" content="website">
+            <meta property="og:image" content="https://example.com/icon.png">
+          </head>
+          <body><main><h1> </h1></main></body>
+        </html>
+      HTML
+    )
+
+    Crawlscope::Rules::Metadata.new.call(
+      urls: [invalid_page.url],
+      pages: [invalid_page],
+      issues: issues
+    )
+
+    codes = issues.to_a.map(&:code)
+    assert_includes codes, :multiple_title_tags
+    assert_includes codes, :multiple_meta_descriptions
+    assert_includes codes, :empty_h1
+    assert_includes codes, :canonical_mismatch
+    assert_includes codes, :non_canonical_page_in_sitemap
+  end
+
   private
 
   def page(url: "https://example.com/about", body: nil)
diff --git a/test/crawlscope/rake_tasks_test.rb b/test/crawlscope/rake_tasks_test.rb
new file mode 100644
index 0000000..e33d829
--- /dev/null
+++ b/test/crawlscope/rake_tasks_test.rb
@@ -0,0 +1,70 @@
+# frozen_string_literal: true
+
+require "test_helper"
+
+class CrawlscopeRakeTasksTest < Minitest::Test
+  def setup
+    @original_start = Crawlscope::Cli.method(:start)
+  end
+
+  def teardown
+    singleton_class = class << Crawlscope::Cli; self; end
+    original_start = @original_start
+    singleton_class.define_method(:start) do |*args, **kwargs|
+      original_start.call(*args, **kwargs)
+    end
+  end
+
+  def test_validate_passes_rake_arguments_to_cli
+    calls = capture_cli_calls
+
+    Crawlscope::RakeTasks.validate(
+      url: "http://localhost:3001",
+      sitemap_path: "http://localhost:3001/sitemap.xml",
+      rule_names: "metadata,links"
+    )
+
+    assert_equal(
+      ["validate", "--url", "http://localhost:3001", "--sitemap", "http://localhost:3001/sitemap.xml", "--rules", "metadata,links"],
+      calls.fetch(0).fetch(:argv)
+    )
+  end
+
+  def test_validate_rule_passes_rule_and_rake_arguments_to_cli
+    calls = capture_cli_calls
+
+    Crawlscope::RakeTasks.validate_rule(
+      "metadata",
+      url: "http://localhost:3001",
+      sitemap_path: "http://localhost:3001/sitemap.xml"
+    )
+
+    assert_equal(
+      ["validate", "--url", "http://localhost:3001", "--sitemap", "http://localhost:3001/sitemap.xml", "--rules", "metadata"],
+      calls.fetch(0).fetch(:argv)
+    )
+  end
+
+  def test_ldjson_passes_rake_url_argument_to_cli
+    calls = capture_cli_calls
+
+    Crawlscope::RakeTasks.ldjson(urls: "http://localhost:3001/article")
+
+    assert_equal(
+      ["ldjson", "--url", "http://localhost:3001/article"],
+      calls.fetch(0).fetch(:argv)
+    )
+  end
+
+  private
+
+  def capture_cli_calls
+    calls = []
+    singleton_class = class << Crawlscope::Cli; self; end
+    singleton_class.define_method(:start) do |argv, **kwargs|
+      calls << {argv: argv, kwargs: kwargs}
+      0
+    end
+    calls
+  end
+end
diff --git a/test/crawlscope/reporter_test.rb b/test/crawlscope/reporter_test.rb
index 9b2d8b2..2fcf75c 100644
--- a/test/crawlscope/reporter_test.rb
+++ b/test/crawlscope/reporter_test.rb
@@ -23,7 +23,7 @@ def test_reports_ok_result
     refute_includes output, "Status: FAILED"
   end
 
-  def test_reports_failed_result_with_severity_counts
+  def test_reports_failed_result_with_grouped_counts_and_offenses
     io = StringIO.new
     issues = Crawlscope::IssueCollection.new
     issues.add(code: :missing_title, severity: :warning, category: :metadata, url: "https://example.com/a", message: "missing <title>", details: {})
@@ -42,9 +42,13 @@ def test_reports_failed_result_with_severity_counts
 
     assert_includes output, "Status: FAILED"
     assert_includes output, "Issues: 2"
+    assert_includes output, "Severity:"
     assert_includes output, "notice: 1"
     assert_includes output, "warning: 1"
-    assert_includes output, "- [warning] https://example.com/a missing <title>"
-    assert_includes output, "- [notice] https://example.com/b broken internal link"
+    assert_includes output, "Category:"
+    assert_includes output, "links: 1"
+    assert_includes output, "metadata: 1"
+    assert_includes output, "  - [warning] missing_title https://example.com/a missing <title>"
+    assert_includes output, "  - [notice] broken_internal_link https://example.com/b broken internal link"
   end
 end
diff --git a/test/crawlscope/sitemap_test.rb b/test/crawlscope/sitemap_test.rb
index e277b1d..33c8151 100644
--- a/test/crawlscope/sitemap_test.rb
+++ b/test/crawlscope/sitemap_test.rb
@@ -49,6 +49,30 @@ def test_parses_remote_sitemap_index_with_child_sitemap
     assert_equal ["https://www.example.com/features/reviews"], parser.urls(base_url: "https://www.example.com")
   end
 
+  def test_remote_sitemap_http_error_is_explicit
+    stub_request(:get, "https://www.example.com/sitemap.xml")
+      .to_return(status: 500, body: "<html><body>Error</body></html>")
+
+    parser = Crawlscope::Sitemap.new(path: "https://www.example.com/sitemap.xml")
+
+    error = assert_raises(Crawlscope::ValidationError) do
+      parser.urls(base_url: "https://www.example.com")
+    end
+    assert_equal "Sitemap https://www.example.com/sitemap.xml returned HTTP 500", error.message
+  end
+
+  def test_invalid_sitemap_root_is_explicit
+    stub_request(:get, "https://www.example.com/sitemap.xml")
+      .to_return(status: 200, body: "<html><body>Error</body></html>")
+
+    parser = Crawlscope::Sitemap.new(path: "https://www.example.com/sitemap.xml")
+
+    error = assert_raises(Crawlscope::ValidationError) do
+      parser.urls(base_url: "https://www.example.com")
+    end
+    assert_equal 'Sitemap https://www.example.com/sitemap.xml has unexpected root "html"', error.message
+  end
+
   def test_rebases_remote_sitemap_index_children_to_base_url
     stub_request(:get, "http://localhost:3000/sitemap.xml")
       .to_return(
diff --git a/test/crawlscope/structured_data_rule_test.rb b/test/crawlscope/structured_data_rule_test.rb
index dd9e10c..fc5959c 100644
--- a/test/crawlscope/structured_data_rule_test.rb
+++ b/test/crawlscope/structured_data_rule_test.rb
@@ -79,6 +79,62 @@ def test_reports_missing_structured_data_for_html_pages
     assert_equal ["json-ld", "microdata"], issues.to_a.first.details[:expected_sources]
   end
 
+  def test_reports_structured_data_missing_type
+    issues = Crawlscope::IssueCollection.new
+    rule = Crawlscope::Rules::StructuredData.new
+    page = page(
+      url: "https://example.com/articles/test",
+      body: <<~HTML
+        <html>
+          <head>
+            <script type="application/ld+json">
+              {"@context":"https://schema.org","headline":"Untyped article"}
+            </script>
+          </head>
+          <body><h1>Article</h1></body>
+        </html>
+      HTML
+    )
+
+    rule.call(
+      urls: [page.url],
+      pages: [page],
+      issues: issues,
+      context: {schema_registry: Crawlscope::SchemaRegistry.default}
+    )
+
+    assert_includes issues.to_a.map(&:code), :structured_data_missing_type
+  end
+
+  def test_reports_graph_entries_missing_type
+    issues = Crawlscope::IssueCollection.new
+    rule = Crawlscope::Rules::StructuredData.new
+    page = page(
+      url: "https://example.com/articles/test",
+      body: <<~HTML
+        <html>
+          <head>
+            <script type="application/ld+json">
+              {"@context":"https://schema.org","@type":"WebPage","@graph":[{"name":"Untyped node"}]}
+            </script>
+          </head>
+          <body><h1>Article</h1></body>
+        </html>
+      HTML
+    )
+
+    rule.call(
+      urls: [page.url],
+      pages: [page],
+      issues: issues,
+      context: {schema_registry: Crawlscope::SchemaRegistry.default}
+    )
+
+    issue = issues.to_a.find { |item| item.code == :structured_data_missing_type }
+    assert issue
+    assert_equal ["$.@graph[0]"], issue.details[:paths]
+  end
+
   def test_validates_job_posting_markup
     issues = Crawlscope::IssueCollection.new
     rule = Crawlscope::Rules::StructuredData.new
diff --git a/test/crawlscope/uniqueness_rule_test.rb b/test/crawlscope/uniqueness_rule_test.rb
index c8f958d..c770353 100644
--- a/test/crawlscope/uniqueness_rule_test.rb
+++ b/test/crawlscope/uniqueness_rule_test.rb
@@ -13,7 +13,20 @@ def test_reports_duplicate_title_description_and_content
 
     rule.call(urls: pages.map(&:url), pages: pages, issues: issues, context: {})
 
-    assert_equal %i[duplicate_content_fingerprint duplicate_meta_description duplicate_title].sort, issues.to_a.map(&:code).sort
+    assert_equal %i[duplicate_content_fingerprint duplicate_meta_description duplicate_pages_without_canonical duplicate_title].sort, issues.to_a.map(&:code).sort
+  end
+
+  def test_allows_duplicate_pages_when_canonicals_are_present
+    issues = Crawlscope::IssueCollection.new
+    rule = Crawlscope::Rules::Uniqueness.new
+    pages = [
+      page(url: "https://example.com/a", canonical: "https://example.com/a"),
+      page(url: "https://example.com/b", canonical: "https://example.com/a")
+    ]
+
+    rule.call(urls: pages.map(&:url), pages: pages, issues: issues, context: {})
+
+    refute_includes issues.to_a.map(&:code), :duplicate_pages_without_canonical
   end
 
   def test_reports_near_duplicate_content
@@ -59,13 +72,15 @@ def near_duplicate_content(adjective)
     TEXT
   end
 
-  def page(url:, content: nil)
+  def page(url:, content: nil, canonical: nil)
     repeated_text = content || ("Useful content " * 30).strip
+    canonical_tag = canonical ? %(<link rel="canonical" href="#{canonical}">) : ""
     body = <<~HTML
       <html>
         <head>
           <title>Example Title</title>
           <meta name="description" content="Example description">
+          #{canonical_tag}
         </head>
         <body>
           <main>#{repeated_text}</main>