From 4198d0ce008039b23c1478a8862c0d2b3679f67e Mon Sep 17 00:00:00 2001
From: Arber Xhindoli <14798762+arberx@users.noreply.github.com>
Date: Wed, 17 Jun 2026 10:17:46 -0400
Subject: [PATCH] fix: XML-entity-decode sitemap <loc> URLs (#50)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Spec-compliant sitemaps escape `&` in `<loc>` URLs as `&amp;` per
sitemaps.org. parseSitemapXml passed the literal `...&amp;...` to the
fetcher, which the origin treats as a different (usually empty) request.
On a sitemap index — where every child <loc> carries query params
(BigCommerce, paginated CMS sitemaps) — every child fetch failed and the
audit aborted with "No auditable URLs found in sitemap."; flat <urlset>
pages were silently dropped.

Both the urlset and sitemapindex branches now decode the five predefined
XML entities plus decimal/hex numeric character references (&amp; last,
out-of-range refs left untouched so a malformed sitemap can't throw).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 CHANGELOG.md         |  5 ++++
 package.json         |  2 +-
 src/sitemap.ts       | 32 ++++++++++++++++++++++++--
 test/sitemap.test.ts | 54 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 90 insertions(+), 3 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 62c0072..1c96238 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog
 
+## 4.0.1 (2026-06-17)
+
+### Fixed
+- **Sitemap `<loc>` URLs are now XML-entity-decoded (issue #50).** Per the [sitemaps.org spec](https://www.sitemaps.org/protocol.html#escaping), a `&` inside a URL must be written `&amp;`, so any spec-compliant `<loc>` with a multi-param query string (`?type=pages&amp;page=1`) arrives entity-escaped. `parseSitemapXml` previously passed the literal `...&amp;...` to the fetcher, which the origin treats as a different (usually empty) request. On a **sitemap index** — where every child `<loc>` carries query params (BigCommerce, many paginated CMS sitemaps) — every child fetch failed and the audit aborted with `BAD_INPUT: No auditable URLs found in sitemap.`; on a flat `<urlset>` the affected pages were silently dropped. Both the `urlset` and `sitemapindex` branches now decode the five predefined XML entities plus decimal/hex numeric character references (with `&amp;` resolved last). No API or scoring change.
+
 ## 4.0.0 (2026-06-09)
 
 ### Breaking
diff --git a/package.json b/package.json
index c39b67a..1fb3508 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@ainyc/aeo-audit",
-  "version": "4.0.0",
+  "version": "4.0.1",
   "description": "The most comprehensive open-source Answer Engine Optimization (AEO) audit tool. Scores websites across 16 ranking factors that determine AI citation.",
   "type": "module",
   "main": "./dist/index.js",
diff --git a/src/sitemap.ts b/src/sitemap.ts
index df236b7..600cdaf 100644
--- a/src/sitemap.ts
+++ b/src/sitemap.ts
@@ -58,6 +58,34 @@ interface SitemapEntry {
   priority?: number
 }
 
+/** Replacement text for the five entities predefined by XML 1.0. */
+const XML_NAMED_ENTITIES = new Map([['lt', '<'], ['gt', '>'], ['quot', '"'], ['apos', "'"], ['amp', '&']])
+
+/**
+ * Decode the five predefined XML entities plus decimal/hex numeric character
+ * references in a `<loc>` value. sitemaps.org (#escaping) requires a `&` inside a
+ * URL to be written `&amp;`; without decoding, the fetcher requests the literal
+ * `...&amp;...` and a sitemap index ends up yielding zero URLs (issue #50).
+ * Every reference is resolved in ONE pass so decoded output is never re-scanned:
+ * `&amp;lt;` yields `&lt;` and `&#38;amp;` yields `&amp;` (chained replaces would
+ * decode the latter twice, down to `&`). Unrecognised names and out-of-range
+ * numeric refs are left untouched, so a malformed sitemap never throws here.
+ */
+function decodeXmlEntities(value: string): string {
+  return value.replace(/&(?:#(\d+)|#x([0-9a-fA-F]+)|(lt|gt|quot|apos|amp));/g, (match, dec, hex, name) => {
+    if (dec !== undefined) return codePointToChar(Number(dec), match)
+    if (hex !== undefined) return codePointToChar(parseInt(hex, 16), match)
+    return XML_NAMED_ENTITIES.get(name) ?? match
+  })
+}
+
+/** Map a code point to its character; return `original` when it is outside 0..0x10FFFF. */
+function codePointToChar(codePoint: number, original: string): string {
+  return Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff
+    ? String.fromCodePoint(codePoint)
+    : original
+}
+
 function parseSitemapXml(xml: string): SitemapEntry[] {
   const entries: SitemapEntry[] = []
 
@@ -69,7 +97,7 @@ function parseSitemapXml(xml: string): SitemapEntry[] {
     const locMatch = block.match(/<loc\b[^>]*>([\s\S]*?)<\/loc>/i)
     if (!locMatch) continue
 
-    const loc = locMatch[1].trim()
+    const loc = decodeXmlEntities(locMatch[1].trim())
     if (!loc) continue
 
     const priorityMatch = block.match(/<priority\b[^>]*>([\s\S]*?)<\/priority>/i)
@@ -83,7 +111,7 @@ function parseSitemapXml(xml: string): SitemapEntry[] {
     const sitemapLocRe = /<sitemap\b[^>]*>[\s\S]*?<loc\b[^>]*>([\s\S]*?)<\/loc>[\s\S]*?<\/sitemap>/gi
     let sitemapMatch
     while ((sitemapMatch = sitemapLocRe.exec(xml)) !== null) {
-      entries.push({ loc: sitemapMatch[1].trim() })
+      entries.push({ loc: decodeXmlEntities(sitemapMatch[1].trim()) })
     }
   }
 
diff --git a/test/sitemap.test.ts b/test/sitemap.test.ts
index 0a92a91..129607e 100644
--- a/test/sitemap.test.ts
+++ b/test/sitemap.test.ts
@@ -45,6 +45,60 @@ test('parseSitemapXml handles sitemap index files', () => {
   expect(entries[1].loc).toBe('https://example.com/sitemap-pages.xml')
 })
 
+test('parseSitemapXml decodes XML-escaped ampersands in urlset <loc> query params (issue #50)', () => {
+  // sitemaps.org requires `&` in a URL to be written `&amp;`. Left undecoded, the
+  // fetcher requests the literal `...&amp;...` and the origin serves a different
+  // (usually empty) page. Pin that both `&amp;` in one <loc> decode to a bare `&`.
+  const xml = `<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url>
+    <loc>https://example.com/search?type=pages&amp;page=1&amp;lang=en</loc>
+  </url>
+</urlset>`
+
+  const entries = parseSitemapXml(xml)
+  expect(entries).toHaveLength(1)
+  expect(entries[0].loc).toBe('https://example.com/search?type=pages&page=1&lang=en')
+})
+
+test('parseSitemapXml decodes XML-escaped ampersands in sitemapindex child <loc> (issue #50)', () => {
+  // The BigCommerce repro: every child sitemap in the index carries `&amp;` query
+  // params; undecoded, each child fetch fails and the audit finds zero URLs
+  // ("No auditable URLs found in sitemap."). Pin decoding on the sitemapindex path.
+  const xml = `<?xml version="1.0" encoding="UTF-8"?>
+<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <sitemap>
+    <loc>https://example.com/xmlsitemap.php?type=pages&amp;page=1</loc>
+  </sitemap>
+  <sitemap>
+    <loc>https://example.com/xmlsitemap.php?type=products&amp;page=1</loc>
+  </sitemap>
+</sitemapindex>`
+
+  const entries = parseSitemapXml(xml)
+  expect(entries).toHaveLength(2)
+  expect(entries[0].loc).toBe('https://example.com/xmlsitemap.php?type=pages&page=1')
+  expect(entries[1].loc).toBe('https://example.com/xmlsitemap.php?type=products&page=1')
+})
+
+test('parseSitemapXml decodes numeric and named character references, with &amp; applied last', () => {
+  const xml = `<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url>
+    <loc>https://example.com/q?a=1&amp;b=2&#38;c=3&#x26;d=4</loc>
+  </url>
+  <url>
+    <loc>https://example.com/caf&#233;</loc>
+  </url>
+</urlset>`
+
+  const entries = parseSitemapXml(xml)
+  expect(entries).toHaveLength(2)
+  // &amp;, &#38; (decimal) and &#x26; (hex) must each become `&`; &#233; must become `é`.
+  expect(entries[0].loc).toBe('https://example.com/q?a=1&b=2&c=3&d=4')
+  expect(entries[1].loc).toBe('https://example.com/café')
+})
+
 test('shouldSkipUrl filters non-HTML URLs', () => {
   expect(shouldSkipUrl('https://example.com/doc.pdf')).toBe(true)
   expect(shouldSkipUrl('https://example.com/image.png')).toBe(true)