From 4198d0ce008039b23c1478a8862c0d2b3679f67e Mon Sep 17 00:00:00 2001 From: Arber Xhindoli <14798762+arberx@users.noreply.github.com> Date: Wed, 17 Jun 2026 10:17:46 -0400 Subject: [PATCH] fix: XML-entity-decode sitemap URLs (#50) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Spec-compliant sitemaps escape `&` in `<loc>` URLs as `&amp;` per sitemaps.org. parseSitemapXml passed the literal `...&amp;...` to the fetcher, which the origin treats as a different (usually empty) request. On a sitemap index — where every child carries query params (BigCommerce, paginated CMS sitemaps) — every child fetch failed and the audit aborted with "No auditable URLs found in sitemap."; flat pages were silently dropped. Both the urlset and sitemapindex branches now decode the five predefined XML entities plus decimal/hex numeric character references (&amp; last, out-of-range refs left untouched so a malformed sitemap can't throw). Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 5 ++++ package.json | 2 +- src/sitemap.ts | 32 ++++++++++++++++++++++++-- test/sitemap.test.ts | 54 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 90 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 62c0072..1c96238 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## 4.0.1 (2026-06-17) + +### Fixed +- **Sitemap `<loc>` URLs are now XML-entity-decoded (issue #50).** Per the [sitemaps.org spec](https://www.sitemaps.org/protocol.html#escaping), a `&` inside a URL must be written `&amp;`, so any spec-compliant `<loc>` with a multi-param query string (`?type=pages&page=1`) arrives entity-escaped. `parseSitemapXml` previously passed the literal `...&amp;...` to the fetcher, which the origin treats as a different (usually empty) request. 
On a **sitemap index** — where every child `<loc>` carries query params (BigCommerce, many paginated CMS sitemaps) — every child fetch failed and the audit aborted with `BAD_INPUT: No auditable URLs found in sitemap.`; on a flat `<urlset>` the affected pages were silently dropped. Both the `urlset` and `sitemapindex` branches now decode the five predefined XML entities plus decimal/hex numeric character references (with `&amp;` resolved last). No API or scoring change. + ## 4.0.0 (2026-06-09) ### Breaking diff --git a/package.json b/package.json index c39b67a..1fb3508 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@ainyc/aeo-audit", - "version": "4.0.0", + "version": "4.0.1", "description": "The most comprehensive open-source Answer Engine Optimization (AEO) audit tool. Scores websites across 16 ranking factors that determine AI citation.", "type": "module", "main": "./dist/index.js", diff --git a/src/sitemap.ts b/src/sitemap.ts index df236b7..600cdaf 100644 --- a/src/sitemap.ts +++ b/src/sitemap.ts @@ -58,6 +58,34 @@ interface SitemapEntry { priority?: number } +/** + * Decode the five predefined XML entities plus numeric character references in a + * `<loc>` value. Per the sitemaps.org spec (#escaping), a `&` inside a URL MUST be + * written `&amp;`, so a spec-compliant `<loc>` with a multi-param query string + * (`?type=pages&page=1`) arrives entity-escaped. Without decoding, the fetcher + * requests the literal `...&amp;...`, which the origin treats as a different + * request — on a sitemap index every child fetch then fails and the audit returns + * zero URLs (issue #50). `&amp;` is replaced LAST so `&amp;lt;` decodes to the + * literal `&lt;`, not `<`. Out-of-range numeric refs are left untouched rather than + * throwing, so a malformed sitemap never aborts the whole audit. 
+ */ +function decodeXmlEntities(value: string): string { + return value + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/&#(\d+);/g, (match, dec) => codePointToChar(Number(dec), match)) + .replace(/&#x([0-9a-fA-F]+);/g, (match, hex) => codePointToChar(parseInt(hex, 16), match)) + .replace(/&/g, '&') +} + +function codePointToChar(codePoint: number, original: string): string { + return Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff + ? String.fromCodePoint(codePoint) + : original +} + function parseSitemapXml(xml: string): SitemapEntry[] { const entries: SitemapEntry[] = [] @@ -69,7 +97,7 @@ function parseSitemapXml(xml: string): SitemapEntry[] { const locMatch = block.match(/]*>([\s\S]*?)<\/loc>/i) if (!locMatch) continue - const loc = locMatch[1].trim() + const loc = decodeXmlEntities(locMatch[1].trim()) if (!loc) continue const priorityMatch = block.match(/]*>([\s\S]*?)<\/priority>/i) @@ -83,7 +111,7 @@ function parseSitemapXml(xml: string): SitemapEntry[] { const sitemapLocRe = /]*>[\s\S]*?]*>([\s\S]*?)<\/loc>[\s\S]*?<\/sitemap>/gi let sitemapMatch while ((sitemapMatch = sitemapLocRe.exec(xml)) !== null) { - entries.push({ loc: sitemapMatch[1].trim() }) + entries.push({ loc: decodeXmlEntities(sitemapMatch[1].trim()) }) } } diff --git a/test/sitemap.test.ts b/test/sitemap.test.ts index 0a92a91..129607e 100644 --- a/test/sitemap.test.ts +++ b/test/sitemap.test.ts @@ -45,6 +45,60 @@ test('parseSitemapXml handles sitemap index files', () => { expect(entries[1].loc).toBe('https://example.com/sitemap-pages.xml') }) +test('parseSitemapXml decodes XML-escaped ampersands in urlset query params (issue #50)', () => { + // Per sitemaps.org, `&` in a URL must be written `&`. Without decoding, the + // fetcher requests the literal `...&...`, which the origin treats as a + // different (usually empty) request, silently dropping the page. 
+ const xml = ` + + + https://example.com/search?type=pages&page=1&lang=en + +` + + const entries = parseSitemapXml(xml) + expect(entries).toHaveLength(1) + expect(entries[0].loc).toBe('https://example.com/search?type=pages&page=1&lang=en') +}) + +test('parseSitemapXml decodes XML-escaped ampersands in sitemapindex child (issue #50)', () => { + // The BigCommerce repro: every child sitemap of the index carries `&` query + // params, so without decoding all child fetches fail and the audit returns zero + // URLs ("No auditable URLs found in sitemap."). + const xml = ` + + + https://example.com/xmlsitemap.php?type=pages&page=1 + + + https://example.com/xmlsitemap.php?type=products&page=1 + +` + + const entries = parseSitemapXml(xml) + expect(entries).toHaveLength(2) + expect(entries[0].loc).toBe('https://example.com/xmlsitemap.php?type=pages&page=1') + expect(entries[1].loc).toBe('https://example.com/xmlsitemap.php?type=products&page=1') +}) + +test('parseSitemapXml decodes numeric and named character references, with & applied last', () => { + const xml = ` + + + https://example.com/q?a=1&b=2&c=3&d=4 + + + https://example.com/café + +` + + const entries = parseSitemapXml(xml) + expect(entries).toHaveLength(2) + // &, & (decimal) and & (hex) all resolve to a literal ampersand. + expect(entries[0].loc).toBe('https://example.com/q?a=1&b=2&c=3&d=4') + expect(entries[1].loc).toBe('https://example.com/café') +}) + test('shouldSkipUrl filters non-HTML URLs', () => { expect(shouldSkipUrl('https://example.com/doc.pdf')).toBe(true) expect(shouldSkipUrl('https://example.com/image.png')).toBe(true)