Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Changelog

## 4.0.1 (2026-06-17)

### Fixed
- **Sitemap `<loc>` URLs are now XML-entity-decoded (issue #50).** Per the [sitemaps.org spec](https://www.sitemaps.org/protocol.html#escaping), a `&` inside a URL must be written `&amp;`, so any spec-compliant `<loc>` with a multi-param query string (`?type=pages&amp;page=1`) arrives entity-escaped. `parseSitemapXml` previously passed the literal `...&amp;...` to the fetcher, which the origin treats as a different (usually empty) request. On a **sitemap index** — where every child `<loc>` carries query params (BigCommerce, many paginated CMS sitemaps) — every child fetch failed and the audit aborted with `BAD_INPUT: No auditable URLs found in sitemap.`; on a flat `<urlset>` the affected pages were silently dropped. Both the `urlset` and `sitemapindex` branches now decode the five predefined XML entities plus decimal/hex numeric character references (with `&amp;` resolved last). No API or scoring change.

## 4.0.0 (2026-06-09)

### Breaking
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@ainyc/aeo-audit",
"version": "4.0.0",
"version": "4.0.1",
"description": "The most comprehensive open-source Answer Engine Optimization (AEO) audit tool. Scores websites across 16 ranking factors that determine AI citation.",
"type": "module",
"main": "./dist/index.js",
Expand Down
32 changes: 30 additions & 2 deletions src/sitemap.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,34 @@ interface SitemapEntry {
priority?: number
}

/**
 * Replacement text for the five entities predefined by XML 1.0 (section 4.6).
 */
const XML_PREDEFINED_ENTITIES: Readonly<Record<string, string>> = {
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
  amp: '&',
}

/**
 * Decode the five predefined XML entities plus decimal/hex numeric character
 * references in a `<loc>` value.
 *
 * Per the sitemaps.org spec (#escaping), a `&` inside a URL MUST be written
 * `&amp;`, so a spec-compliant `<loc>` with a multi-param query string
 * (`?type=pages&amp;page=1`) arrives entity-escaped. Without decoding, the
 * fetcher requests the literal `...&amp;...`, which the origin treats as a
 * different request (issue #50).
 *
 * Decoding happens in a SINGLE left-to-right pass, so every reference is
 * resolved exactly once and the replacement text is never re-scanned:
 * `&amp;lt;` yields the literal `&lt;` (not `<`), and `&#38;amp;` yields the
 * literal `&amp;` (not `&`). Chaining one `.replace()` per entity cannot give
 * that guarantee, because a later pass sees the output of an earlier one.
 *
 * Unknown named entities (e.g. `&nbsp;`) and numeric references that do not
 * denote a legal XML character are left untouched rather than throwing, so a
 * malformed sitemap never aborts the whole audit.
 *
 * @param value - Raw text content of a `<loc>` element.
 * @returns The value with every recognised reference replaced by its character.
 */
function decodeXmlEntities(value: string): string {
  return value.replace(
    /&(?:#(\d+)|#x([0-9a-fA-F]+)|(lt|gt|quot|apos|amp));/g,
    (match: string, dec?: string, hex?: string, named?: string): string => {
      if (dec !== undefined) return codePointToChar(Number(dec), match)
      if (hex !== undefined) return codePointToChar(parseInt(hex, 16), match)
      return named === undefined ? match : (XML_PREDEFINED_ENTITIES[named] ?? match)
    },
  )
}

/**
 * Convert a numeric character reference to its character.
 *
 * Only code points allowed by the XML 1.0 `Char` production are converted:
 * TAB, LF, CR, U+0020–U+D7FF, U+E000–U+FFFD and U+10000–U+10FFFF. Anything
 * else (NUL, other C0 controls, surrogates, U+FFFE/U+FFFF, values above
 * U+10FFFF, non-integers such as `Infinity` from an overlong digit run) is
 * not a legal reference, so the original text is returned unchanged instead of
 * injecting a lone surrogate or control character into the URL.
 *
 * @param codePoint - Parsed numeric value of the reference.
 * @param original - The reference exactly as it appeared (e.g. `&#233;`).
 * @returns The decoded character, or `original` when the reference is illegal.
 */
function codePointToChar(codePoint: number, original: string): string {
  if (!Number.isInteger(codePoint)) return original

  const isXmlChar =
    codePoint === 0x9 ||
    codePoint === 0xa ||
    codePoint === 0xd ||
    (codePoint >= 0x20 && codePoint <= 0xd7ff) ||
    (codePoint >= 0xe000 && codePoint <= 0xfffd) ||
    (codePoint >= 0x10000 && codePoint <= 0x10ffff)

  return isXmlChar ? String.fromCodePoint(codePoint) : original
}

function parseSitemapXml(xml: string): SitemapEntry[] {
const entries: SitemapEntry[] = []

Expand All @@ -69,7 +97,7 @@ function parseSitemapXml(xml: string): SitemapEntry[] {
const locMatch = block.match(/<loc\b[^>]*>([\s\S]*?)<\/loc>/i)
if (!locMatch) continue

const loc = locMatch[1].trim()
const loc = decodeXmlEntities(locMatch[1].trim())
if (!loc) continue

const priorityMatch = block.match(/<priority\b[^>]*>([\s\S]*?)<\/priority>/i)
Expand All @@ -83,7 +111,7 @@ function parseSitemapXml(xml: string): SitemapEntry[] {
const sitemapLocRe = /<sitemap\b[^>]*>[\s\S]*?<loc\b[^>]*>([\s\S]*?)<\/loc>[\s\S]*?<\/sitemap>/gi
let sitemapMatch
while ((sitemapMatch = sitemapLocRe.exec(xml)) !== null) {
entries.push({ loc: sitemapMatch[1].trim() })
entries.push({ loc: decodeXmlEntities(sitemapMatch[1].trim()) })
}
}

Expand Down
54 changes: 54 additions & 0 deletions test/sitemap.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,60 @@ test('parseSitemapXml handles sitemap index files', () => {
expect(entries[1].loc).toBe('https://example.com/sitemap-pages.xml')
})

test('parseSitemapXml decodes XML-escaped ampersands in urlset <loc> query params (issue #50)', () => {
  // sitemaps.org requires a `&` inside a URL to be serialised as `&amp;`. If the
  // parser hands the escaped form to the fetcher, the origin sees a different
  // (usually empty) request and the page silently disappears from the audit.
  const expectedLoc = 'https://example.com/search?type=pages&page=1&lang=en'
  const sitemap = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://example.com/search?type=pages&amp;page=1&amp;lang=en</loc>
</url>
</urlset>`

  const parsed = parseSitemapXml(sitemap)

  expect(parsed).toHaveLength(1)
  expect(parsed[0].loc).toBe(expectedLoc)
})

test('parseSitemapXml decodes XML-escaped ampersands in sitemapindex child <loc> (issue #50)', () => {
  // BigCommerce-style repro: each child sitemap listed in the index has `&amp;`
  // in its query string. If the parser leaves it escaped, every child fetch
  // fails and the audit ends with "No auditable URLs found in sitemap.".
  const firstChild = 'https://example.com/xmlsitemap.php?type=pages&page=1'
  const secondChild = 'https://example.com/xmlsitemap.php?type=products&page=1'
  const sitemapIndex = `<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap>
<loc>https://example.com/xmlsitemap.php?type=pages&amp;page=1</loc>
</sitemap>
<sitemap>
<loc>https://example.com/xmlsitemap.php?type=products&amp;page=1</loc>
</sitemap>
</sitemapindex>`

  const parsed = parseSitemapXml(sitemapIndex)

  expect(parsed).toHaveLength(2)
  expect(parsed[0].loc).toBe(firstChild)
  expect(parsed[1].loc).toBe(secondChild)
})

test('parseSitemapXml decodes numeric and named character references, with &amp; applied last', () => {
  // First <loc>: the same ampersand written three ways: named (`&amp;`),
  // decimal (`&#38;`) and hexadecimal (`&#x26;`). All must become a literal `&`.
  const ampersandLoc = 'https://example.com/q?a=1&b=2&c=3&d=4'
  // Second <loc>: a non-ASCII character given as a decimal reference.
  const accentedLoc = 'https://example.com/café'
  const sitemap = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://example.com/q?a=1&amp;b=2&#38;c=3&#x26;d=4</loc>
</url>
<url>
<loc>https://example.com/caf&#233;</loc>
</url>
</urlset>`

  const parsed = parseSitemapXml(sitemap)

  expect(parsed).toHaveLength(2)
  expect(parsed[0].loc).toBe(ampersandLoc)
  expect(parsed[1].loc).toBe(accentedLoc)
})

test('shouldSkipUrl filters non-HTML URLs', () => {
expect(shouldSkipUrl('https://example.com/doc.pdf')).toBe(true)
expect(shouldSkipUrl('https://example.com/image.png')).toBe(true)
Expand Down
Loading