From e00ae5c749155ee3001bd4629a12282011a0fdfb Mon Sep 17 00:00:00 2001 From: Will Reynolds Date: Wed, 13 May 2026 14:35:49 -0500 Subject: [PATCH] Fix unreachable!() panic when DOCTYPE appears between text runs in element content MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The deserializer's `drain_text` merges consecutive `Text`, `CData`, and `GeneralRef` payload events into a single `DeEvent::Text` so that `read_text` and friends see at most one text run per element. DOCTYPE events were not part of that merge: `drain_text`'s break condition (`current_event_is_last_text`) treats DocType as a non-text event and exits the loop, and the outer `next()` resumes by capturing the DocType and continuing — emitting a *second* `DeEvent::Text` for the trailing text run. For input like `xz` (and shapes derived from it), `read_text` then matched on a second `DeEvent::Text` and tripped `unreachable!()` with the comment 'Cannot be two consequent Text events, they would be merged into one'. The merge invariant was the right idea — the implementation just missed the DocType case. Fix: treat DocType as transparent during the text drain. It still goes through `entity_resolver.capture()` (so DTD entities defined inside the document body remain available for later `&entity;` resolution), but the surrounding text runs are merged into one `DeEvent::Text` and the `unreachable!()` is no longer reachable from valid or malformed real-world input. Discovered via libFuzzer running against a SAML deserializer harness on the consuming application — the input that surfaced it is a 100-byte sequence containing nested DOCTYPE declarations inside what the parser treats as element content. Adds four regression tests in a new `doctype_in_element_text` module in `tests/serde-de.rs`: a minimal repro (`xz`), a multi-DOCTYPE shape mirroring the original fuzzer find, a leading- DOCTYPE variant, and a whitespace-around-DOCTYPE variant (added per review feedback) confirming that adjacent whitespace is preserved verbatim when the surrounding text runs are merged. Full `cargo test --all-features` stays green (1,712 passing). --- Changelog.md | 10 ++++++ src/de/mod.rs | 29 +++++++++++++++--- tests/serde-de.rs | 78 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+), 4 deletions(-) diff --git a/Changelog.md b/Changelog.md index f6c4f97c..d847176c 100644 --- a/Changelog.md +++ b/Changelog.md @@ -18,6 +18,16 @@ ### Bug Fixes +- Fix `unreachable!()` panic in the serde deserializer when a DOCTYPE + declaration appears between two text runs inside an element (e.g. + `xz`). The DOCTYPE used to break `drain_text`'s + consecutive-text merge, so two `DeEvent::Text` events reached + `read_text` and tripped its "Cannot be two consequent Text events" + invariant. DOCTYPE is now treated as transparent during text drain — + it still goes through the entity resolver, but the surrounding text + is merged into one run. Discovered via libFuzzer on a real-world + SAML deserializer harness. + ### Misc Changes diff --git a/src/de/mod.rs b/src/de/mod.rs index 7ff6f6fc..9aa005f4 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -2372,10 +2372,20 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> { /// Returns `true` when next event is not a text event in any form. #[inline(always)] const fn current_event_is_last_text(&self) -> bool { - // If next event is a text or CDATA, we should not trim trailing spaces + // If next event is a text-like event or a DocType (which is + // metadata and invisible to the data model), we should not + // trim trailing spaces — there is more content to drain, and + // any DocType between us and the next text run needs to be + // absorbed so `read_text` does not later see two consecutive + // `DeEvent::Text`. Without DocType here, an input like + // `xz` produces two text events for `x` + // and `z`, tripping `unreachable!()` in `read_text`. !matches!( self.lookahead, - Ok(PayloadEvent::Text(_)) | Ok(PayloadEvent::CData(_) | PayloadEvent::GeneralRef(_)) + Ok(PayloadEvent::Text(_) + | PayloadEvent::CData(_) + | PayloadEvent::GeneralRef(_) + | PayloadEvent::DocType(_)) ) } @@ -2383,6 +2393,10 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> { /// occurs. Content of all events would be appended to `result` and returned /// as [`DeEvent::Text`]. /// + /// DocType events that fall between text events are absorbed by the + /// entity resolver and do not break the run — see + /// [`Self::current_event_is_last_text`] for the rationale. + /// /// [`Text`]: PayloadEvent::Text /// [`CData`]: PayloadEvent::CData fn drain_text(&mut self, mut result: Cow<'i, str>) -> Result, DeError> { @@ -2399,9 +2413,16 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> { .to_mut() .push_str(&e.xml_content(self.reader.xml_version())?), PayloadEvent::GeneralRef(e) => self.resolve_reference(result.to_mut(), e)?, + PayloadEvent::DocType(e) => { + self.entity_resolver + .capture(e) + .map_err(|err| DeError::Custom(format!("cannot parse DTD: {}", err)))?; + } - // SAFETY: current_event_is_last_text checks that event is Text, CData or GeneralRef - _ => unreachable!("Only `Text`, `CData` or `GeneralRef` events can come here"), + // SAFETY: current_event_is_last_text checks that event is Text, CData, GeneralRef, or DocType + _ => unreachable!( + "Only `Text`, `CData`, `GeneralRef` or `DocType` events can come here" + ), } } Ok(DeEvent::Text(Text::new(result))) diff --git a/tests/serde-de.rs b/tests/serde-de.rs index 7f1850a3..7e0071ee 100644 --- a/tests/serde-de.rs +++ b/tests/serde-de.rs @@ -1896,6 +1896,84 @@ mod resolve { } } +/// A DOCTYPE declaration inside an element's text content used to split +/// the surrounding character data into two consecutive `DeEvent::Text` +/// events, tripping an `unreachable!()` in `read_text` ("Cannot be two +/// consequent Text events"). The reader emits Text/DocType/Text for an +/// input like `xz`, and `drain_text` did not drain +/// across DocType events, so the two text runs reached the +/// deserializer un-merged. Discovered via libFuzzer on a real-world +/// SAML response harness. +/// +/// The expected behavior is: DOCTYPE inside an element is captured by +/// the entity resolver (it has the same role as a prolog-level DOCTYPE +/// for entity definitions) and the surrounding text is treated as one +/// continuous run. +mod doctype_in_element_text { + use super::*; + use pretty_assertions::assert_eq; + + #[derive(Debug, Deserialize, PartialEq)] + struct Wrapper { + #[serde(rename = "$text")] + text: String, + } + + /// Minimal regression repro: a single DOCTYPE event between two + /// adjacent text runs inside an element should not panic. + #[test] + fn single_doctype_between_text() { + let xml = r#"xz"#; + let got: Wrapper = from_str(xml).unwrap(); + assert_eq!(got, Wrapper { text: "xz".into() }); + } + + /// Multiple DOCTYPE events stacked between text runs (matching the + /// shape of the fuzzer-discovered input that originally surfaced + /// this panic on real SAML traffic). + #[test] + fn multiple_doctypes_between_text() { + let xml = r#"xw"#; + let got: Wrapper = from_str(xml).unwrap(); + assert_eq!(got, Wrapper { text: "xw".into() }); + } + + /// DOCTYPE followed by text only (no leading text). Should produce + /// the trailing text without panic. + #[test] + fn leading_doctype_then_text() { + let xml = r#"x"#; + let got: Wrapper = from_str(xml).unwrap(); + assert_eq!(got, Wrapper { text: "x".into() }); + } + + /// Whitespace adjacent to the DOCTYPE on both sides should be + /// preserved verbatim when the two text runs are merged — the + /// DOCTYPE is treated as transparent, not as a delimiter. + #[test] + fn whitespace_around_doctype() { + let xml = "x \tz"; + let got: Wrapper = from_str(xml).unwrap(); + assert_eq!( + got, + Wrapper { + text: "x \tz".into() + } + ); + + // Whitespace at the element boundaries (before the leading + // text and after the trailing text) is also preserved. + let xml = " xz "; + let got: Wrapper = from_str(xml).unwrap(); + assert_eq!( + got, + Wrapper { + text: " xz ".into() + } + ); + } +} + /// Tests for https://github.com/tafia/quick-xml/pull/603. /// /// According to comments,