diff --git a/Changelog.md b/Changelog.md
index f6c4f97c..d847176c 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -18,6 +18,16 @@
### Bug Fixes
+- Fix `unreachable!()` panic in the serde deserializer when a DOCTYPE
+ declaration appears between two text runs inside an element (e.g.
+ `xz`). The DOCTYPE used to break `drain_text`'s
+ consecutive-text merge, so two `DeEvent::Text` events reached
+ `read_text` and tripped its "Cannot be two consequent Text events"
+ invariant. DOCTYPE is now treated as transparent during text drain —
+ it still goes through the entity resolver, but the surrounding text
+ is merged into one run. Discovered via libFuzzer on a real-world
+ SAML deserializer harness.
+
### Misc Changes
diff --git a/src/de/mod.rs b/src/de/mod.rs
index 7ff6f6fc..9aa005f4 100644
--- a/src/de/mod.rs
+++ b/src/de/mod.rs
@@ -2372,10 +2372,20 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
/// Returns `true` when next event is not a text event in any form.
#[inline(always)]
const fn current_event_is_last_text(&self) -> bool {
- // If next event is a text or CDATA, we should not trim trailing spaces
+ // If next event is a text-like event or a DocType (which is
+ // metadata and invisible to the data model), we should not
+ // trim trailing spaces — there is more content to drain, and
+ // any DocType between us and the next text run needs to be
+ // absorbed so `read_text` does not later see two consecutive
+ // `DeEvent::Text`. Without DocType here, an input like
+ // `xz` produces two text events for `x`
+ // and `z`, tripping `unreachable!()` in `read_text`.
!matches!(
self.lookahead,
- Ok(PayloadEvent::Text(_)) | Ok(PayloadEvent::CData(_) | PayloadEvent::GeneralRef(_))
+ Ok(PayloadEvent::Text(_)
+ | PayloadEvent::CData(_)
+ | PayloadEvent::GeneralRef(_)
+ | PayloadEvent::DocType(_))
)
}
@@ -2383,6 +2393,10 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
/// occurs. Content of all events would be appended to `result` and returned
/// as [`DeEvent::Text`].
///
+ /// DocType events that fall between text events are absorbed by the
+ /// entity resolver and do not break the run — see
+ /// [`Self::current_event_is_last_text`] for the rationale.
+ ///
/// [`Text`]: PayloadEvent::Text
/// [`CData`]: PayloadEvent::CData
fn drain_text(&mut self, mut result: Cow<'i, str>) -> Result, DeError> {
@@ -2399,9 +2413,16 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
.to_mut()
.push_str(&e.xml_content(self.reader.xml_version())?),
PayloadEvent::GeneralRef(e) => self.resolve_reference(result.to_mut(), e)?,
+ PayloadEvent::DocType(e) => {
+ self.entity_resolver
+ .capture(e)
+ .map_err(|err| DeError::Custom(format!("cannot parse DTD: {}", err)))?;
+ }
- // SAFETY: current_event_is_last_text checks that event is Text, CData or GeneralRef
- _ => unreachable!("Only `Text`, `CData` or `GeneralRef` events can come here"),
+ // SAFETY: current_event_is_last_text checks that event is Text, CData, GeneralRef, or DocType
+ _ => unreachable!(
+ "Only `Text`, `CData`, `GeneralRef` or `DocType` events can come here"
+ ),
}
}
Ok(DeEvent::Text(Text::new(result)))
diff --git a/tests/serde-de.rs b/tests/serde-de.rs
index 7f1850a3..7e0071ee 100644
--- a/tests/serde-de.rs
+++ b/tests/serde-de.rs
@@ -1896,6 +1896,84 @@ mod resolve {
}
}
+/// A DOCTYPE declaration inside an element's text content used to split
+/// the surrounding character data into two consecutive `DeEvent::Text`
+/// events, tripping an `unreachable!()` in `read_text` ("Cannot be two
+/// consequent Text events"). The reader emits Text/DocType/Text for an
+/// input like `xz`, and `drain_text` did not drain
+/// across DocType events, so the two text runs reached the
+/// deserializer un-merged. Discovered via libFuzzer on a real-world
+/// SAML response harness.
+///
+/// The expected behavior is: DOCTYPE inside an element is captured by
+/// the entity resolver (it has the same role as a prolog-level DOCTYPE
+/// for entity definitions) and the surrounding text is treated as one
+/// continuous run.
+mod doctype_in_element_text {
+ use super::*;
+ use pretty_assertions::assert_eq;
+
+ #[derive(Debug, Deserialize, PartialEq)]
+ struct Wrapper {
+ #[serde(rename = "$text")]
+ text: String,
+ }
+
+ /// Minimal regression repro: a single DOCTYPE event between two
+ /// adjacent text runs inside an element should not panic.
+ #[test]
+ fn single_doctype_between_text() {
+ let xml = r#"xz"#;
+ let got: Wrapper = from_str(xml).unwrap();
+ assert_eq!(got, Wrapper { text: "xz".into() });
+ }
+
+ /// Multiple DOCTYPE events stacked between text runs (matching the
+ /// shape of the fuzzer-discovered input that originally surfaced
+ /// this panic on real SAML traffic).
+ #[test]
+ fn multiple_doctypes_between_text() {
+ let xml = r#"xw"#;
+ let got: Wrapper = from_str(xml).unwrap();
+ assert_eq!(got, Wrapper { text: "xw".into() });
+ }
+
+ /// DOCTYPE followed by text only (no leading text). Should produce
+ /// the trailing text without panic.
+ #[test]
+ fn leading_doctype_then_text() {
+ let xml = r#"x"#;
+ let got: Wrapper = from_str(xml).unwrap();
+ assert_eq!(got, Wrapper { text: "x".into() });
+ }
+
+ /// Whitespace adjacent to the DOCTYPE on both sides should be
+ /// preserved verbatim when the two text runs are merged — the
+ /// DOCTYPE is treated as transparent, not as a delimiter.
+ #[test]
+ fn whitespace_around_doctype() {
+ let xml = "x \tz";
+ let got: Wrapper = from_str(xml).unwrap();
+ assert_eq!(
+ got,
+ Wrapper {
+ text: "x \tz".into()
+ }
+ );
+
+ // Whitespace at the element boundaries (before the leading
+ // text and after the trailing text) is also preserved.
+ let xml = " xz ";
+ let got: Wrapper = from_str(xml).unwrap();
+ assert_eq!(
+ got,
+ Wrapper {
+ text: " xz ".into()
+ }
+ );
+ }
+}
+
/// Tests for https://github.com/tafia/quick-xml/pull/603.
///
/// According to comments,