diff --git a/Changelog.md b/Changelog.md index f6c4f97c..d847176c 100644 --- a/Changelog.md +++ b/Changelog.md @@ -18,6 +18,16 @@ ### Bug Fixes +- Fix `unreachable!()` panic in the serde deserializer when a DOCTYPE + declaration appears between two text runs inside an element (e.g. + `xz`). The DOCTYPE used to break `drain_text`'s + consecutive-text merge, so two `DeEvent::Text` events reached + `read_text` and tripped its "Cannot be two consequent Text events" + invariant. DOCTYPE is now treated as transparent during text drain — + it still goes through the entity resolver, but the surrounding text + is merged into one run. Discovered via libFuzzer on a real-world + SAML deserializer harness. + ### Misc Changes diff --git a/src/de/mod.rs b/src/de/mod.rs index 7ff6f6fc..9aa005f4 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -2372,10 +2372,20 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> { /// Returns `true` when next event is not a text event in any form. #[inline(always)] const fn current_event_is_last_text(&self) -> bool { - // If next event is a text or CDATA, we should not trim trailing spaces + // If next event is a text-like event or a DocType (which is + // metadata and invisible to the data model), we should not + // trim trailing spaces — there is more content to drain, and + // any DocType between us and the next text run needs to be + // absorbed so `read_text` does not later see two consecutive + // `DeEvent::Text`. Without DocType here, an input like + // `xz` produces two text events for `x` + // and `z`, tripping `unreachable!()` in `read_text`. !matches!( self.lookahead, - Ok(PayloadEvent::Text(_)) | Ok(PayloadEvent::CData(_) | PayloadEvent::GeneralRef(_)) + Ok(PayloadEvent::Text(_) + | PayloadEvent::CData(_) + | PayloadEvent::GeneralRef(_) + | PayloadEvent::DocType(_)) ) } @@ -2383,6 +2393,10 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> { /// occurs. Content of all events would be appended to `result` and returned /// as [`DeEvent::Text`]. /// + /// DocType events that fall between text events are absorbed by the + /// entity resolver and do not break the run — see + /// [`Self::current_event_is_last_text`] for the rationale. + /// /// [`Text`]: PayloadEvent::Text /// [`CData`]: PayloadEvent::CData fn drain_text(&mut self, mut result: Cow<'i, str>) -> Result, DeError> { @@ -2399,9 +2413,16 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> { .to_mut() .push_str(&e.xml_content(self.reader.xml_version())?), PayloadEvent::GeneralRef(e) => self.resolve_reference(result.to_mut(), e)?, + PayloadEvent::DocType(e) => { + self.entity_resolver + .capture(e) + .map_err(|err| DeError::Custom(format!("cannot parse DTD: {}", err)))?; + } - // SAFETY: current_event_is_last_text checks that event is Text, CData or GeneralRef - _ => unreachable!("Only `Text`, `CData` or `GeneralRef` events can come here"), + // SAFETY: current_event_is_last_text checks that event is Text, CData, GeneralRef, or DocType + _ => unreachable!( + "Only `Text`, `CData`, `GeneralRef` or `DocType` events can come here" + ), } } Ok(DeEvent::Text(Text::new(result))) diff --git a/tests/serde-de.rs b/tests/serde-de.rs index 7f1850a3..7e0071ee 100644 --- a/tests/serde-de.rs +++ b/tests/serde-de.rs @@ -1896,6 +1896,84 @@ mod resolve { } } +/// A DOCTYPE declaration inside an element's text content used to split +/// the surrounding character data into two consecutive `DeEvent::Text` +/// events, tripping an `unreachable!()` in `read_text` ("Cannot be two +/// consequent Text events"). The reader emits Text/DocType/Text for an +/// input like `xz`, and `drain_text` did not drain +/// across DocType events, so the two text runs reached the +/// deserializer un-merged. Discovered via libFuzzer on a real-world +/// SAML response harness. +/// +/// The expected behavior is: DOCTYPE inside an element is captured by +/// the entity resolver (it has the same role as a prolog-level DOCTYPE +/// for entity definitions) and the surrounding text is treated as one +/// continuous run. +mod doctype_in_element_text { + use super::*; + use pretty_assertions::assert_eq; + + #[derive(Debug, Deserialize, PartialEq)] + struct Wrapper { + #[serde(rename = "$text")] + text: String, + } + + /// Minimal regression repro: a single DOCTYPE event between two + /// adjacent text runs inside an element should not panic. + #[test] + fn single_doctype_between_text() { + let xml = r#"xz"#; + let got: Wrapper = from_str(xml).unwrap(); + assert_eq!(got, Wrapper { text: "xz".into() }); + } + + /// Multiple DOCTYPE events stacked between text runs (matching the + /// shape of the fuzzer-discovered input that originally surfaced + /// this panic on real SAML traffic). + #[test] + fn multiple_doctypes_between_text() { + let xml = r#"xw"#; + let got: Wrapper = from_str(xml).unwrap(); + assert_eq!(got, Wrapper { text: "xw".into() }); + } + + /// DOCTYPE followed by text only (no leading text). Should produce + /// the trailing text without panic. + #[test] + fn leading_doctype_then_text() { + let xml = r#"x"#; + let got: Wrapper = from_str(xml).unwrap(); + assert_eq!(got, Wrapper { text: "x".into() }); + } + + /// Whitespace adjacent to the DOCTYPE on both sides should be + /// preserved verbatim when the two text runs are merged — the + /// DOCTYPE is treated as transparent, not as a delimiter. + #[test] + fn whitespace_around_doctype() { + let xml = "x \tz"; + let got: Wrapper = from_str(xml).unwrap(); + assert_eq!( + got, + Wrapper { + text: "x \tz".into() + } + ); + + // Whitespace at the element boundaries (before the leading + // text and after the trailing text) is also preserved. + let xml = " xz "; + let got: Wrapper = from_str(xml).unwrap(); + assert_eq!( + got, + Wrapper { + text: " xz ".into() + } + ); + } +} + /// Tests for https://github.com/tafia/quick-xml/pull/603. /// /// According to comments,