From 6dacef34184dd36a753bc41da20ce12af5758484 Mon Sep 17 00:00:00 2001
From: Josh Day <emailjoshday@gmail.com>
Date: Thu, 5 Mar 2026 09:34:55 -0500
Subject: [PATCH 01/18] Rewrite XML parser with tokenizer and XPath

---
 .gitignore                     |    1 +
 CHANGELOG.md                   |  157 ++
 Project.toml                   |    7 +-
 README.md                      |  270 ++-
 benchmarks/Project.toml        |    2 +-
 benchmarks/benchmarks.jl       |   99 +
 benchmarks/suite.jl            |   74 -
 src/XML.jl                     | 1559 +++++++++++----
 src/tokenizer.jl               |  480 +++++
 src/xpath.jl                   |  300 +++
 test/runtests.jl               | 3296 ++++++++++++++++++++++++++------
 test/test_libxml2_testcases.jl | 1578 +++++++++++++++
 test/test_remote_files.jl      |   77 +
 test/test_tokenizer.jl         |  425 ++++
 14 files changed, 7135 insertions(+), 1190 deletions(-)
 create mode 100644 CHANGELOG.md
 create mode 100644 benchmarks/benchmarks.jl
 delete mode 100644 benchmarks/suite.jl
 create mode 100644 src/tokenizer.jl
 create mode 100644 src/xpath.jl
 create mode 100644 test/test_libxml2_testcases.jl
 create mode 100644 test/test_remote_files.jl
 create mode 100644 test/test_tokenizer.jl
diff --git a/.gitignore b/.gitignore
index b000475..432a8b1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@
 *.xml
 *.gz
 *.DS_Store
+*.claude
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..3f00ed1
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,157 @@
+# Changelog
+
+All notable changes to XML.jl will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+
+### Added
+- New streaming tokenizer (`XMLTokenizer` module) for fine-grained XML token iteration.
+- XPath support via `xpath(node, path)`.
+- `test/test_libxml2_testcases.jl`: 243 test cases borrowed from the [libxml2](https://github.com/GNOME/libxml2) test suite covering CDATA, comments, processing instructions, attributes, namespaces, DTD internal subsets, entity references, whitespace handling, Unicode, error cases, and real-world document patterns.
+
+### Fixed
+- **Tokenizer: multi-byte UTF-8 in attribute values** — Parsing attribute values containing multi-byte UTF-8 characters (e.g., `<doc city="東京"/>`) could produce a `StringIndexError` because `attr_value()` used byte arithmetic (`ncodeunits - 1`) instead of `prevind` to strip quotes. The same issue existed in `_read_attr_value!`.
+- **Tokenizer: quotes inside DTD comments** — A `"` or `'` character inside a `<!-- -->` comment within a DTD internal subset caused the tokenizer to misinterpret it as a quoted string delimiter, leading to an "Unterminated quoted string" error. The DOCTYPE body parser now correctly skips comment content.
+
+## [0.3.8]
+
+### Fixed
+- `XML.write` now respects `xml:space="preserve"` and suppresses indentation for elements with this attribute ([#49]).
+
+## [0.3.7]
+
+### Fixed
+- Resolved remaining issues from [#45] and fixed [#46] (whitespace preservation edge cases) ([#47]).
+
+## [0.3.6]
+
+### Added
+- `XML.write` respects `xml:space="preserve"` on elements, suppressing automatic indentation ([#45]).
+
+### Fixed
+- `String` type ambiguity on Julia nightly resolved ([#38]).
+
+## [0.3.5]
+
+### Fixed
+- `depth` and `parent` functions corrected to work properly with the DOM tree API ([#37]).
+- `escape` made idempotent — already-escaped sequences such as `&amp;` are left untouched ([#32], addressing [#31]). (The rewritten `escape` under [Unreleased] escapes every `&` again.)
+- `pushfirst!` support added for `Node` children ([#29]).
+
+## [0.3.4]
+
+### Fixed
+- Fixed [#26].
+- CI updated to use `julia-actions/cache@v4` and `lts` Julia version.
+
+## [0.3.3]
+
+### Added
+- `h` constructor for concise element creation (e.g., `h.div("hello"; class="main")`).
+
+### Fixed
+- Path definition error in README example ([#20]).
+
+## [0.3.2]
+
+### Fixed
+- Minor typos.
+
+## [0.3.1]
+
+### Added
+- Julia 1.6 compatibility ([#16]).
+
+### Changed
+- Smarter escaping logic.
+
+## [0.3.0]
+
+### Changed
+- Attribute internal representation changed from `Dict` to `OrderedDict` (later reverted to `Vector{Pair}`).
+
+## [0.2.3]
+
+### Fixed
+- Parse method fix.
+
+## [0.2.2]
+
+### Added
+- DTD parsing via `parse_dtd`.
+- `is_simple` and `simple_value` exports.
+- `setindex!` methods for modifying attributes.
+- `unescape` function.
+
+### Fixed
+- DOCTYPE parsing made case-insensitive.
+
+## [0.2.1]
+
+### Fixed
+- Write output fixes.
+
+## [0.2.0]
+
+### Changed
+- Major rewrite: introduced `NodeType` enum, `Node{S}` parametric struct, callable `NodeType` constructors, and `XML.write`.
+- Processing instruction support.
+- Benchmarks added.
+
+## [0.1.3]
+
+### Changed
+- Improved print output for `AbstractXMLNode`.
+
+## [0.1.2]
+
+### Added
+- AbstractTrees 0.4 compatibility ([#5]).
+
+## [0.1.1]
+
+### Added
+- `Node` implementation with `print_tree`.
+- Color output in REPL display.
+- Stopped stripping whitespace from text nodes.
+
+## [0.1.0]
+
+- Initial release.
+
+[Unreleased]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.8...HEAD
+[0.3.8]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.7...v0.3.8
+[0.3.7]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.6...v0.3.7
+[0.3.6]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.5...v0.3.6
+[0.3.5]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.4...v0.3.5
+[0.3.4]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.3...v0.3.4
+[0.3.3]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.2...v0.3.3
+[0.3.2]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.1...v0.3.2
+[0.3.1]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.0...v0.3.1
+[0.3.0]: https://github.com/JuliaComputing/XML.jl/compare/v0.2.3...v0.3.0
+[0.2.3]: https://github.com/JuliaComputing/XML.jl/compare/v0.2.2...v0.2.3
+[0.2.2]: https://github.com/JuliaComputing/XML.jl/compare/v0.2.1...v0.2.2
+[0.2.1]: https://github.com/JuliaComputing/XML.jl/compare/v0.2.0...v0.2.1
+[0.2.0]: https://github.com/JuliaComputing/XML.jl/compare/v0.1.3...v0.2.0
+[0.1.3]: https://github.com/JuliaComputing/XML.jl/compare/v0.1.2...v0.1.3
+[0.1.2]: https://github.com/JuliaComputing/XML.jl/compare/v0.1.1...v0.1.2
+[0.1.1]: https://github.com/JuliaComputing/XML.jl/compare/v0.1.0...v0.1.1
+[0.1.0]: https://github.com/JuliaComputing/XML.jl/releases/tag/v0.1.0
+
+[#5]: https://github.com/JuliaComputing/XML.jl/pull/5
+[#16]: https://github.com/JuliaComputing/XML.jl/pull/16
+[#20]: https://github.com/JuliaComputing/XML.jl/pull/20
+[#26]: https://github.com/JuliaComputing/XML.jl/issues/26
+[#29]: https://github.com/JuliaComputing/XML.jl/pull/29
+[#31]: https://github.com/JuliaComputing/XML.jl/issues/31
+[#32]: https://github.com/JuliaComputing/XML.jl/pull/32
+[#37]: https://github.com/JuliaComputing/XML.jl/pull/37
+[#38]: https://github.com/JuliaComputing/XML.jl/pull/38
+[#43]: https://github.com/JuliaComputing/XML.jl/issues/43
+[#45]: https://github.com/JuliaComputing/XML.jl/pull/45
+[#46]: https://github.com/JuliaComputing/XML.jl/issues/46
+[#47]: https://github.com/JuliaComputing/XML.jl/pull/47
+[#49]: https://github.com/JuliaComputing/XML.jl/pull/49
diff --git a/Project.toml b/Project.toml
index 49b96c0..3a894e8 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,12 +1,7 @@
 name = "XML"
 uuid = "72c71f33-b9b6-44de-8c94-c961784809e2"
-authors = ["Josh Day <emailjoshday@gmail.com> and contributors"]
 version = "0.3.8"
-
-[deps]
-Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
-OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
+authors = ["Josh Day <emailjoshday@gmail.com> and contributors"]
 
 [compat]
-OrderedCollections = "1.4, 1.5"
 julia = "1.6"
diff --git a/README.md b/README.md
index ddb1156..b446f61 100644
--- a/README.md
+++ b/README.md
@@ -4,39 +4,8 @@
 
 <p align="center">Read and write XML in pure Julia.</p>
 
-<br><br>
-
-# Introduction
-
-This package offers fast data structures for reading and writing XML files with a consistent interface:
-
 <br>
 
-### `Node`/`LazyNode` Interface:
-
-```
-nodetype(node)      →   XML.NodeType (an enum type)
-tag(node)           →   String or Nothing
-attributes(node)    →   OrderedDict{String, String} or Nothing
-value(node)         →   String or Nothing
-children(node)      →   Vector{typeof(node)}
-is_simple(node)     →   Bool (whether node is simple .e.g. <tag>item</tag>)
-simple_value(node)   →   e.g. "item" from <tag>item</tag>)
-```
-
-<br>
-
-### Extended Interface for `LazyNode`
-
-```
-depth(node)         →   Int
-next(node)          →   typeof(node)
-prev(node)          →   typeof(node)
-parent(node)        →   typeof(node)
-```
-
-<br><br>
-
 # Quickstart
 
 ```julia
@@ -58,79 +27,76 @@ doc[end][2]  # Second child of root
 # Node Element <book id="bk102"> (6 children)
 ```
 
-<br><br>
-
-# Data Structures that Represent XML Nodes
+<br>
 
-## Preliminary: `NodeType`
+# `Node` Interface
 
-- Each item in an XML DOM is classified by its `NodeType`.
-- Every `XML.jl` struct defines a `nodetype(x)` method that returns its `NodeType`.
+Every node in the XML DOM is represented by `Node`, a single type parametrized on its string storage.
 
-| NodeType | XML Representation | `Node` Constructor |
-|----------|--------------------|------------------|
-| `Document` | An entire document | `Document(children...)`
-| `DTD` | `<!DOCTYPE ...>` | `DTD(...) `
-| `Declaration` | `<?xml attributes... ?>` | `Declaration(; attrs...)`
-| `ProcessingInstruction` | `<?tag attributes... ?>` | `ProcessingInstruction(tag; attrs...)`
-| `Comment` | `<!-- text -->` | `Comment(text)`
-| `CData` | `<![CData[text]]>` | `CData(text)`
-| `Element` | `<tag attributes... > children... </NAME>` | `Element(tag, children...; attrs...)`
-| `Text` | the `text` part of `<tag>text</tag>` | `Text(text)`
+```
+nodetype(node)      -> XML.NodeType (an enum)
+tag(node)           -> String or Nothing
+attributes(node)    -> Dict{String, String} or Nothing
+value(node)         -> String or Nothing
+children(node)      -> Vector{Node}
+is_simple(node)     -> Bool (e.g. <tag>text</tag>)
+simple_value(node)  -> e.g. "text" from <tag>text</tag>
+```
 
 <br>
 
-## `Node`: Probably What You're Looking For
+## `NodeType`
 
-- `read`-ing a `Node` loads the entire XML DOM in memory.
-- See the table above for convenience constructors.
-- `Node`s have some additional methods that aid in construction/mutation:
+Each item in an XML DOM is classified by its `NodeType`:
 
-```julia
-# Add a child:
-push!(parent::Node, child::Node)
+| NodeType | XML Representation | Constructor |
+|----------|--------------------|-------------|
+| `Document` | An entire document | `Document(children...)` |
+| `DTD` | `<!DOCTYPE ...>` | `DTD(...)` |
+| `Declaration` | `<?xml attributes... ?>` | `Declaration(; attrs...)` |
+| `ProcessingInstruction` | `<?tag attributes... ?>` | `ProcessingInstruction(tag; attrs...)` |
+| `Comment` | `<!-- text -->` | `Comment(text)` |
+| `CData` | `<![CDATA[text]]>` | `CData(text)` |
+| `Element` | `<tag attrs...> children... </tag>` | `Element(tag, children...; attrs...)` |
+| `Text` | the `text` part of `<tag>text</tag>` | `Text(text)` |
 
-# Replace a child:
-parent[2] = child
-
-# Add/change an attribute:
-node["key"] = value
-
-node["key"]
-```
+<br>
 
-- `Node` is an immutable type.  However, you can easily create a copy with one or more field values changed by using the `Node(::Node, children...; attrs...)` constructor where `children` are appended to the source node's children and `attrs` are appended to the node's attributes.
+## Mutation
 
 ```julia
-node = XML.Element("tag", "child")
-# Node Element <tag> (1 child)
+push!(parent, child)   # Add a child
+parent[2] = child      # Replace a child
+node["key"] = "value"  # Add/change an attribute
+node["key"]            # Get an attribute
+```
 
-simple_value(node)
-# "child"
+<br>
 
-node2 = Node(node, "added"; id="my-id")
-# Node Element <tag id="my-id"> (2 children)
+## Tree Navigation
 
-node2.children
-# 2-element Vector{Node}:
-#  Node Text "child"
-#  Node Text "added"
+```julia
+depth(child, root)      # Depth of child relative to root
+parent(child, root)     # Parent of child within root's tree
+siblings(child, root)   # Siblings of child within root's tree
 ```
 
-### Writing `Element` `Node`s with `XML.h`
+<br>
+
+## Writing Elements with `XML.h`
 
 Similar to [Cobweb.jl](https://github.com/JuliaComputing/Cobweb.jl#-creating-nodes-with-cobwebh), `XML.h` enables you to write elements with a simpler syntax:
 
 ```julia
 using XML: h
 
-julia> node = h.parent(
-         h.child("first child content", id="id1"),
-         h.child("second child content", id="id2")
-       )
+node = h.parent(
+    h.child("first child content", id="id1"),
+    h.child("second child content", id="id2")
+)
 # Node Element <parent> (2 children)
 
-julia> print(XML.write(node))
+print(XML.write(node))
 # <parent>
 #   <child id="id1">first child content</child>
 #   <child id="id2">second child content</child>
@@ -139,111 +105,95 @@ julia> print(XML.write(node))
 
 <br>
 
-## `XML.LazyNode`: For Fast Iteration through an XML File
-
-A lazy data structure that just keeps track of the position in the raw data (`Vector{UInt8}`) to read from.
-
-- You can iterate over a `LazyNode` to "read" through an XML file:
-
-```julia
-doc = read(filename, LazyNode)
-
-foreach(println, doc)
-# LazyNode Declaration <?xml version="1.0"?>
-# LazyNode Element <catalog>
-# LazyNode Element <book id="bk101">
-# LazyNode Element <author>
-# LazyNode Text "Gambardella, Matthew"
-# LazyNode Element <title>
-# ⋮
-```
-
-<br><br>
-
 # Reading
 
 ```julia
-# Reading from file:
+# From a file:
 read(filename, Node)
-read(filename, LazyNode)
-
-# Parsing from string:
-parse(Node, str)
-parse(LazyNode, str)
 
+# From a string:
+parse(str, Node)
 ```
 
-<br><br>
+<br>
 
 # Writing
 
 ```julia
 XML.write(filename::String, node)  # write to file
-
-XML.write(io::IO, node)  # write to stream
-
-XML.write(node)  # String
+XML.write(io::IO, node)            # write to stream
+XML.write(node)                    # return String
 ```
 
+`XML.write` respects `xml:space="preserve"` on elements, suppressing automatic indentation.
 
-<br><br>
-
-# Performance
-
-- XML.jl performs comparatively to [EzXML.jl](https://github.com/JuliaIO/EzXML.jl), which wraps the C library [libxml2](https://gitlab.gnome.org/GNOME/libxml2/-/wikis/home).
-- See the `benchmarks/suite.jl` for the code to produce these results.
-- The following output was generated in a Julia session with the following `versioninfo`:
-
-```
-julia> versioninfo()
-Julia Version 1.9.4
-Commit 8e5136fa297 (2023-11-14 08:46 UTC)
-Build Info:
-  Official https://julialang.org/ release
-Platform Info:
-  OS: macOS (arm64-apple-darwin22.4.0)
-  CPU: 10 × Apple M1 Pro
-  WORD_SIZE: 64
-  LIBM: libopenlibm
-  LLVM: libLLVM-14.0.6 (ORCJIT, apple-m1)
-  Threads: 8 on 8 virtual cores
-```
+<br>
 
+# XPath
 
-### Reading an XML File
+Query nodes using a subset of XPath 1.0 via `xpath(node, path)`:
 
-```
-       XML.LazyNode   0.009583
-           XML.Node  ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 1071.32
-      EzXML.readxml  ■■■■■■■■■ 284.346
-   XMLDict.xml_dict  ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 1231.47
-```
+```julia
+doc = parse("""
+<root>
+  <a id="1"><b>hello</b></a>
+  <a id="2"><b>world</b></a>
+</root>
+""", Node)
+
+root = doc[end]
+
+xpath(root, "//b")           # All <b> descendants
+xpath(root, "a[@id='2']/b")  # <b> inside <a id="2">
+xpath(root, "a[1]")          # First <a> child
+xpath(root, "//b/text()")    # Text nodes inside all <b>s
+```
+
+### Supported syntax
+
+| Expression | Description |
+|------------|-------------|
+| `/` | Root / path separator |
+| `tag` | Child element by name |
+| `*` | Any child element |
+| `//` | Descendant-or-self (recursive) |
+| `.` | Current node |
+| `..` | Parent node |
+| `[n]` | Positional predicate (1-based) |
+| `[@attr]` | Has-attribute predicate |
+| `[@attr='v']` | Attribute-value predicate |
+| `text()` | Text node children |
+| `node()` | All node children |
+| `@attr` | Attribute value (returns strings) |
 
-### Writing an XML File
+<br>
 
-```
-         Write: XML  ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 289.638
-       Write: EzXML  ■■■■■■■■■■■■■ 93.4631
-```
+# Streaming Tokenizer
 
-### Lazily Iterating over Each Node
-```
-           LazyNode  ■■■■■■■■■ 51.752
- EzXML.StreamReader  ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 226.271
-```
+For large files or when you need fine-grained control, `XML.XMLTokenizer` provides a streaming tokenizer that yields tokens without building a DOM:
 
-### Collecting All Names/Tags in an XML File
-```
-       XML.LazyNode  ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 210.482
- EzXML.StreamReader  ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 276.238
-      EzXML.readxml  ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 263.269
+```julia
+using XML.XMLTokenizer
+
+for token in tokenize("<root><child attr=\"val\">text</child></root>")
+    println(token.kind, " => ", repr(token.raw))
+end
+# TOKEN_OPEN_TAG_START => "<root"
+# TOKEN_CLOSE_TAG => ">"
+# TOKEN_OPEN_TAG_START => "<child"
+# TOKEN_ATTR_NAME => "attr"
+# TOKEN_ATTR_VALUE => "\"val\""
+# TOKEN_CLOSE_TAG => ">"
+# TOKEN_TEXT => "text"
+# TOKEN_END_TAG => "</child>"
+# TOKEN_END_TAG => "</root>"
 ```
 
 <br>
-<br>
 
-# Possible Gotchas
+# Escaping
+
+XML.jl doesn't automatically escape special characters (`<`, `>`, `&`, `"`, `'`) for you. Use the provided utility functions:
 
-- XML.jl doesn't automatically escape special characters (`<`, `>`, `&`, `"`, and `'` ) for you.  However, we provide utility functions for doing the conversions back and forth:
-  - `XML.escape(::String)` and `XML.unescape(::String)`
-  - `XML.escape!(::Node)` and `XML.unescape!(::Node)`.
+- `XML.escape(::String)` / `XML.unescape(::String)` -- transform strings.
+- `XML.escape!(::Node)` / `XML.unescape!(::Node)` -- transform an entire node tree in-place.
diff --git a/benchmarks/Project.toml b/benchmarks/Project.toml
index ed90996..0598016 100644
--- a/benchmarks/Project.toml
+++ b/benchmarks/Project.toml
@@ -2,7 +2,7 @@
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
-OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
+LightXML = "9c8b4983-aa76-5018-a973-4c85ecc9e179"
 UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
 XML = "72c71f33-b9b6-44de-8c94-c961784809e2"
 XMLDict = "228000da-037f-5747-90a9-8195ccbf91a5"
diff --git a/benchmarks/benchmarks.jl b/benchmarks/benchmarks.jl
new file mode 100644
index 0000000..aa558b9
--- /dev/null
+++ b/benchmarks/benchmarks.jl
@@ -0,0 +1,99 @@
+using XML
+using XML: Element, nodetype, tag, children
+using EzXML: EzXML
+using XMLDict: XMLDict
+using LightXML: LightXML
+using BenchmarkTools
+using DataFrames
+using UnicodePlots
+
+BenchmarkTools.DEFAULT_PARAMETERS.seconds = 10
+BenchmarkTools.DEFAULT_PARAMETERS.samples = 20000
+
+#-----------------------------------------------------------------------------# Test data
+# Small file (~120 lines)
+small_file = joinpath(@__DIR__, "..", "test", "data", "books.xml")
+small_xml = read(small_file, String)
+
+df = DataFrame(kind=String[], name=String[], bench=BenchmarkTools.Trial[])
+
+macro add_benchmark(kind, name, expr...)  # Run `@benchmark expr...` and append a (kind, name, bench) row to `df`.
+    esc(:(let  # the whole expansion is escaped, so `df` and names used in `expr` resolve in the caller's scope
+        @info string($kind, " - ", $name)  # log which benchmark is about to run
+        bench = @benchmark $(expr...)  # `expr` is the benchmarked call plus any setup=/teardown= arguments
+        push!(df, (; kind=$kind, name=$name, bench))
+    end))
+end
+
+#-----------------------------------------------------------------------------# Parse (small)
+@add_benchmark "Parse (small)" "XML.jl" parse($small_xml, Node)
+@add_benchmark "Parse (small)" "EzXML" EzXML.parsexml($small_xml)
+@add_benchmark "Parse (small)" "LightXML" LightXML.parse_string($small_xml)
+@add_benchmark "Parse (small)" "XMLDict" XMLDict.xml_dict($small_xml)
+
+#-----------------------------------------------------------------------------# Write (small)
+@add_benchmark "Write (small)" "XML.jl" XML.write(o) setup=(o = parse(small_xml, Node))
+@add_benchmark "Write (small)" "EzXML" sprint(print, o) setup=(o = EzXML.parsexml(small_xml))
+@add_benchmark "Write (small)" "LightXML" LightXML.save_file(o, f) setup=(o = LightXML.parse_string(small_xml); f = tempname()) teardown=(LightXML.free(o); rm(f, force=true))
+
+#-----------------------------------------------------------------------------# Collect element tags
+function xml_collect_tags(node)  # Return a Vector{String} with the tag of every Element descendant of `node`.
+    out = String[]
+    _xml_collect_tags!(out, node)  # fills `out` in place
+    out
+end
+function _xml_collect_tags!(out, node)  # Recursive worker: append the tags of Elements under `node` to `out`.
+    for c in children(node)
+        if nodetype(c) === Element  # children of any other node type are ignored
+            push!(out, tag(c))  # record the element before descending, so parents precede their children
+            _xml_collect_tags!(out, c)
+        end
+    end
+end
+
+function ezxml_collect_tags(node::EzXML.Node)  # EzXML counterpart of `xml_collect_tags`: returns a Vector{String} of names.
+    out = String[]
+    _ezxml_collect_tags!(out, node)  # fills `out` in place
+    out
+end
+function _ezxml_collect_tags!(out, node::EzXML.Node)  # Recursive worker: append element names under `node` to `out`.
+    for child in EzXML.eachelement(node)  # presumably yields only element children -- confirm against EzXML docs
+        push!(out, child.name)  # record the element before descending, so parents precede their children
+        _ezxml_collect_tags!(out, child)
+    end
+end
+
+function lightxml_collect_tags(root::LightXML.XMLElement)  # LightXML counterpart of `xml_collect_tags`: returns a Vector{String}.
+    out = String[]
+    _lightxml_collect_tags!(out, root)  # fills `out` in place
+    out
+end
+function _lightxml_collect_tags!(out, el::LightXML.XMLElement)  # Recursive worker: append element names under `el` to `out`.
+    for child in LightXML.child_elements(el)  # presumably yields only element children -- confirm against LightXML docs
+        push!(out, LightXML.name(child))  # record the element before descending, so parents precede their children
+        _lightxml_collect_tags!(out, child)
+    end
+end
+
+@add_benchmark "Collect tags" "XML.jl" xml_collect_tags(o) setup=(o = parse(small_xml, Node))
+@add_benchmark "Collect tags" "EzXML" ezxml_collect_tags(o.root) setup=(o = EzXML.parsexml(small_xml))
+@add_benchmark "Collect tags" "LightXML" lightxml_collect_tags(LightXML.root(o)) setup=(o = LightXML.parse_string(small_xml)) teardown=(LightXML.free(o))
+
+#-----------------------------------------------------------------------------# Results
+function plot_group(df, kind)  # Display a bar plot of the median time of each benchmark in `df` recorded under `kind`.
+    g = groupby(df, :kind)
+    haskey(g, (;kind)) || return  # no rows for this kind: return without plotting
+    sub = g[(;kind)]
+    x = map(row -> "$(row.name)", eachrow(sub))  # bar labels: one per benchmark name
+    y = map(x -> median(x).time / 1e6, sub.bench)  # median of each trial; /1e6 presumably converts ns to the ms in the title
+    display(barplot(x, y, title = "$kind — median time (ms)", border=:none, width=50))
+    println()  # blank line after each plot
+end
+
+println("\n", "="^60)
+println("  BENCHMARK RESULTS")
+println("="^60, "\n")
+
+for kind in unique(df.kind)
+    plot_group(df, kind)
+end
diff --git a/benchmarks/suite.jl b/benchmarks/suite.jl
deleted file mode 100644
index e06dc61..0000000
--- a/benchmarks/suite.jl
+++ /dev/null
@@ -1,74 +0,0 @@
-using Pkg
-Pkg.activate(@__DIR__)
-
-using XML
-using EzXML: EzXML
-using XMLDict: XMLDict
-using BenchmarkTools
-using DataFrames
-using UnicodePlots
-using OrderedCollections: OrderedDict
-
-
-BenchmarkTools.DEFAULT_PARAMETERS.seconds = 10
-BenchmarkTools.DEFAULT_PARAMETERS.samples = 20000
-
-
-# nasa.xml was downloaded from:
-# http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/www/repository.html#nasa
-file = joinpath(@__DIR__, "nasa.xml")
-
-df = DataFrame(kind=String[], name=String[], bench=BenchmarkTools.Trial[])
-
-macro add_benchmark(kind, name, expr...)
-    esc(:(let
-        @info string($kind, " - ", $name)
-        bench = @benchmark $(expr...)
-        push!(df, (; kind=$kind, name=$name, bench))
-    end))
-end
-
-#-----------------------------------------------------------------------------# Write
-@add_benchmark "Write" "XML.write" XML.write($(tempname()), o) setup = (o = read(file, Node))
-@add_benchmark "Write" "EzXML.writexml" EzXML.write($(tempname()), o) setup = (o = EzXML.readxml(file))
-
-#-----------------------------------------------------------------------------# Read
-@add_benchmark "Read" "XML.LazyNode" read($file, LazyNode)
-@add_benchmark "Read" "XML.Node" read($file, Node)
-@add_benchmark "Read" "EzXML.readxml" EzXML.readxml($file)
-@add_benchmark "Read" "XMLDict.xml_dict" XMLDict.xml_dict(read($file, String))
-
-#-----------------------------------------------------------------------------# Lazy Iteration
-@add_benchmark "Lazy Iteration" "LazyNode" for x in read($file, LazyNode); end
-@add_benchmark "Lazy Iteration" "EzXML.StreamReader" (reader = open(EzXML.StreamReader, $file); for x in reader; end; close(reader))
-
-#-----------------------------------------------------------------------------# Lazy Iteration: Collect Tags
-@add_benchmark "Collect Tags" "LazyNode" [tag(x) for x in o] setup = (o = read(file, LazyNode))
-@add_benchmark "Collect Tags" "EzXML.StreamReader" [r.name for x in r if x == EzXML.READER_ELEMENT] setup=(r=open(EzXML.StreamReader, file)) teardown=(close(r))
-
-function get_tags(o::EzXML.Node)
-    out = String[]
-    for node in EzXML.eachelement(o)
-        push!(out, node.name)
-        for tag in get_tags(node)
-            push!(out, tag)
-        end
-    end
-    out
-end
-@add_benchmark "Collect Tags" "EzXML.readxml" get_tags(o.root) setup=(o = EzXML.readxml(file))
-
-
-#-----------------------------------------------------------------------------# Plots
-function plot(df, kind)
-    g = groupby(df, :kind)
-    sub = g[(;kind)]
-    x = map(row -> "$(row.name)", eachrow(sub))
-    y = map(x -> median(x).time / 1000^2, sub.bench)
-    display(barplot(x, y, title = "$kind Time (ms)", border=:none, width=50))
-end
-
-plot(df, "Read")
-plot(df, "Write")
-plot(df, "Lazy Iteration")
-plot(df, "Collect Tags")
diff --git a/src/XML.jl b/src/XML.jl
index 273bfda..8954140 100644
--- a/src/XML.jl
+++ b/src/XML.jl
@@ -1,31 +1,72 @@
 module XML
 
-using Mmap
-using OrderedCollections: OrderedDict
+include("tokenizer.jl")
+using .XMLTokenizer
 
 export
-    # Core Types:
-    Node, LazyNode,
-    # Interface:
-    children, nodetype, tag, attributes, value, is_simple, simplevalue, simple_value,
-    # Extended Interface for LazyNode:
-    parent, depth, next, prev
+    Node, NodeType,
+    CData, Comment, Declaration, Document, DTD, Element, ProcessingInstruction, Text,
+    nodetype, tag, attributes, value, children,
+    is_simple, simple_value,
+    depth, siblings,
+    xpath,
+    h
 
 #-----------------------------------------------------------------------------# escape/unescape
-const escape_chars = ('&' => "&amp;", '<' => "&lt;", '>' => "&gt;", "'" => "&apos;", '"' => "&quot;")
+const escape_chars = ('&' => "&amp;", '<' => "&lt;", '>' => "&gt;", '\'' => "&apos;", '"' => "&quot;")
+
+"""
+    escape(x::AbstractString) -> String
+
+Escape the five XML predefined entities: `&` `<` `>` `'` `"`.
+
+!!! note "Changed in v0.4"
+    `escape` is no longer idempotent.  In previous versions, already-escaped sequences like
+    `&amp;` were left untouched.  Now every `&` is escaped, so `escape("&amp;")` produces
+    `"&amp;amp;"`.  Call `escape` only on raw, unescaped text.
+"""
+escape(x::AbstractString) = foldl(replace, escape_chars; init = x)  # one pair per pass (multi-pair `replace` needs Julia 1.7; compat is 1.6); '&' must stay first in `escape_chars`
+
 function unescape(x::AbstractString)
-    result = x
-    for (pat, r) in reverse.(escape_chars)
-        result = replace(result, pat => r)
-    end
-    return result
-end
-function escape(x::String)
-    result = replace(x, r"&(?!amp;|quot;|apos;|gt;|lt;)" => "&amp;")
-    for (pat, r) in escape_chars[2:end]
-        result = replace(result, pat => r)
+    occursin('&', x) || return string(x)  # fast path: no '&' means nothing to decode
+    s = string(x)
+    io = IOBuffer(sizehint=ncodeunits(s))
+    i = 1
+    while i <= ncodeunits(s)
+        if s[i] == '&'
+            j = findnext(';', s, i + 1)  # a candidate reference runs from '&' at i to the next ';' at j
+            if !isnothing(j)
+                ref = SubString(s, i, j)
+                if ref == "&amp;"
+                    print(io, '&')
+                elseif ref == "&lt;"
+                    print(io, '<')
+                elseif ref == "&gt;"
+                    print(io, '>')
+                elseif ref == "&apos;"
+                    print(io, '\'')
+                elseif ref == "&quot;"
+                    print(io, '"')
+                elseif startswith(ref, "&#")
+                    is_hex = length(ref) > 3 && (ref[3] == 'x' || ref[3] == 'X')
+                    digits = SubString(s, i + (is_hex ? 3 : 2), prevind(s, j))  # prevind, not j - 1: a multi-byte char before ';' made j - 1 an invalid index (StringIndexError)
+                    cp = tryparse(UInt32, digits; base = is_hex ? 16 : 10)
+                    if !isnothing(cp) && isvalid(Char, cp)
+                        print(io, Char(cp))
+                    else
+                        print(io, ref)  # unparseable or invalid code point: keep the text verbatim
+                    end
+                else
+                    print(io, ref)  # not a predefined entity or character reference: keep the text verbatim
+                end
+                i = j + 1  # resume right after the ';'
+                continue
+            end
+        end
+        print(io, s[i])  # ordinary character, or a '&' with no ';' after it
+        i = nextind(s, i)
     end
-    return result
+    String(take!(io))
 end
 
 #-----------------------------------------------------------------------------# NodeType
@@ -34,9 +75,9 @@ end
     - Document                  # prolog & root Element
     - DTD                       # <!DOCTYPE ...>
     - Declaration               # <?xml attributes... ?>
-    - ProcessingInstruction    # <?NAME attributes... ?>
+    - ProcessingInstruction    # <?NAME content... ?>
     - Comment                   # <!-- ... -->
-    - CData                     # <![CData[...]]>
+    - CData                     # <![CDATA[...]]>
     - Element                   # <NAME attributes... > children... </NAME>
     - Text                      # text
 
@@ -45,381 +86,1261 @@ NodeTypes can be used to construct XML.Nodes:
     Document(children...)
     DTD(value)
     Declaration(; attributes)
-    ProcessingInstruction(tag, attributes)
+    ProcessingInstruction(tag, content)
     Comment(text)
     CData(text)
     Element(tag, children...; attributes)
     Text(text)
 """
-@enum(NodeType, CData, Comment, Declaration, Document, DTD, Element, ProcessingInstruction, Text)
+@enum NodeType::UInt8 CData Comment Declaration Document DTD Element ProcessingInstruction Text
 
+#-----------------------------------------------------------------------------# Node
+struct Node{S}
+    nodetype::NodeType
+    tag::Union{Nothing, S}
+    attributes::Union{Nothing, Vector{Pair{S, S}}}
+    value::Union{Nothing, S}
+    children::Union{Nothing, Vector{Node{S}}}
 
-#-----------------------------------------------------------------------------# includes
-include("raw.jl")
-include("dtd.jl")
+    function Node{S}(nodetype::NodeType, tag, attributes, value, children) where {S}
+        if nodetype in (Text, Comment, CData, DTD)
+            isnothing(tag) && isnothing(attributes) && !isnothing(value) && isnothing(children) ||
+                error("$nodetype nodes only accept a value.")
+        elseif nodetype === Element
+            !isnothing(tag) && isnothing(value) ||
+                error("Element nodes require a tag and no value.")
+        elseif nodetype === Declaration
+            isnothing(tag) && isnothing(value) && isnothing(children) ||
+                error("Declaration nodes only accept attributes.")
+        elseif nodetype === ProcessingInstruction
+            !isnothing(tag) && isnothing(attributes) && isnothing(children) ||
+                error("ProcessingInstruction nodes require a tag and only accept a value.")
+        elseif nodetype === Document
+            isnothing(tag) && isnothing(attributes) && isnothing(value) ||
+                error("Document nodes only accept children.")
+        end
+        new{S}(nodetype, tag, attributes, value, children)
+    end
+end
 
-abstract type AbstractXMLNode end
+#-----------------------------------------------------------------------------# interface
+nodetype(o::Node) = o.nodetype  # accessor for the `nodetype` field
+tag(o::Node) = o.tag  # accessor for the `tag` field (may be `nothing`)
 
-#-----------------------------------------------------------------------------# LazyNode
 """
-    LazyNode(file::AbstractString)
-    LazyNode(data::XML.Raw)
+    attributes(node::Node) -> Union{Nothing, Dict{String, String}}
 
-A Lazy representation of an XML node.
+Return the attributes of an `Element` or `Declaration` node as a `Dict`, or `nothing` if the
+node has no attributes.
+
+!!! note "Changed in v0.4"
+    In previous versions, `attributes` returned an `OrderedDict` from OrderedCollections.jl.
+    It now returns a standard `Dict`.  Attribute order is preserved internally but not exposed
+    by this function.  Use `node["key"]` for key-based access and `keys(node)` for ordered keys.
 """
-mutable struct LazyNode <: AbstractXMLNode
-    raw::Raw
-    tag::Union{Nothing, String}
-    attributes::Union{Nothing, OrderedDict{String, String}}
-    value::Union{Nothing, String}
-end
-LazyNode(raw::Raw) = LazyNode(raw, nothing, nothing, nothing)
+attributes(o::Node) = isnothing(o.attributes) ? nothing : Dict(o.attributes)  # builds a fresh Dict on every call
 
-function Base.getproperty(o::LazyNode, x::Symbol)
-    x === :raw && return getfield(o, :raw)
-    x === :nodetype && return nodetype(o.raw)
-    x === :tag && return isnothing(getfield(o, x)) ? setfield!(o, x, tag(o.raw)) : getfield(o, x)
-    x === :attributes && return isnothing(getfield(o, x)) ? setfield!(o, x, attributes(o.raw)) : getfield(o, x)
-    x === :value && return isnothing(getfield(o, x)) ? setfield!(o, x, value(o.raw)) : getfield(o, x)
-    x === :depth && return depth(o.raw)
-    x === :children && return LazyNode.(children(o.raw))
-    error("type LazyNode has no field $(x)")
-end
-Base.propertynames(o::LazyNode) = (:raw, :nodetype, :tag, :attributes, :value, :depth, :children)
+value(o::Node) = o.value  # accessor for the `value` field (may be `nothing`)
+children(o::Node) = something(o.children, ())  # empty tuple when the node stores no child vector
+
+is_simple(o::Node) = o.nodetype === Element &&  # an Element without attributes ...
+    (o.attributes === nothing || isempty(o.attributes)) &&
+    o.children !== nothing && length(o.children) == 1 &&  # ... holding exactly one child ...
+    first(o.children).nodetype in (Text, CData)  # ... which is Text or CData
 
-Base.show(io::IO, o::LazyNode) = _show_node(io, o)
+simple_value(o::Node) = !is_simple(o) ? error("`simple_value` is only defined for simple nodes.") :
+    first(o.children).value  # value of the single Text/CData child
 
-Base.read(io::IO, ::Type{LazyNode}) = LazyNode(read(io, Raw))
-Base.read(filename::AbstractString, ::Type{LazyNode}) = LazyNode(read(filename, Raw))
-Base.parse(x::AbstractString, ::Type{LazyNode}) = LazyNode(parse(x, Raw))
+#-----------------------------------------------------------------------------# tree navigation
 
-children(o::LazyNode) = LazyNode.(children(o.raw))
-parent(o::LazyNode) = LazyNode(parent(o.raw))
-depth(o::LazyNode) = depth(o.raw)
+"""
+    parent(child::Node, root::Node) -> Node
+
+Return the parent of `child` within the tree rooted at `root`.
 
-Base.IteratorSize(::Type{LazyNode}) = Base.SizeUnknown()
-Base.eltype(::Type{LazyNode}) = LazyNode
+Since `Node` does not store parent pointers, this performs a tree search from `root`.
+Throws an error if `child` is not found or if `child === root`.
+"""
+function Base.parent(child::Node, root::Node)
+    child !== root || error("Root node has no parent.")
+    owner = _find_parent(child, root)  # identity-based (`===`) search below `root`
+    owner === nothing && error("Node not found in tree.")
+    return owner
+end
 
-function Base.iterate(o::LazyNode, state=o)
-    n = next(state)
-    return isnothing(n) ? nothing : (n, n)
+function _find_parent(child::Node, current::Node)
+    for kid in children(current)
+        kid === child && return current  # `current` directly owns `child`
+        found = _find_parent(child, kid)  # otherwise look inside kid's subtree
+        found === nothing || return found
+    end
+    return nothing  # `child` does not occur below `current`
 end
 
-function next(o::LazyNode)
-    n = next(o.raw)
-    isnothing(n) && return nothing
-    n.type === RawElementClose ? next(LazyNode(n)) : LazyNode(n)
+"""
+    depth(child::Node, root::Node) -> Int
+
+Return the depth of `child` within the tree rooted at `root` (root has depth 0).
+
+Since `Node` does not store parent pointers, this performs a tree search from `root`.
+Throws an error if `child` is not found in the tree.
+"""
+function depth(child::Node, root::Node)
+    child === root && return 0  # the root itself sits at depth 0
+    level = _find_depth(child, root, 0)
+    level === nothing && error("Node not found in tree.")
+    return level
 end
-function prev(o::LazyNode)
-    n = prev(o.raw)
-    isnothing(n) && return nothing
-    n.type === RawElementClose ? prev(LazyNode(n)) : LazyNode(n)
+
+function _find_depth(child::Node, current::Node, d::Int)
+    for kid in children(current)
+        kid === child && return d + 1  # `d` is the depth of `current`
+        below = _find_depth(child, kid, d + 1)
+        below === nothing || return below
+    end
+    return nothing  # `child` does not occur below `current`
 end
 
-#-----------------------------------------------------------------------------# Node
 """
-    Node(nodetype, tag, attributes, value, children)
-    Node(node::Node; kw...)  # copy node with keyword overrides
-    Node(node::LazyNode)  # un-lazy the LazyNode
+    siblings(child::Node, root::Node) -> Vector{Node}
 
-A representation of an XML DOM node.  For simpler construction, use `(::NodeType)(args...)`
+Return the siblings of `child` (other children of the same parent) within the tree rooted
+at `root`.  The returned vector does not include `child` itself.
+
+Throws an error if `child` is the root or is not found in the tree.
 """
-struct Node <: AbstractXMLNode
-    nodetype::NodeType
-    tag::Union{Nothing, String}
-    attributes::Union{Nothing, OrderedDict{String, String}}
-    value::Union{Nothing, String}
-    children::Union{Nothing, Vector{Node}}
-
-    function Node(nodetype::NodeType, tag=nothing, attributes=nothing, value=nothing, children=nothing)
-        new(nodetype,
-            isnothing(tag) ? nothing : string(tag),
-            isnothing(attributes) ? nothing : OrderedDict(string(k) => string(v) for (k, v) in pairs(attributes)),
-            isnothing(value) ? nothing : string(value),
-            isnothing(children) ? nothing :
-                children isa Node ? [children] :
-                children isa Vector{Node} ? children :
-                children isa Vector ? map(Node, children) :
-                children isa Tuple ? map(Node, collect(children)) :
-                [Node(children)]
-        )
+function siblings(child::Node, root::Node)
+    owner = parent(child, root)  # errors for the root or for a node outside the tree
+    return [kid for kid in children(owner) if kid !== child]
+end
+
+include("xpath.jl")
+
+#-----------------------------------------------------------------------------# _to_node
+_to_node(n::Node{String}) = n  # already a String-backed node: returned unchanged
+_to_node(n::Node) = throw(ArgumentError("Expected Node{String}, got $(typeof(n))"))  # any other Node parameterization is rejected
+_to_node(x) = Node{String}(Text, nothing, nothing, string(x), nothing)  # every other value becomes a Text node via `string`
+
+#-----------------------------------------------------------------------------# NodeType constructors
+function (T::NodeType)(args...; attrs...)  # construct a `Node{String}` of type `T` from positional values and keyword attributes
+    S = String  # nodes built through this call always use `String` storage
+    if T in (Text, Comment, CData, DTD)  # value-only node types: exactly one positional argument
+        length(args) == 1 || error("$T nodes require exactly one value argument.")
+        !isempty(attrs) && error("$T nodes do not accept attributes.")
+        Node{S}(T, nothing, nothing, string(only(args)), nothing)
+    elseif T === Element  # first positional argument is the tag, the rest are children
+        isempty(args) && error("Element nodes require at least a tag.")
+        t = string(first(args))
+        a = Pair{S,S}[string(k) => string(v) for (k, v) in pairs(attrs)]  # `string` (not `String`) so values like `id=1` work
+        c = Node{S}[_to_node(x) for x in args[2:end]]
+        Node{S}(T, t, a, nothing, c)
+    elseif T === Declaration  # keyword attributes only; stored as `nothing` when none are given
+        !isempty(args) && error("Declaration nodes only accept keyword attributes.")
+        a = isempty(attrs) ? nothing : Pair{S,S}[string(k) => string(v) for (k, v) in pairs(attrs)]
+        Node{S}(T, nothing, a, nothing, nothing)
+    elseif T === ProcessingInstruction  # target plus optional content
+        length(args) >= 1 || error("ProcessingInstruction nodes require a target.")
+        length(args) <= 2 || error("ProcessingInstruction nodes accept a target and optional content.")
+        !isempty(attrs) && error("ProcessingInstruction nodes do not accept attributes.")
+        t = string(args[1])
+        v = length(args) == 2 ? string(args[2]) : nothing
+        Node{S}(T, t, nothing, v, nothing)
+    elseif T === Document  # every positional argument becomes a child
+        !isempty(attrs) && error("Document nodes do not accept attributes.")
+        c = Node{S}[_to_node(x) for x in args]
+        Node{S}(T, nothing, nothing, nothing, c)
     end
 end
 
-function Node(o::Node, x...; kw...)
-    attrs = !isnothing(kw) ?
-        merge(
-            OrderedDict(string(k) => string(v) for (k, v) in pairs(kw)),
-            isnothing(o.attributes) ? OrderedDict{String,String}() : o.attributes
-        ) :
-        o.attributes
-    children = isempty(x) ? o.children : vcat(isnothing(o.children) ? [] : o.children, collect(x))
-    Node(o.nodetype, o.tag, attrs, o.value, children)
+#-----------------------------------------------------------------------------# equality
+_eq(::Nothing, ::Nothing) = true
+_eq(::Nothing, b) = isempty(b)  # `nothing` compares equal to an empty collection
+_eq(a, ::Nothing) = isempty(a)  # ... in either argument position
+_eq(a, b) = a == b  # otherwise plain `==`
+
+# Attribute equality is order-insensitive per XML spec
+function _attrs_eq(a, b)
+    no_a = a === nothing || isempty(a)  # `nothing` and an empty list both mean "no attributes"
+    no_b = b === nothing || isempty(b)
+    # Both sides empty -> equal; exactly one side empty -> different.
+    (no_a || no_b) && return no_a && no_b
+    # Otherwise the sizes must agree ...
+    length(a) == length(b) || return false
+    # ... and every pair of `a` must also occur somewhere in `b`.
+    matched = all(p -> p in b, a)
+    return matched
 end
 
-function Node(node::LazyNode)
-    nodetype = node.nodetype
-    tag = node.tag
-    attributes = node.attributes
-    value = node.value
-    c = XML.children(node)
-    Node(nodetype, tag, attributes, value, isempty(c) ? nothing : map(Node, c))
+function Base.:(==)(a::Node, b::Node)  # NOTE(review): no matching `Base.hash` method is visible in this part of the file
+    a.nodetype == b.nodetype &&
+    a.tag == b.tag &&
+    _attrs_eq(a.attributes, b.attributes) &&  # attribute order is ignored
+    a.value == b.value &&
+    _eq(a.children, b.children)  # `nothing` and an empty child list compare equal
 end
 
-Node(data::Raw) = Node(LazyNode(data))
+#-----------------------------------------------------------------------------# indexing
+Base.getindex(o::Node, i::Integer) = children(o)[i]  # i-th child
+Base.getindex(o::Node, ::Colon) = children(o)  # all children (empty tuple when there are none)
+Base.lastindex(o::Node) = lastindex(children(o))  # enables `node[end]`
+Base.only(o::Node) = only(children(o))  # the single child; `only` throws unless there is exactly one
+Base.length(o::Node) = length(children(o))  # number of children
 
-# Anything that's not Vector{UInt8} or a (Lazy)Node is converted to a Text Node
-Node(x) = Node(Text, nothing, nothing, string(x), nothing)
+function Base.get(o::Node, key::AbstractString, default)
+    o.attributes === nothing && return default
+    for pair in o.attributes  # linear scan; the first matching name wins
+        first(pair) == key && return last(pair)
+    end
+    return default
 end
 
-h(tag::Union{Symbol, String}, children...; kw...) = Node(Element, tag, kw, nothing, children)
-Base.getproperty(::typeof(h), tag::Symbol) = h(tag)
-(o::Node)(children...; kw...) = Node(o, Node.(children)...; kw...)
+const _MISSING_ATTR = gensym(:missing_attr)
 
-# NOT in-place for Text Nodes
-function escape!(o::Node, warn::Bool=true)
-    if o.nodetype == Text
-        warn && @warn "escape!() called on a Text Node creates a new node."
-        return Text(escape(o.value))
+function Base.getindex(o::Node, key::AbstractString)
+    found = get(o, key, _MISSING_ATTR)  # sentinel separates "absent" from any stored value
+    found === _MISSING_ATTR && throw(KeyError(key))
+    return found
+end
+
+function Base.haskey(o::Node, key::AbstractString)
+    return !(get(o, key, _MISSING_ATTR) === _MISSING_ATTR)  # present iff the lookup did not fall back to the sentinel
+end
+
+Base.keys(o::Node) = isnothing(o.attributes) ? () : first.(o.attributes)  # attribute names in stored order; empty tuple when there is no attribute list
+
+#-----------------------------------------------------------------------------# mutation
+function Base.setindex!(o::Node, val, i::Integer)
+    o.children === nothing && error("Node has no children.")
+    return o.children[i] = _to_node(val)  # non-Node values are wrapped by `_to_node`
+end
+
+function Base.setindex!(o::Node, val, key::AbstractString)
+    o.attributes === nothing && error("Node has no attributes.")
+    text = string(val)
+    for (slot, pair) in pairs(o.attributes)
+        if first(pair) == key  # known name: overwrite in place so its position is kept
+            o.attributes[slot] = key => text
+            return text
+        end
     end
+    push!(o.attributes, key => text)  # unknown name: append as a new attribute
+    return text
 end
-function unescape!(o::Node, warn::Bool=true)
-    if o.nodetype == Text
-        warn && @warn "unescape!() called on a Text Node creates a new node."
-        return Text(unescape(o.value))
+
+function Base.push!(a::Node, b)
+    a.children === nothing && error("Node does not accept children.")
+    push!(a.children, _to_node(b))  # non-Node values are wrapped by `_to_node`
+    return a
+end
+
+function Base.pushfirst!(a::Node, b)
+    a.children === nothing && error("Node does not accept children.")
+    pushfirst!(a.children, _to_node(b))  # non-Node values are wrapped by `_to_node`
+    return a
+end
+
+#-----------------------------------------------------------------------------# show (REPL)
+function Base.show(io::IO, o::Node)  # compact one-line summary: node type, then a short rendering of the node
+    nt = o.nodetype
+    printstyled(io, nt; color=:light_green)
+    if nt === Text
+        printstyled(io, ' ', repr(o.value))  # `repr` keeps quotes and escapes visible
+    elseif nt === Element
+        printstyled(io, " <", o.tag; color=:light_cyan)
+        if !isnothing(o.attributes)
+            for (k, v) in o.attributes
+                print(io, ' ', k, '=', '"', v, '"')  # value printed as stored (no `escape` call here)
+            end
+        end
+        printstyled(io, '>'; color=:light_cyan)
+        n = length(children(o))
+        n > 0 && printstyled(io, n == 1 ? " (1 child)" : " ($n children)"; color=:light_black)  # children are counted, not printed
+    elseif nt === DTD
+        printstyled(io, " <!DOCTYPE "; color=:light_cyan)
+        printstyled(io, o.value; color=:light_black)
+        printstyled(io, '>'; color=:light_cyan)
+    elseif nt === Declaration
+        printstyled(io, " <?xml"; color=:light_cyan)
+        if !isnothing(o.attributes)
+            for (k, v) in o.attributes
+                print(io, ' ', k, '=', '"', v, '"')
+            end
+        end
+        printstyled(io, "?>"; color=:light_cyan)
+    elseif nt === ProcessingInstruction
+        printstyled(io, " <?", o.tag; color=:light_cyan)
+        !isnothing(o.value) && print(io, ' ', o.value)  # content is optional
+        printstyled(io, "?>"; color=:light_cyan)
+    elseif nt === Comment
+        printstyled(io, " <!--"; color=:light_cyan)
+        printstyled(io, o.value; color=:light_black)
+        printstyled(io, "-->"; color=:light_cyan)
+    elseif nt === CData
+        printstyled(io, " <![CDATA["; color=:light_cyan)
+        printstyled(io, o.value; color=:light_black)
+        printstyled(io, "]]>"; color=:light_cyan)
+    elseif nt === Document
+        n = length(children(o))
+        n > 0 && printstyled(io, n == 1 ? " (1 child)" : " ($n children)"; color=:light_black)
     end
-    isnothing(o.children) && return o
-    map!(x -> unescape!(x, false), o.children, o.children)
-    o
 end
 
+#-----------------------------------------------------------------------------# show (text/xml)
+function _print_attrs(io::IO, attributes)
+    attributes === nothing && return
+    for (name, val) in attributes
+        print(io, ' ', name, "=\"", escape(val), '"')  # values are escaped, names are written as-is
+    end
+end
 
-Base.read(filename::AbstractString, ::Type{Node}) = Node(read(filename, Raw))
-Base.read(io::IO, ::Type{Node}) = Node(read(io, Raw))
-Base.parse(x::AbstractString, ::Type{Node}) = Node(parse(x, Raw))
+function _write_xml(io::IO, node::Node, depth::Int=0, indent::Int=2, preserve::Bool=false)  # serialize `node` (recursively) as XML text onto `io`
+    pad = preserve ? "" : ' ' ^ (indent * depth)  # no indentation while whitespace is being preserved
+    nt = node.nodetype
+    if nt === Text
+        print(io, escape(node.value))  # text is written without padding
+    elseif nt === Element
+        # Check xml:space on this element
+        child_preserve = preserve
+        if !isnothing(node.attributes)
+            for (k, v) in node.attributes
+                k == "xml:space" && (child_preserve = v == "preserve")  # any value other than "preserve" switches it off
+            end
+        end
+        print(io, pad, '<', node.tag)
+        _print_attrs(io, node.attributes)
+        ch = node.children
+        if isnothing(ch) || isempty(ch)
+            print(io, "/>")  # no children: self-closing form
+        elseif length(ch) == 1 && only(ch).nodetype === Text
+            print(io, '>')  # a single Text child stays on the same line as its tags
+            _write_xml(io, only(ch), 0, 0, child_preserve)
+            print(io, "</", node.tag, '>')
+        else
+            child_preserve ? print(io, '>') : println(io, '>')  # one child per line unless preserving
+            for child in ch
+                _write_xml(io, child, depth + 1, indent, child_preserve)
+                child_preserve || println(io)
+            end
+            print(io, child_preserve ? "" : pad, "</", node.tag, '>')
+        end
+    elseif nt === Declaration
+        print(io, pad, "<?xml")
+        _print_attrs(io, node.attributes)
+        print(io, "?>")
+    elseif nt === ProcessingInstruction
+        print(io, pad, "<?", node.tag)
+        isnothing(node.value) || print(io, ' ', node.value)  # content is optional
+        print(io, "?>")
+    elseif nt === Comment
+        print(io, pad, "<!--", node.value, "-->")  # value written verbatim (no `escape`)
+    elseif nt === CData
+        print(io, pad, "<![CDATA[", node.value, "]]>")  # value written verbatim (no `escape`)
+    elseif nt === DTD
+        print(io, pad, "<!DOCTYPE ", node.value, '>')
+    elseif nt === Document
+        ch = node.children
+        if !isnothing(ch)
+            for (i, child) in enumerate(ch)
+                _write_xml(io, child, 0, indent, preserve)  # top-level children always start at depth 0
+                i < length(ch) && println(io)  # newline between children, none after the last
+            end
+        end
+    end
+end
 
-Base.setindex!(o::Node, val, i::Integer) = o.children[i] = Node(val)
-Base.push!(a::Node, b::Node) = push!(a.children, b)
-Base.pushfirst!(a::Node, b::Node) = pushfirst!(a.children, b)
+Base.show(io::IO, ::MIME"text/xml", node::Node) = _write_xml(io, node)
 
-Base.setindex!(o::Node, val, key::AbstractString) = (o.attributes[key] = string(val))
-Base.getindex(o::Node, val::AbstractString) = o.attributes[val]
-Base.haskey(o::Node, key::AbstractString) = isnothing(o.attributes) ? false : haskey(o.attributes, key)
-Base.keys(o::Node) = isnothing(o.attributes) ? () : keys(o.attributes)
+#-----------------------------------------------------------------------------# write / read
+write(node::Node; indentsize::Int=2) = (io = IOBuffer(); _write_xml(io, node, 0, indentsize); String(take!(io)))  # render to a String
+write(filename::AbstractString, node::Node; kw...) = open(io -> write(io, node; kw...), filename, "w")  # render into a file opened with mode "w"
+write(io::IO, node::Node; indentsize::Int=2) = _write_xml(io, node, 0, indentsize)  # render onto an existing stream
 
-Base.show(io::IO, o::Node) = _show_node(io, o)
+Base.read(filename::AbstractString, ::Type{Node}) = parse(read(filename, String), Node)  # whole file is read into a String, then parsed
+Base.read(io::IO, ::Type{Node}) = parse(read(io, String), Node)  # same, reading the rest of an open stream
 
-#-----------------------------------------------------------------------------# Node Constructors
-function (T::NodeType)(args...; attr...)
-    if T === Document
-        !isempty(attr) && error("Document nodes do not have attributes.")
-        Node(T, nothing, nothing, nothing, args)
-    elseif T === DTD
-        !isempty(attr) && error("DTD nodes only accept a value.")
-        length(args) > 1 && error("DTD nodes only accept a value.")
-        Node(T, nothing, nothing, only(args))
-    elseif T === Declaration
-        !isempty(args) && error("Declaration nodes only accept attributes")
-        Node(T, nothing, attr)
-    elseif T === ProcessingInstruction
-        length(args) == 1 || error("ProcessingInstruction nodes require a tag and attributes.")
-        Node(T, only(args), attr)
-    elseif T === Comment
-        !isempty(attr) && error("Comment nodes do not have attributes.")
-        length(args) > 1 && error("Comment nodes only accept a single input.")
-        Node(T, nothing, nothing, only(args))
-    elseif T === CData
-        !isempty(attr) && error("CData nodes do not have attributes.")
-        length(args) > 1 && error("CData nodes only accept a single input.")
-        Node(T, nothing, nothing, only(args))
-    elseif T === Text
-        !isempty(attr) && error("Text nodes do not have attributes.")
-        length(args) > 1 && error("Text nodes only accept a single input.")
-        Node(T, nothing, nothing, only(args))
-    elseif T === Element
-        tag = first(args)
-        Node(T, tag, attr, nothing, args[2:end])
-    else
-        error("Unreachable reached while trying to create a Node via (::NodeType)(args...; kw...).")
+#-----------------------------------------------------------------------------# parse
+Base.parse(::Type{Node}, xml::AbstractString) = parse(xml, Node)  # argument-order alias for the method below
+
+function Base.parse(xml::AbstractString, ::Type{Node})  # String-backed nodes; text and attribute values go through `unescape`
+    _parse(String(xml), String, unescape)
+end
+
+function Base.parse(xml::AbstractString, ::Type{Node{SubString{String}}})  # SubString-backed nodes; `identity` leaves text exactly as written
+    _parse(String(xml), SubString{String}, identity)
+end
+
+_to(::Type{String}, s::AbstractString) = String(s)  # String storage: convert to `String`
+_to(::Type{SubString{String}}, s::SubString{String}) = s  # SubString storage: reuse the slice as-is
+
+function _parse(xml::String, ::Type{S}, convert_text::F) where {S, F}  # build a Document `Node{S}` from the token stream of `xml`
+    tags = S[]  # names of the currently open elements, innermost last
+    attrs_stack = Vector{Union{Nothing, Vector{Pair{S,S}}}}()  # attribute list per open element (`nothing` until one is seen)
+    children_stack = Vector{Vector{Node{S}}}()  # child list per open element
+    push!(children_stack, Node{S}[])  # bottom entry collects the Document's own children
+
+    pending_attr_name = SubString(xml, 1, 0)  # empty placeholder until the first attribute name arrives
+    decl_attrs = nothing  # non-`nothing` only between the XML declaration's open and close tokens
+    pending_pi_tag = SubString(xml, 1, 0)
+    pending_pi_value = nothing
+    in_close_tag = false  # NOTE(review): only ever toggled below, never read to make a decision
+
+    for token in tokenize(xml)
+        k = token.kind
+
+        if k === TOKEN_TEXT
+            push!(last(children_stack), Node{S}(Text, nothing, nothing, convert_text(token.raw), nothing))
+
+        elseif k === TOKEN_OPEN_TAG
+            push!(tags, _to(S, tag_name(token)))  # open a new frame on all three stacks
+            push!(attrs_stack, nothing)
+            push!(children_stack, Node{S}[])
+
+        elseif k === TOKEN_SELF_CLOSE
+            t = pop!(tags)  # presumably follows a TOKEN_OPEN_TAG for the same element — confirm against the tokenizer
+            a = pop!(attrs_stack)
+            pop!(children_stack)
+            push!(last(children_stack), Node{S}(Element, t, a, nothing, nothing))  # self-closed elements store `nothing` children
+
+        elseif k === TOKEN_TAG_CLOSE
+            in_close_tag && (in_close_tag = false)
+
+        elseif k === TOKEN_CLOSE_TAG
+            close_name = tag_name(token)
+            isempty(tags) && error("Closing tag </$close_name> with no matching open tag.")
+            t = pop!(tags)
+            t == close_name || error("Mismatched tags: expected </$t>, got </$close_name>.")
+            a = pop!(attrs_stack)
+            c = pop!(children_stack)
+            push!(last(children_stack), Node{S}(Element, t, a, nothing, isempty(c) ? nothing : c))  # empty child list is stored as `nothing`
+            in_close_tag = true
+
+        elseif k === TOKEN_ATTR_NAME
+            pending_attr_name = token.raw  # remembered until the matching value token
+
+        elseif k === TOKEN_ATTR_VALUE
+            val = convert_text(attr_value(token))
+            name = _to(S, pending_attr_name)
+            if decl_attrs !== nothing  # inside `<?xml ... ?>`: the attribute belongs to the declaration
+                any(p -> first(p) == name, decl_attrs) && error("Duplicate attribute: $name")
+                push!(decl_attrs, name => val)
+            elseif !isempty(attrs_stack)  # otherwise it belongs to the innermost open element
+                if isnothing(last(attrs_stack))
+                    attrs_stack[end] = Pair{S,S}[]  # create the list lazily on the first attribute
+                end
+                any(p -> first(p) == name, last(attrs_stack)) && error("Duplicate attribute: $name")
+                push!(last(attrs_stack), name => val)
+            end
+
+        elseif k === TOKEN_XML_DECL_OPEN
+            decl_attrs = Pair{S,S}[]
+
+        elseif k === TOKEN_XML_DECL_CLOSE
+            a = isempty(decl_attrs) ? nothing : decl_attrs  # assumes a TOKEN_XML_DECL_OPEN came first — TODO confirm
+            push!(last(children_stack), Node{S}(Declaration, nothing, a, nothing, nothing))
+            decl_attrs = nothing
+
+        elseif k === TOKEN_COMMENT_CONTENT
+            push!(last(children_stack), Node{S}(Comment, nothing, nothing, _to(S, token.raw), nothing))  # raw text, no `convert_text`
+
+        elseif k === TOKEN_CDATA_CONTENT
+            push!(last(children_stack), Node{S}(CData, nothing, nothing, _to(S, token.raw), nothing))  # raw text, no `convert_text`
+
+        elseif k === TOKEN_DOCTYPE_CONTENT
+            push!(last(children_stack), Node{S}(DTD, nothing, nothing, _to(S, lstrip(token.raw)), nothing))  # leading whitespace dropped
+
+        elseif k === TOKEN_PI_OPEN
+            pending_pi_tag = pi_target(token)
+            pending_pi_value = nothing  # reset so an earlier instruction's content cannot leak in
+
+        elseif k === TOKEN_PI_CONTENT
+            content = strip(token.raw)
+            pending_pi_value = isempty(content) ? nothing : _to(S, content)  # whitespace-only content counts as none
+
+        elseif k === TOKEN_PI_CLOSE
+            push!(last(children_stack), Node{S}(ProcessingInstruction, _to(S, pending_pi_tag), nothing, pending_pi_value, nothing))
+        end
     end
+
+    !isempty(tags) && error("Unclosed tags: $(join(tags, ", "))")
+    doc_children = only(children_stack)  # only the Document frame may remain
+    Node{S}(Document, nothing, nothing, nothing, isempty(doc_children) ? nothing : doc_children)
 end
 
-#-----------------------------------------------------------------------------# !!! common !!!
-# Everything below here is common to all data structures
+#-----------------------------------------------------------------------------# h (HTML/XML element builder)
+"""
+    h(tag, children...; attrs...)
+    h.tag(children...; attrs...)
 
+Convenience constructor for `Element` nodes.
 
-#-----------------------------------------------------------------------------# interface fallbacks
-nodetype(o) = o.nodetype
-tag(o) = o.tag
-attributes(o) = o.attributes
-value(o) = o.value
-children(o::T) where {T} = isnothing(o.children) ? () : o.children
+    h("div", "hello"; class="main")  # <div class="main">hello</div>
+    h.div("hello"; class="main")     # same thing
+"""
+function h(tag::Union{Symbol, AbstractString}, children...; attrs...)
+    t = String(tag)
+    a = Pair{String,String}[string(k) => string(v) for (k, v) in pairs(attrs)]  # `string` (not `String`) so values like `id=1` work
+    c = Node{String}[_to_node(x) for x in children]  # non-Node children become Text nodes
+    Node{String}(Element, t, a, nothing, c)
+end
 
-depth(o) = missing
-parent(o) = missing
-next(o) = missing
-prev(o) = missing
+Base.getproperty(::typeof(h), tag::Symbol) = h(tag)
 
-is_simple(o) = nodetype(o) == Element && (isnothing(attributes(o)) || isempty(attributes(o))) &&
-    length(children(o)) == 1 && nodetype(only(o)) in (Text, CData)
+function (o::Node)(args...; attrs...)
+    o.nodetype === Element || error("Only Element nodes are callable.")
+    kids = o.children === nothing ? () : o.children
+    kept = o.attributes === nothing ? () : (Symbol(k) => v for (k, v) in o.attributes)
+    return h(o.tag, kids..., args...; kept..., attrs...)  # builds a new node through `h`; existing children and attributes come first
+end
 
-simple_value(o) = is_simple(o) ? value(only(o)) : error("`XML.simple_value` is only defined for simple nodes.")
+#-----------------------------------------------------------------------------# DTD parsing
+struct ElementDecl
+    name::String
+    content::String  # "EMPTY", "ANY", or content model like "(#PCDATA)" or "(a,b,c)*"
+end
 
-Base.@deprecate_binding simplevalue simple_value
+struct AttDecl
+    element::String
+    name::String
+    type::String     # "CDATA", "ID", "(val1|val2)", "NOTATION (a|b)", etc.
+    default::String  # "#REQUIRED", "#IMPLIED", "#FIXED \"val\"", or "\"val\""
+end
 
-#-----------------------------------------------------------------------------# nodes_equal
-function nodes_equal(a, b)
-    out = XML.tag(a) == XML.tag(b)
-    out &= XML.nodetype(a) == XML.nodetype(b)
-    out &= XML.attributes(a) == XML.attributes(b)
-    out &= XML.value(a) == XML.value(b)
-    out &= length(XML.children(a)) == length(XML.children(b))
-    out &= all(nodes_equal(ai, bi) for (ai,bi) in zip(XML.children(a), XML.children(b)))
-    return out
+struct EntityDecl
+    name::String
+    value::Union{Nothing, String}       # replacement text (internal entities)
+    external_id::Union{Nothing, String} # "SYSTEM \"uri\"" or "PUBLIC \"pubid\" \"uri\""
+    parameter::Bool
 end
 
-Base.:(==)(a::AbstractXMLNode, b::AbstractXMLNode) = nodes_equal(a, b)
+struct NotationDecl
+    name::String
+    external_id::String
+end
 
-#-----------------------------------------------------------------------------# parse
-Base.parse(::Type{T}, str::AbstractString) where {T <: AbstractXMLNode} = parse(str, T)
+struct ParsedDTD
+    root::String
+    system_id::Union{Nothing, String}
+    public_id::Union{Nothing, String}
+    elements::Vector{ElementDecl}
+    attributes::Vector{AttDecl}
+    entities::Vector{EntityDecl}
+    notations::Vector{NotationDecl}
+end
 
-#-----------------------------------------------------------------------------# indexing
-Base.getindex(o::Union{Raw, AbstractXMLNode}) = o
-Base.getindex(o::Union{Raw, AbstractXMLNode}, i::Integer) = children(o)[i]
-Base.getindex(o::Union{Raw, AbstractXMLNode}, ::Colon) = children(o)
-Base.lastindex(o::Union{Raw, AbstractXMLNode}) = lastindex(children(o))
-
-Base.only(o::Union{Raw, AbstractXMLNode}) = only(children(o))
-
-Base.length(o::AbstractXMLNode) = length(children(o))
-
-#-----------------------------------------------------------------------------# printing
-function _show_node(io::IO, o)
-    printstyled(io, typeof(o), ' '; color=:light_black)
-    !ismissing(depth(o)) && printstyled(io, "(depth=", depth(o), ") ", color=:light_black)
-    printstyled(io, nodetype(o), ; color=:light_green)
-    if o.nodetype === Text
-        printstyled(io, ' ', repr(value(o)))
-    elseif o.nodetype === Element
-        printstyled(io, " <", tag(o), color=:light_cyan)
-        _print_attrs(io, o; color=:light_yellow)
-        printstyled(io, '>', color=:light_cyan)
-        _print_n_children(io, o)
-    elseif o.nodetype === DTD
-        printstyled(io, " <!DOCTYPE "; color=:light_cyan)
-        printstyled(io, value(o), color=:light_black)
-        printstyled(io, '>', color=:light_cyan)
-    elseif o.nodetype === Declaration
-        printstyled(io, " <?xml", color=:light_cyan)
-        _print_attrs(io, o; color=:light_yellow)
-        printstyled(io, "?>", color=:light_cyan)
-    elseif o.nodetype === ProcessingInstruction
-        printstyled(io, " <?", tag(o), color=:light_cyan)
-        _print_attrs(io, o; color=:light_yellow)
-        printstyled(io, "?>", color=:light_cyan)
-    elseif o.nodetype === Comment
-        printstyled(io, " <!--", color=:light_cyan)
-        printstyled(io, value(o), color=:light_black)
-        printstyled(io, "-->", color=:light_cyan)
-    elseif o.nodetype === CData
-        printstyled(io, " <![CData[", color=:light_cyan)
-        printstyled(io, value(o), color=:light_black)
-        printstyled(io, "]]>", color=:light_cyan)
-    elseif o.nodetype === Document
-        _print_n_children(io, o)
-    elseif o.nodetype === UNKNOWN
-        printstyled(io, "Unknown", color=:light_cyan)
-        _print_n_children(io, o)
-    else
-        error("Unreachable reached")
+# DTD parsing helpers
+# Return `true` when `c` is an ASCII letter, an ASCII digit, or one of `_`, `-`, `.`, `:`.
+# These are the only characters the DTD helpers accept inside a name.
+@inline function _dtd_is_name_char(c::Char)
+    isascii(c) || return false
+    return isletter(c) || isdigit(c) || c in ('_', '-', '.', ':')
+end
+
+# Advance `pos` past consecutive whitespace characters (as decided by `isspace`) in `s`.
+# Returns the first index that is either not whitespace or lies beyond `ncodeunits(s)`.
+function _dtd_skip_ws(s, pos)
+    while pos <= ncodeunits(s) && isspace(s[pos])
+        pos += 1  # steps one code unit per whitespace character
     end
+    pos
 end
 
-function _print_attrs(io::IO, o; color=:normal)
-    attr = attributes(o)
-    isnothing(attr) && return nothing
-    for (k,v) in attr
-        # printstyled(io, ' ', k, '=', '"', v, '"'; color)
-        print(io, ' ', k, '=', '"', v, '"')
+# Read a name starting at `pos`, after skipping leading whitespace.
+#
+# Returns `(name, next)`: `name` is a `SubString` of `s` covering the run of characters
+# accepted by `_dtd_is_name_char`, and `next` is the index just past that run.
+# Raises an error when no name character is found at the (whitespace-skipped) position.
+function _dtd_read_name(s, pos)
+    pos = _dtd_skip_ws(s, pos)
+    start = pos
+    while pos <= ncodeunits(s) && _dtd_is_name_char(s[pos])
+        pos += 1
     end
+    start == pos && error("Expected name at position $pos in DTD")  # empty run: no name here
+    SubString(s, start, pos - 1), pos
 end
-function _print_n_children(io::IO, o::Node)
-    n = length(children(o))
-    text = n == 0 ? "" : n == 1 ? " (1 child)" : " ($n children)"
-    printstyled(io, text, color=:light_black)
-end
-_print_n_children(io::IO, o) = nothing
-
-#-----------------------------------------------------------------------------# write_xml
-write(x; kw...) = (io = IOBuffer(); write(io, x; kw...); String(take!(io)))
-
-write(filename::AbstractString, x; kw...) = open(io -> write(io, x; kw...), filename, "w")
-
-function write(io::IO, x, ctx::Vector{Bool}=[false]; indentsize::Int=2, depth::Int=1)
-    indent = ' ' ^ indentsize
-    nodetype = XML.nodetype(x)
-    tag = XML.tag(x)
-    value = XML.value(x)
-    children = XML.children(x)
-
-    padding = indent ^ max(0, depth - 1)
-    !ctx[end] && print(io, padding)
-
-    if nodetype === Text
-        print(io, value)
-
-    elseif nodetype === Element
-        push!(ctx, ctx[end])
-        update_ctx!(ctx, x)
-        print(io, '<', tag)
-        _print_attrs(io, x)
-        print(io, isempty(children) ? '/' : "", '>')
-        if !isempty(children)
-            if length(children) == 1 && XML.nodetype(only(children)) === Text
-                write(io, only(children), ctx; indentsize=0)
-                print(io, "</", tag, '>')
-            else
-                !ctx[end] && println(io)
-                foreach(children) do child
-                    write(io, child, ctx; indentsize, depth=depth + 1)
-                    !ctx[end] && println(io)
-                end
-                print(io, !ctx[end] ? padding : "", "</", tag, '>')
+
+# Read a quoted literal (`"..."` or `'...'`) starting at `pos`, after skipping leading
+# whitespace.
+#
+# Returns `(content, next)`: `content` is a `SubString` of `s` holding the text between
+# the quotes and `next` is the index just past the closing quote. An unterminated literal
+# is tolerated: `content` then runs to the end of `s`.
+#
+# Raises an error when the next non-whitespace character is not a quote, including when
+# the input ends before one is found.
+function _dtd_read_quoted(s, pos)
+    pos = _dtd_skip_ws(s, pos)
+    n = ncodeunits(s)
+    # Check the index first so truncated input reports the parse error, not a BoundsError.
+    (pos <= n && (s[pos] == '"' || s[pos] == '\'')) ||
+        error("Expected quoted string at position $pos in DTD")
+    q = s[pos]
+    pos += 1  # the quote is a single code unit, so +1 lands on the next character
+    start = pos
+    while pos <= n && s[pos] != q
+        # `nextind` steps over whole characters; `pos += 1` would land inside a multi-byte
+        # UTF-8 character and make `s[pos]` throw a StringIndexError.
+        pos = nextind(s, pos)
+    end
+    # `prevind` gives the start of the last content character, which is the end index
+    # `SubString` requires (`pos - 1` may be a continuation byte).
+    val = SubString(s, start, prevind(s, pos))
+    pos += 1
+    return val, pos
+end
+
+# Read a balanced parenthesised group starting at `pos`, after skipping leading whitespace.
+#
+# Returns `(group, next)`: `group` is a `SubString` of `s` that starts at the opening `(`
+# and `next` is the index reached once the nesting depth returns to zero (or the input is
+# exhausted). Parentheses inside quoted runs do not affect the depth.
+# Raises an error when the character at the whitespace-skipped position is not `(`.
+#
+# NOTE(review): indices advance one code unit at a time, so a multi-byte character inside
+# the group would leave `pos` on a non-character index — confirm inputs are ASCII here or
+# switch to `nextind`.
+function _dtd_read_parens(s, pos)
+    pos = _dtd_skip_ws(s, pos)
+    s[pos] == '(' || error("Expected '(' at position $pos in DTD")
+    depth = 1
+    start = pos
+    pos += 1
+    while pos <= ncodeunits(s) && depth > 0
+        c = s[pos]
+        if c == '('
+            depth += 1
+        elseif c == ')'
+            depth -= 1
+        elseif c == '"' || c == '\''
+            # Skip to the matching quote so parentheses inside the literal are ignored.
+            pos += 1
+            while pos <= ncodeunits(s) && s[pos] != c
+                pos += 1
             end
         end
-        pop!(ctx)
+        pos += 1
+    end
+    SubString(s, start, pos - 1), pos
+end
 
-    elseif nodetype === DTD
-        print(io, "<!DOCTYPE ", value, '>')
+# Scan forward from `pos` to the `>` that closes the current declaration and return the
+# index just past it. A `>` inside a quoted run (`"..."` or `'...'`) is not treated as the
+# closing character. When no closing `>` exists, the returned index is beyond
+# `ncodeunits(s)`.
+#
+# `pos` must be a valid character index of `s` (or lie past its end).
+function _dtd_skip_to_close(s, pos)
+    n = ncodeunits(s)
+    while pos <= n
+        c = s[pos]
+        c == '>' && return pos + 1
+        # Step by whole characters: `pos += 1` would land inside a multi-byte UTF-8
+        # character and make the next `s[pos]` throw a StringIndexError.
+        pos = nextind(s, pos)
+        if c == '"' || c == '\''
+            while pos <= n && s[pos] != c
+                pos = nextind(s, pos)
+            end
+            pos += 1  # step over the closing quote (or past the end if unterminated)
+        end
+    end
+    return pos
+end
 
-    elseif nodetype === Declaration
-        print(io, "<?xml")
-        _print_attrs(io, x)
-        print(io, "?>")
+# Parse the remainder of an `<!ELEMENT` declaration; `pos` points just past the keyword.
+#
+# Returns `(decl, next)`: `decl` is an `ElementDecl` built from the element name and its
+# content specification, and `next` is the index returned by `_dtd_skip_to_close`.
+function _dtd_parse_element(s, pos)
+    name, pos = _dtd_read_name(s, pos)
+    pos = _dtd_skip_ws(s, pos)
+    if s[pos] != '('
+        # Content given as a bare name rather than a parenthesised model.
+        content, pos = _dtd_read_name(s, pos)
+    else
+        content, pos = _dtd_read_parens(s, pos)
+        # A trailing `*`, `+` or `?` directly after the group is kept as part of the content.
+        has_suffix = pos <= ncodeunits(s) && s[pos] in ('*', '+', '?')
+        if has_suffix
+            content = string(content, s[pos])
+            pos += 1
+        end
+    end
+    pos = _dtd_skip_to_close(s, pos)
+    return ElementDecl(String(name), String(content)), pos
+end
 
-    elseif nodetype === ProcessingInstruction
-        print(io, "<?", tag)
-        _print_attrs(io, x)
-        print(io, "?>")
+# Parse the remainder of an `<!ATTLIST` declaration; `pos` points just past the keyword.
+#
+# Returns `(atts, next)`: `atts` is a `Vector{AttDecl}` with one entry per attribute
+# definition found before the closing `>`, and `next` is the index just past that `>`
+# (or past the end of `s` if it is missing).
+# Raises an error when an attribute definition has no recognisable default declaration.
+function _dtd_parse_attlist(s, pos)
+    element, pos = _dtd_read_name(s, pos)
+    atts = AttDecl[]
+    while true
+        pos = _dtd_skip_ws(s, pos)
+        (pos > ncodeunits(s) || s[pos] == '>') && break  # end of declaration or of input
+
+        name, pos = _dtd_read_name(s, pos)
+        pos = _dtd_skip_ws(s, pos)
+
+        # Attribute type: a parenthesised group, or a name; the name `NOTATION` is
+        # followed by a parenthesised group that is appended to the type.
+        if s[pos] == '('
+            atype, pos = _dtd_read_parens(s, pos)
+        else
+            atype, pos = _dtd_read_name(s, pos)
+            if atype == "NOTATION"
+                pos = _dtd_skip_ws(s, pos)
+                parens, pos = _dtd_read_parens(s, pos)
+                atype = string("NOTATION ", parens)
+            end
+        end
+        pos = _dtd_skip_ws(s, pos)
 
-    elseif nodetype === Comment
-        print(io, "<!--", value, "-->")
+        # Default declaration: `#KEYWORD`, `#FIXED "value"`, or a quoted default value.
+        # Quoted values are always stored wrapped in double quotes.
+        if s[pos] == '#'
+            pos += 1
+            keyword, pos = _dtd_read_name(s, pos)
+            if keyword == "FIXED"
+                pos = _dtd_skip_ws(s, pos)
+                val, pos = _dtd_read_quoted(s, pos)
+                default = string("#FIXED \"", val, "\"")
+            else
+                default = string("#", keyword)
+            end
+        elseif s[pos] == '"' || s[pos] == '\''
+            val, pos = _dtd_read_quoted(s, pos)
+            default = string("\"", val, "\"")
+        else
+            error("Expected default declaration at position $pos in DTD")
+        end
+        push!(atts, AttDecl(String(element), String(name), String(atype), default))
+    end
+    pos <= ncodeunits(s) && s[pos] == '>' && (pos += 1)  # consume the closing `>`
+    atts, pos
+end
 
-    elseif nodetype === CData
-        print(io, "<![CData[", value, "]]>")
+# Parse the remainder of an `<!ENTITY` declaration; `pos` points just past the keyword.
+#
+# Returns `(decl, next)`: `decl` is an `EntityDecl` built from the entity name, its quoted
+# value (or `nothing`), its external ID string (or `nothing`), and whether a leading `%`
+# marked it as a parameter entity; `next` is the index returned by `_dtd_skip_to_close`.
+# Raises an error when the name is followed by neither a quoted value nor `SYSTEM`/`PUBLIC`.
+function _dtd_parse_entity(s, pos)
+    pos = _dtd_skip_ws(s, pos)
+    parameter = false
+    if pos <= ncodeunits(s) && s[pos] == '%'
+        parameter = true
+        pos += 1
+    end
+    name, pos = _dtd_read_name(s, pos)
+    pos = _dtd_skip_ws(s, pos)
 
-    elseif nodetype === Document
-        foreach(children) do child
-            write(io, child, ctx; indentsize)
-            !ctx[end] && println(io)
+    # Exactly one of `value` / `external_id` is filled in below.
+    value = nothing
+    external_id = nothing
+    if s[pos] == '"' || s[pos] == '\''
+        v, pos = _dtd_read_quoted(s, pos)
+        value = String(v)
+    else
+        keyword, pos = _dtd_read_name(s, pos)
+        pos = _dtd_skip_ws(s, pos)
+        # The external ID is rebuilt with double quotes regardless of the quotes used in `s`.
+        if keyword == "SYSTEM"
+            uri, pos = _dtd_read_quoted(s, pos)
+            external_id = string("SYSTEM \"", uri, "\"")
+        elseif keyword == "PUBLIC"
+            pubid, pos = _dtd_read_quoted(s, pos)
+            pos = _dtd_skip_ws(s, pos)
+            uri, pos = _dtd_read_quoted(s, pos)
+            external_id = string("PUBLIC \"", pubid, "\" \"", uri, "\"")
+        else
+            error("Expected SYSTEM, PUBLIC, or quoted value in ENTITY declaration")
         end
+    end
+    # Anything left before the closing `>` is skipped without being recorded.
+    pos = _dtd_skip_to_close(s, pos)
+    EntityDecl(String(name), value, external_id, parameter), pos
+end
 
+# Parse the remainder of an `<!NOTATION` declaration; `pos` points just past the keyword.
+#
+# Returns `(decl, next)`: `decl` is a `NotationDecl` built from the notation name and its
+# external ID string; `next` is the index returned by `_dtd_skip_to_close`.
+# Raises an error when the keyword after the name is neither `SYSTEM` nor `PUBLIC`.
+function _dtd_parse_notation(s, pos)
+    name, pos = _dtd_read_name(s, pos)
+    pos = _dtd_skip_ws(s, pos)
+    keyword, pos = _dtd_read_name(s, pos)
+    pos = _dtd_skip_ws(s, pos)
+    # The external ID is rebuilt with double quotes regardless of the quotes used in `s`.
+    if keyword == "SYSTEM"
+        uri, pos = _dtd_read_quoted(s, pos)
+        external_id = string("SYSTEM \"", uri, "\"")
+    elseif keyword == "PUBLIC"
+        pubid, pos = _dtd_read_quoted(s, pos)
+        pos = _dtd_skip_ws(s, pos)
+        # The second quoted literal after the public ID is optional here.
+        if pos <= ncodeunits(s) && (s[pos] == '"' || s[pos] == '\'')
+            uri, pos = _dtd_read_quoted(s, pos)
+            external_id = string("PUBLIC \"", pubid, "\" \"", uri, "\"")
+        else
+            external_id = string("PUBLIC \"", pubid, "\"")
+        end
     else
-        error("Unreachable case reached during XML.write")
+        error("Expected SYSTEM or PUBLIC in NOTATION declaration")
+    end
+    pos = _dtd_skip_to_close(s, pos)
+    NotationDecl(String(name), external_id), pos
+end
+
+"""
+    parse_dtd(value::AbstractString) -> ParsedDTD
+    parse_dtd(node::Node) -> ParsedDTD
+
+Parse a DTD value string (from a `DTD` node) into structured declarations.
+
+`value` must start with the root element name. It may be followed by a `SYSTEM "uri"`
+or `PUBLIC "pubid" ["uri"]` external ID and by a `[...]` internal subset. Inside the
+internal subset, `<!ELEMENT`, `<!ATTLIST`, `<!ENTITY` and `<!NOTATION` declarations are
+collected; comments, processing instructions and parameter-entity references (`%name;`)
+are skipped, and any other character is ignored.
+
+Throws an error when the root name is missing, when a comment, processing instruction or
+parameter-entity reference is unterminated, or when a declaration cannot be read.
+"""
+function parse_dtd(value::AbstractString)
+    s = String(value)
+    pos = 1
+
+    root, pos = _dtd_read_name(s, pos)
+    pos = _dtd_skip_ws(s, pos)
+
+    # External ID
+    system_id = nothing
+    public_id = nothing
+    if pos <= ncodeunits(s) && _dtd_is_name_char(s[pos])
+        # Read the keyword into `kpos` and only commit it to `pos` when it is recognised.
+        keyword, kpos = _dtd_read_name(s, pos)
+        if keyword == "SYSTEM"
+            pos = kpos
+            uri, pos = _dtd_read_quoted(s, pos)
+            system_id = String(uri)
+        elseif keyword == "PUBLIC"
+            pos = kpos
+            pubid, pos = _dtd_read_quoted(s, pos)
+            public_id = String(pubid)
+            pos = _dtd_skip_ws(s, pos)
+            # The system literal after a public ID is optional here.
+            if pos <= ncodeunits(s) && (s[pos] == '"' || s[pos] == '\'')
+                uri, pos = _dtd_read_quoted(s, pos)
+                system_id = String(uri)
+            end
+        end
+    end
+
+    elements = ElementDecl[]
+    attributes = AttDecl[]
+    entities = EntityDecl[]
+    notations = NotationDecl[]
+
+    # Internal subset
+    pos = _dtd_skip_ws(s, pos)
+    if pos <= ncodeunits(s) && s[pos] == '['
+        pos += 1
+        while pos <= ncodeunits(s)
+            pos = _dtd_skip_ws(s, pos)
+            pos > ncodeunits(s) && break
+            s[pos] == ']' && break  # end of the internal subset
+
+            # Dispatch on the markup that starts at `pos`; the offsets added to `pos`
+            # below are the lengths of the matched prefixes.
+            rest = SubString(s, pos)
+            if startswith(rest, "<!--")
+                i = findnext("-->", s, pos + 4)
+                isnothing(i) && error("Unterminated comment in DTD")
+                pos = last(i) + 1
+            elseif startswith(rest, "<?")
+                i = findnext("?>", s, pos + 2)
+                isnothing(i) && error("Unterminated PI in DTD")
+                pos = last(i) + 1
+            elseif startswith(rest, "<!ELEMENT")
+                elem, pos = _dtd_parse_element(s, pos + 9)
+                push!(elements, elem)
+            elseif startswith(rest, "<!ATTLIST")
+                atts, pos = _dtd_parse_attlist(s, pos + 9)
+                append!(attributes, atts)
+            elseif startswith(rest, "<!ENTITY")
+                ent, pos = _dtd_parse_entity(s, pos + 8)
+                push!(entities, ent)
+            elseif startswith(rest, "<!NOTATION")
+                not, pos = _dtd_parse_notation(s, pos + 10)
+                push!(notations, not)
+            elseif s[pos] == '%'
+                # Parameter-entity reference: skipped, not expanded.
+                i = findnext(';', s, pos + 1)
+                isnothing(i) && error("Unterminated parameter entity reference in DTD")
+                pos = i + 1
+            else
+                # Unrecognised character: skip one code unit and keep scanning.
+                pos += 1
+            end
+        end
     end
 
+    ParsedDTD(String(root), system_id, public_id, elements, attributes, entities, notations)
+end
+
+# Parse the value of a `DTD` node; any other node type is rejected with an error.
+function parse_dtd(node::Node)
+    if node.nodetype !== DTD
+        error("parse_dtd requires a DTD node.")
+    end
+    return parse_dtd(node.value)
+end
+
+# #-----------------------------------------------------------------------------# includes
+# include("raw.jl")
+# include("dtd.jl")
+
+# abstract type AbstractXMLNode end
+
+# #-----------------------------------------------------------------------------# LazyNode
+# """
+#     LazyNode(file::AbstractString)
+#     LazyNode(data::XML.Raw)
+
+# A Lazy representation of an XML node.
+# """
+# mutable struct LazyNode <: AbstractXMLNode
+#     raw::Raw
+#     tag::Union{Nothing, String}
+#     attributes::Union{Nothing, OrderedDict{String, String}}
+#     value::Union{Nothing, String}
+# end
+# LazyNode(raw::Raw) = LazyNode(raw, nothing, nothing, nothing)
+
+# function Base.getproperty(o::LazyNode, x::Symbol)
+#     x === :raw && return getfield(o, :raw)
+#     x === :nodetype && return nodetype(o.raw)
+#     x === :tag && return isnothing(getfield(o, x)) ? setfield!(o, x, tag(o.raw)) : getfield(o, x)
+#     x === :attributes && return isnothing(getfield(o, x)) ? setfield!(o, x, attributes(o.raw)) : getfield(o, x)
+#     x === :value && return isnothing(getfield(o, x)) ? setfield!(o, x, value(o.raw)) : getfield(o, x)
+#     x === :depth && return depth(o.raw)
+#     x === :children && return LazyNode.(children(o.raw))
+#     error("type LazyNode has no field $(x)")
+# end
+# Base.propertynames(o::LazyNode) = (:raw, :nodetype, :tag, :attributes, :value, :depth, :children)
+
+# Base.show(io::IO, o::LazyNode) = _show_node(io, o)
+
+# Base.read(io::IO, ::Type{LazyNode}) = LazyNode(read(io, Raw))
+# Base.read(filename::AbstractString, ::Type{LazyNode}) = LazyNode(read(filename, Raw))
+# Base.parse(x::AbstractString, ::Type{LazyNode}) = LazyNode(parse(x, Raw))
+
+# children(o::LazyNode) = LazyNode.(children(o.raw))
+# parent(o::LazyNode) = LazyNode(parent(o.raw))
+# depth(o::LazyNode) = depth(o.raw)
+
+# Base.IteratorSize(::Type{LazyNode}) = Base.SizeUnknown()
+# Base.eltype(::Type{LazyNode}) = LazyNode
+
+# function Base.iterate(o::LazyNode, state=o)
+#     n = next(state)
+#     return isnothing(n) ? nothing : (n, n)
+# end
+
+# function next(o::LazyNode)
+#     n = next(o.raw)
+#     isnothing(n) && return nothing
+#     n.type === RawElementClose ? next(LazyNode(n)) : LazyNode(n)
+# end
+# function prev(o::LazyNode)
+#     n = prev(o.raw)
+#     isnothing(n) && return nothing
+#     n.type === RawElementClose ? prev(LazyNode(n)) : LazyNode(n)
+# end
+
+# #-----------------------------------------------------------------------------# Node
+# """
+#     Node(nodetype, tag, attributes, value, children)
+#     Node(node::Node; kw...)  # copy node with keyword overrides
+#     Node(node::LazyNode)  # un-lazy the LazyNode
+
+# A representation of an XML DOM node.  For simpler construction, use `(::NodeType)(args...)`
+# """
+# struct Node <: AbstractXMLNode
+#     nodetype::NodeType
+#     tag::Union{Nothing, String}
+#     attributes::Union{Nothing, OrderedDict{String, String}}
+#     value::Union{Nothing, String}
+#     children::Union{Nothing, Vector{Node}}
+
+#     function Node(nodetype::NodeType, tag=nothing, attributes=nothing, value=nothing, children=nothing)
+#         new(nodetype,
+#             isnothing(tag) ? nothing : string(tag),
+#             isnothing(attributes) ? nothing : OrderedDict(string(k) => string(v) for (k, v) in pairs(attributes)),
+#             isnothing(value) ? nothing : string(value),
+#             isnothing(children) ? nothing :
+#                 children isa Node ? [children] :
+#                 children isa Vector{Node} ? children :
+#                 children isa Vector ? map(Node, children) :
+#                 children isa Tuple ? map(Node, collect(children)) :
+#                 [Node(children)]
+#         )
+#     end
+# end
+
+# function Node(o::Node, x...; kw...)
+#     attrs = !isnothing(kw) ?
+#         merge(
+#             OrderedDict(string(k) => string(v) for (k, v) in pairs(kw)),
+#             isnothing(o.attributes) ? OrderedDict{String,String}() : o.attributes
+#         ) :
+#         o.attributes
+#     children = isempty(x) ? o.children : vcat(isnothing(o.children) ? [] : o.children, collect(x))
+#     Node(o.nodetype, o.tag, attrs, o.value, children)
+# end
+
+# function Node(node::LazyNode)
+#     nodetype = node.nodetype
+#     tag = node.tag
+#     attributes = node.attributes
+#     value = node.value
+#     c = XML.children(node)
+#     Node(nodetype, tag, attributes, value, isempty(c) ? nothing : map(Node, c))
+# end
+
+# Node(data::Raw) = Node(LazyNode(data))
+
+# # Anything that's not Vector{UInt8} or a (Lazy)Node is converted to a Text Node
+# Node(x) = Node(Text, nothing, nothing, string(x), nothing)
+
+# h(tag::Union{Symbol, String}, children...; kw...) = Node(Element, tag, kw, nothing, children)
+# Base.getproperty(::typeof(h), tag::Symbol) = h(tag)
+# (o::Node)(children...; kw...) = Node(o, Node.(children)...; kw...)
+
+# # NOT in-place for Text Nodes
+# function escape!(o::Node, warn::Bool=true)
+#     if o.nodetype == Text
+#         warn && @warn "escape!() called on a Text Node creates a new node."
+#         return Text(escape(o.value))
+#     end
+#     isnothing(o.children) && return o
+#     map!(x -> escape!(x, false), o.children, o.children)
+#     o
+# end
+# function unescape!(o::Node, warn::Bool=true)
+#     if o.nodetype == Text
+#         warn && @warn "unescape!() called on a Text Node creates a new node."
+#         return Text(unescape(o.value))
+#     end
+#     isnothing(o.children) && return o
+#     map!(x -> unescape!(x, false), o.children, o.children)
+#     o
+# end
+
+
+# Base.read(filename::AbstractString, ::Type{Node}) = Node(read(filename, Raw))
+# Base.read(io::IO, ::Type{Node}) = Node(read(io, Raw))
+# Base.parse(x::AbstractString, ::Type{Node}) = Node(parse(x, Raw))
+
+# Base.setindex!(o::Node, val, i::Integer) = o.children[i] = Node(val)
+# Base.push!(a::Node, b::Node) = push!(a.children, b)
+# Base.pushfirst!(a::Node, b::Node) = pushfirst!(a.children, b)
+
+# Base.setindex!(o::Node, val, key::AbstractString) = (o.attributes[key] = string(val))
+# Base.getindex(o::Node, val::AbstractString) = o.attributes[val]
+# Base.haskey(o::Node, key::AbstractString) = isnothing(o.attributes) ? false : haskey(o.attributes, key)
+# Base.keys(o::Node) = isnothing(o.attributes) ? () : keys(o.attributes)
+
+# Base.show(io::IO, o::Node) = _show_node(io, o)
+
+# #-----------------------------------------------------------------------------# Node Constructors
+# function (T::NodeType)(args...; attr...)
+#     if T === Document
+#         !isempty(attr) && error("Document nodes do not have attributes.")
+#         Node(T, nothing, nothing, nothing, args)
+#     elseif T === DTD
+#         !isempty(attr) && error("DTD nodes only accept a value.")
+#         length(args) > 1 && error("DTD nodes only accept a value.")
+#         Node(T, nothing, nothing, only(args))
+#     elseif T === Declaration
+#         !isempty(args) && error("Declaration nodes only accept attributes")
+#         Node(T, nothing, attr)
+#     elseif T === ProcessingInstruction
+#         length(args) == 1 || error("ProcessingInstruction nodes require a tag and attributes.")
+#         Node(T, only(args), attr)
+#     elseif T === Comment
+#         !isempty(attr) && error("Comment nodes do not have attributes.")
+#         length(args) > 1 && error("Comment nodes only accept a single input.")
+#         Node(T, nothing, nothing, only(args))
+#     elseif T === CData
+#         !isempty(attr) && error("CData nodes do not have attributes.")
+#         length(args) > 1 && error("CData nodes only accept a single input.")
+#         Node(T, nothing, nothing, only(args))
+#     elseif T === Text
+#         !isempty(attr) && error("Text nodes do not have attributes.")
+#         length(args) > 1 && error("Text nodes only accept a single input.")
+#         Node(T, nothing, nothing, only(args))
+#     elseif T === Element
+#         tag = first(args)
+#         Node(T, tag, attr, nothing, args[2:end])
+#     else
+#         error("Unreachable reached while trying to create a Node via (::NodeType)(args...; kw...).")
+#     end
+# end
+
+# #-----------------------------------------------------------------------------# !!! common !!!
+# # Everything below here is common to all data structures
+
+
+# #-----------------------------------------------------------------------------# interface fallbacks
+# nodetype(o) = o.nodetype
+# tag(o) = o.tag
+# attributes(o) = o.attributes
+# value(o) = o.value
+# children(o::T) where {T} = isnothing(o.children) ? () : o.children
+
+# depth(o) = missing
+# parent(o) = missing
+# next(o) = missing
+# prev(o) = missing
+
+# is_simple(o) = nodetype(o) == Element && (isnothing(attributes(o)) || isempty(attributes(o))) &&
+#     length(children(o)) == 1 && nodetype(only(o)) in (Text, CData)
+
+# simple_value(o) = is_simple(o) ? value(only(o)) : error("`XML.simple_value` is only defined for simple nodes.")
+
+# Base.@deprecate_binding simplevalue simple_value
+
+# #-----------------------------------------------------------------------------# nodes_equal
+# function nodes_equal(a, b)
+#     out = XML.tag(a) == XML.tag(b)
+#     out &= XML.nodetype(a) == XML.nodetype(b)
+#     out &= XML.attributes(a) == XML.attributes(b)
+#     out &= XML.value(a) == XML.value(b)
+#     out &= length(XML.children(a)) == length(XML.children(b))
+#     out &= all(nodes_equal(ai, bi) for (ai,bi) in zip(XML.children(a), XML.children(b)))
+#     return out
+# end
+
+# Base.:(==)(a::AbstractXMLNode, b::AbstractXMLNode) = nodes_equal(a, b)
+
+# #-----------------------------------------------------------------------------# parse
+# Base.parse(::Type{T}, str::AbstractString) where {T <: AbstractXMLNode} = parse(str, T)
+
+# #-----------------------------------------------------------------------------# indexing
+# Base.getindex(o::Union{Raw, AbstractXMLNode}) = o
+# Base.getindex(o::Union{Raw, AbstractXMLNode}, i::Integer) = children(o)[i]
+# Base.getindex(o::Union{Raw, AbstractXMLNode}, ::Colon) = children(o)
+# Base.lastindex(o::Union{Raw, AbstractXMLNode}) = lastindex(children(o))
+
+# Base.only(o::Union{Raw, AbstractXMLNode}) = only(children(o))
+
+# Base.length(o::AbstractXMLNode) = length(children(o))
+
+# #-----------------------------------------------------------------------------# printing
+# function _show_node(io::IO, o)
+#     printstyled(io, typeof(o), ' '; color=:light_black)
+#     !ismissing(depth(o)) && printstyled(io, "(depth=", depth(o), ") ", color=:light_black)
+#     printstyled(io, nodetype(o), ; color=:light_green)
+#     if o.nodetype === Text
+#         printstyled(io, ' ', repr(value(o)))
+#     elseif o.nodetype === Element
+#         printstyled(io, " <", tag(o), color=:light_cyan)
+#         _print_attrs(io, o; color=:light_yellow)
+#         printstyled(io, '>', color=:light_cyan)
+#         _print_n_children(io, o)
+#     elseif o.nodetype === DTD
+#         printstyled(io, " <!DOCTYPE "; color=:light_cyan)
+#         printstyled(io, value(o), color=:light_black)
+#         printstyled(io, '>', color=:light_cyan)
+#     elseif o.nodetype === Declaration
+#         printstyled(io, " <?xml", color=:light_cyan)
+#         _print_attrs(io, o; color=:light_yellow)
+#         printstyled(io, "?>", color=:light_cyan)
+#     elseif o.nodetype === ProcessingInstruction
+#         printstyled(io, " <?", tag(o), color=:light_cyan)
+#         _print_attrs(io, o; color=:light_yellow)
+#         printstyled(io, "?>", color=:light_cyan)
+#     elseif o.nodetype === Comment
+#         printstyled(io, " <!--", color=:light_cyan)
+#         printstyled(io, value(o), color=:light_black)
+#         printstyled(io, "-->", color=:light_cyan)
+#     elseif o.nodetype === CData
+#         printstyled(io, " <![CData[", color=:light_cyan)
+#         printstyled(io, value(o), color=:light_black)
+#         printstyled(io, "]]>", color=:light_cyan)
+#     elseif o.nodetype === Document
+#         _print_n_children(io, o)
+#     elseif o.nodetype === UNKNOWN
+#         printstyled(io, "Unknown", color=:light_cyan)
+#         _print_n_children(io, o)
+#     else
+#         error("Unreachable reached")
+#     end
+# end
+
+# function _print_attrs(io::IO, o; color=:normal)
+#     attr = attributes(o)
+#     isnothing(attr) && return nothing
+#     for (k,v) in attr
+#         # printstyled(io, ' ', k, '=', '"', v, '"'; color)
+#         print(io, ' ', k, '=', '"', v, '"')
+#     end
+# end
+# function _print_n_children(io::IO, o::Node)
+#     n = length(children(o))
+#     text = n == 0 ? "" : n == 1 ? " (1 child)" : " ($n children)"
+#     printstyled(io, text, color=:light_black)
+# end
+# _print_n_children(io::IO, o) = nothing
+
+# #-----------------------------------------------------------------------------# write_xml
+# write(x; kw...) = (io = IOBuffer(); write(io, x; kw...); String(take!(io)))
+
+# write(filename::AbstractString, x; kw...) = open(io -> write(io, x; kw...), filename, "w")
+
+# function write(io::IO, x, ctx::Vector{Bool}=[false]; indentsize::Int=2, depth::Int=1)
+#     indent = ' ' ^ indentsize
+#     nodetype = XML.nodetype(x)
+#     tag = XML.tag(x)
+#     value = XML.value(x)
+#     children = XML.children(x)
+
+#     padding = indent ^ max(0, depth - 1)
+#     !ctx[end] && print(io, padding)
+
+#     if nodetype === Text
+#         print(io, value)
+
+#     elseif nodetype === Element
+#         push!(ctx, ctx[end])
+#         update_ctx!(ctx, x)
+#         print(io, '<', tag)
+#         _print_attrs(io, x)
+#         print(io, isempty(children) ? '/' : "", '>')
+#         if !isempty(children)
+#             if length(children) == 1 && XML.nodetype(only(children)) === Text
+#                 write(io, only(children), ctx; indentsize=0)
+#                 print(io, "</", tag, '>')
+#             else
+#                 !ctx[end] && println(io)
+#                 foreach(children) do child
+#                     write(io, child, ctx; indentsize, depth=depth + 1)
+#                     !ctx[end] && println(io)
+#                 end
+#                 print(io, !ctx[end] ? padding : "", "</", tag, '>')
+#             end
+#         end
+#         pop!(ctx)
+
+#     elseif nodetype === DTD
+#         print(io, "<!DOCTYPE ", value, '>')
+
+#     elseif nodetype === Declaration
+#         print(io, "<?xml")
+#         _print_attrs(io, x)
+#         print(io, "?>")
+
+#     elseif nodetype === ProcessingInstruction
+#         print(io, "<?", tag)
+#         _print_attrs(io, x)
+#         print(io, "?>")
+
+#     elseif nodetype === Comment
+#         print(io, "<!--", value, "-->")
+
+#     elseif nodetype === CData
+#         print(io, "<![CData[", value, "]]>")
+
+#     elseif nodetype === Document
+#         foreach(children) do child
+#             write(io, child, ctx; indentsize)
+#             !ctx[end] && println(io)
+#         end
+
+#     else
+#         error("Unreachable case reached during XML.write")
+#     end
+
+# end
+
+#-----------------------------------------------------------------------------# deprecations
+# Keep the old names `simplevalue` and `LazyNode` resolvable as deprecated aliases of
+# `simple_value` and `Node`.
+# NOTE(review): the trailing `false` is presumably the macro's `export_old` argument
+# (old name not exported) — confirm against `Base.@deprecate_binding`.
+Base.@deprecate_binding simplevalue simple_value false
+Base.@deprecate_binding LazyNode Node false
+
+# Removed types — informative errors
+# Placeholder for the removed `Raw` type: it cannot be instantiated, and every constructor
+# call raises an error that points to the replacement API.
+struct Raw
+    function Raw(args...; kw...)
+        throw(ErrorException("""
+            `XML.Raw` has been removed in XML.jl v0.4.
+            Use `parse(str, Node)` or `read(filename, Node)` instead.
+            The streaming Raw/LazyNode API has been replaced by a token-based parser.
+            See `?XML.Node` for the new API."""))
+    end
+end
+
+# Placeholder for the removed `AbstractXMLNode` type: it cannot be instantiated, and every
+# constructor call raises an error that explains the change.
+struct AbstractXMLNode
+    function AbstractXMLNode(args...; kw...)
+        throw(ErrorException("""
+            `XML.AbstractXMLNode` has been removed in XML.jl v0.4.
+            `Node` is no longer a subtype of an abstract type.
+            Dispatch on `Node` directly instead."""))
+    end
+end
+
+# Removed functions — informative errors
+# Shared migration hint appended to the error text of every removed LazyNode-era function.
+const _REMOVED_LAZYNODE_MSG = """
+    This function was part of the LazyNode API, which has been removed in XML.jl v0.4.
+    Use `parse(str, Node)` to get a full DOM tree and navigate with `children`, `tag`,
+    `attributes`, `value`, and integer indexing (e.g. `node[1]`)."""
+
+# Generate `next(::Node)` and `prev(::Node)` stubs that emit a deprecation warning and
+# then raise an error carrying the same text.
+for removed in (:next, :prev)
+    notice = "`XML.$removed` has been removed. $_REMOVED_LAZYNODE_MSG"
+    @eval function $removed(o::Node)
+        Base.depwarn($notice, $(QuoteNode(removed)))
+        throw(ErrorException($notice))
+    end
+end
+
+# 1-arg parent/depth were part of LazyNode API; 2-arg versions are defined above
+const _PARENT_1ARG_MSG = string(
+    "`XML.parent(node)` (single-argument) has been removed. ",
+    _REMOVED_LAZYNODE_MSG,
+    "\n    Use `parent(child, root)` instead to search from a known root node.",
+)
+
+# Single-argument `parent` stub: warns about the deprecation, then always raises.
+function Base.parent(o::Node)
+    Base.depwarn(_PARENT_1ARG_MSG, :parent)
+    throw(ErrorException(_PARENT_1ARG_MSG))
+end
+
+const _DEPTH_1ARG_MSG = string(
+    "`XML.depth(node)` (single-argument) has been removed. ",
+    _REMOVED_LAZYNODE_MSG,
+    "\n    Use `depth(child, root)` instead to search from a known root node.",
+)
+
+# Single-argument `depth` stub: warns about the deprecation, then always raises.
+function depth(o::Node)
+    Base.depwarn(_DEPTH_1ARG_MSG, :depth)
+    throw(ErrorException(_DEPTH_1ARG_MSG))
+end
+
+# Removed-function stub: warns about the deprecation, then always raises an error that
+# points callers to `==`.
+function nodes_equal(a, b)
+    notice = """`XML.nodes_equal` has been removed in XML.jl v0.4. Use `==` instead:
+        a == b"""
+    Base.depwarn(notice, :nodes_equal)
+    throw(ErrorException(notice))
+end
+
+# Removed-function stub: warns about the deprecation, then always raises. Both arguments
+# are accepted only so that old call sites reach this error.
+function escape!(o::Node, warn::Bool=true)
+    notice = """`XML.escape!` has been removed in XML.jl v0.4.
+        Text is now escaped automatically during `XML.write`."""
+    Base.depwarn(notice, :escape!)
+    throw(ErrorException(notice))
+end
+
+# Removed-function stub: warns about the deprecation, then always raises. Both arguments
+# are accepted only so that old call sites reach this error.
+function unescape!(o::Node, warn::Bool=true)
+    notice = """`XML.unescape!` has been removed in XML.jl v0.4.
+        Text is now unescaped automatically during `parse`."""
+    Base.depwarn(notice, :unescape!)
+    throw(ErrorException(notice))
 end
 
 end # module XML
diff --git a/src/tokenizer.jl b/src/tokenizer.jl
new file mode 100644
index 0000000..355036d
--- /dev/null
+++ b/src/tokenizer.jl
@@ -0,0 +1,480 @@
+"""
+    XMLTokenizer
+
+A self-contained module for tokenizing XML documents into a fine-grained stream of tokens.
+
+# Usage
+
+```julia
+using .XMLTokenizer: tokenize, tag_name, attr_value, pi_target
+
+for token in tokenize(\"\"\"<?xml version="1.0"?><root attr="val">text<!-- comment --></root>\"\"\")
+    println(token)
+end
+```
+"""
+module XMLTokenizer
+
+export tokenize, tag_name, attr_value, pi_target, TokenKind, Token,
+    TOKEN_TEXT,
+    TOKEN_OPEN_TAG, TOKEN_CLOSE_TAG, TOKEN_TAG_CLOSE, TOKEN_SELF_CLOSE,
+    TOKEN_ATTR_NAME, TOKEN_ATTR_VALUE,
+    TOKEN_CDATA_OPEN, TOKEN_CDATA_CONTENT, TOKEN_CDATA_CLOSE,
+    TOKEN_COMMENT_OPEN, TOKEN_COMMENT_CONTENT, TOKEN_COMMENT_CLOSE,
+    TOKEN_PI_OPEN, TOKEN_PI_CONTENT, TOKEN_PI_CLOSE,
+    TOKEN_XML_DECL_OPEN, TOKEN_XML_DECL_CLOSE,
+    TOKEN_DOCTYPE_OPEN, TOKEN_DOCTYPE_CONTENT, TOKEN_DOCTYPE_CLOSE
+
+#-----------------------------------------------------------------------# TokenKind
+@enum TokenKind::UInt8 begin  # kind tag carried by every `Token`
+    # Character data
+    TOKEN_TEXT               # text content between markup
+
+    # Element tags
+    TOKEN_OPEN_TAG           # <name
+    TOKEN_CLOSE_TAG          # </name
+    TOKEN_TAG_CLOSE          # > (ends an open tag or a close tag)
+    TOKEN_SELF_CLOSE         # />
+    TOKEN_ATTR_NAME          # attribute name (the '=' after it belongs to no token)
+    TOKEN_ATTR_VALUE         # "value" or 'value' (with quotes in raw)
+
+    # CDATA sections
+    TOKEN_CDATA_OPEN         # <![CDATA[
+    TOKEN_CDATA_CONTENT      # raw text content
+    TOKEN_CDATA_CLOSE        # ]]>
+
+    # Comments
+    TOKEN_COMMENT_OPEN       # <!--
+    TOKEN_COMMENT_CONTENT    # comment text
+    TOKEN_COMMENT_CLOSE      # -->
+
+    # Processing instructions
+    TOKEN_PI_OPEN            # <?target (includes target name)
+    TOKEN_PI_CONTENT         # PI body text (everything between the target and ?>)
+    TOKEN_PI_CLOSE           # ?>
+
+    # XML declaration (<?xml ...?>)
+    TOKEN_XML_DECL_OPEN      # <?xml
+    TOKEN_XML_DECL_CLOSE     # ?>
+    # (reuses TOKEN_ATTR_NAME / TOKEN_ATTR_VALUE for pseudo-attributes)
+
+    # DOCTYPE
+    TOKEN_DOCTYPE_OPEN       # <!DOCTYPE (or other <! declarations)
+    TOKEN_DOCTYPE_CONTENT    # declaration body
+    TOKEN_DOCTYPE_CLOSE      # >
+end
+
+#-----------------------------------------------------------------------# Token
+struct Token
+    kind::TokenKind           # which construct this token represents
+    raw::SubString{String}    # slice of the input, delimiters and quotes included
+end
+
+function Base.show(io::IO, t::Token)  # renders as `KIND: "raw text"`
+    print(io, string(t.kind, ": ", repr(String(t.raw))))
+end
+
+#-----------------------------------------------------------------------# Tokenizer state
+@enum _State::UInt8 begin  # tokenizer mode; `_next_token!` picks a reader from it
+    _S_DEFAULT            # normal content mode
+    _S_TAG                # inside open tag, reading attributes
+    _S_TAG_VALUE          # expecting quoted attribute value
+    _S_CLOSE_TAG          # inside close tag, expecting >
+    _S_XML_DECL           # inside <?xml, reading pseudo-attributes
+    _S_XML_DECL_VALUE     # expecting quoted attr value in xml decl
+    _S_COMMENT            # after <!--, reading content
+    _S_CDATA              # after <![CDATA[, reading content
+    _S_PI                 # after <?target, reading content
+    _S_DOCTYPE            # after <!DOCTYPE, reading content
+end
+
+#-----------------------------------------------------------------------# Tokenizer
+mutable struct Tokenizer
+    const data::String               # full input text; the field is never reassigned
+    pos::Int                         # code-unit (byte) index of the next unread byte
+    state::_State                    # current mode, updated by the reader functions
+    pending::Union{Token,Nothing}    # token queued for the next `_next_token!` call
+end
+
+"""
+    tokenize(xml::AbstractString) -> Tokenizer
+
+Return a lazy iterator of `Token`s over `xml`; non-`String` input is converted with `String(xml)`.
+"""
+tokenize(xml::AbstractString) = Tokenizer(String(xml), 1, _S_DEFAULT, nothing)
+
+Base.IteratorSize(::Type{Tokenizer}) = Base.SizeUnknown()  # token count is not known up front
+Base.eltype(::Type{Tokenizer}) = Token
+
+function Base.iterate(t::Tokenizer, _=nothing)  # progress lives in `t`; the state arg is unused
+    next_token = _next_token!(t)
+    return isnothing(next_token) ? nothing : (next_token, nothing)
+end
+
+#-----------------------------------------------------------------------# Internal helpers
+@inline _iseof(t::Tokenizer) = t.pos > ncodeunits(t.data)  # true once every byte is consumed
+@inline _peek(t::Tokenizer) = @inbounds codeunit(t.data, t.pos)  # unchecked: rule out EOF first
+@inline _peek(t::Tokenizer, offset::Int) = @inbounds codeunit(t.data, t.pos + offset)  # unchecked: pair with `_canpeek`
+@inline _canpeek(t::Tokenizer, offset::Int) = t.pos + offset <= ncodeunits(t.data)  # is byte `pos + offset` in range?
+
+@inline function _is_name_byte(b::UInt8)  # ASCII only: every byte >= 0x80 is rejected
+    b in UInt8('a'):UInt8('z') || b in UInt8('A'):UInt8('Z') ||
+    b in UInt8('0'):UInt8('9') ||
+    b == UInt8('_') || b == UInt8('-') || b == UInt8('.') || b == UInt8(':')
+end
+
+@inline function _is_whitespace(b::UInt8)  # space, tab, line feed, carriage return
+    b in (UInt8(' '), UInt8('\t'), UInt8('\n'), UInt8('\r'))
+end
+
+function _skip_whitespace!(t::Tokenizer)  # leave `t.pos` on the first non-whitespace byte or at EOF
+    limit = ncodeunits(t.data)
+    while t.pos <= limit && _is_whitespace(_peek(t)); t.pos += 1; end
+    return nothing
+end
+
+function _skip_quoted!(t::Tokenizer)  # `t.pos` must be on the opening quote byte
+    quote_byte = _peek(t)
+    t.pos += 1
+    while !_iseof(t) && _peek(t) != quote_byte  # scan for the matching quote
+        t.pos += 1
+    end
+    # NOTE(review): plain `error` (no position info), unlike `_err` used elsewhere.
+    _iseof(t) ? error("Unterminated quoted string") : (t.pos += 1; return)
+end
+
+@noinline _err(msg, pos) = throw(ArgumentError("XML tokenizer error at position $pos: $msg"))  # always throws
+
+#-----------------------------------------------------------------------# Main dispatch
+function _next_token!(t::Tokenizer)
+    if t.pending !== nothing  # a queued closing delimiter always goes out first
+        tok = t.pending::Token
+        t.pending = nothing
+        return tok
+    end
+    s = t.state
+    # EOF is a clean stop only between constructs; mid-construct the reader must report it.
+    _iseof(t) && s == _S_DEFAULT && return nothing
+    if s == _S_DEFAULT
+        return _peek(t) == UInt8('<') ? _read_markup!(t) : _read_text!(t)
+    elseif s == _S_TAG || s == _S_XML_DECL
+        return _read_in_tag!(t)
+    elseif s == _S_TAG_VALUE || s == _S_XML_DECL_VALUE
+        return _read_attr_value!(t)
+    elseif s == _S_CLOSE_TAG
+        return _read_close_tag_end!(t)
+    elseif s == _S_COMMENT
+        return _read_comment_body!(t)
+    elseif s == _S_CDATA
+        return _read_cdata_body!(t)
+    elseif s == _S_PI
+        return _read_pi_body!(t)
+    else  # _S_DOCTYPE
+        return _read_doctype_body!(t)
+    end
+end
+
+#-----------------------------------------------------------------------# S_DEFAULT tokens
+function _read_text!(t::Tokenizer)  # character data up to the next '<' or the end of input
+    from = t.pos
+    lt = findnext(==(UInt8('<')), codeunits(t.data), from)
+    # With no '<' left, the text runs to the end of the input.
+    t.pos = lt === nothing ? ncodeunits(t.data) + 1 : lt
+    return Token(TOKEN_TEXT, SubString(t.data, from, prevind(t.data, t.pos)))
+end
+
+# Read the construct that starts at a '<' (`t.pos` is on that '<').
+# The following byte picks the reader: '!' -> comment / CDATA / declaration,
+# '?' -> processing instruction or XML declaration, '/' -> close tag, and
+# anything else -> open tag. A '<' as the very last byte is an error.
+function _read_markup!(t::Tokenizer)
+    lt = t.pos
+    t.pos += 1  # skip '<'
+    _iseof(t) && _err("unexpected end of input after '<'", lt)
+
+    # Dispatch on the byte right after '<'.
+    lead = _peek(t)
+    lead == UInt8('!') && return _read_bang!(t, lt)
+    lead == UInt8('?') && return _read_pi_start!(t, lt)
+    lead == UInt8('/') && return _read_close_tag_start!(t, lt)
+    return _read_open_tag_start!(t, lt)
+end
+
+#-----------------------------------------------------------------------# <! dispatch
+function _read_bang!(t::Tokenizer, start::Int)
+    t.pos += 1  # step over '!'
+    # 0x00 stands in for "no byte left"; it matches neither '-' nor '['.
+    lead = _iseof(t) ? 0x00 : _peek(t)
+
+    if lead == UInt8('-')
+        # Comment opener: a second '-' is mandatory.
+        t.pos += 1
+        (!_iseof(t) && _peek(t) == UInt8('-')) || _err("expected '<!--'", start)
+        t.pos += 1
+        t.state = _S_COMMENT
+        return Token(TOKEN_COMMENT_OPEN, SubString(t.data, start, t.pos - 1))
+    elseif lead == UInt8('[')
+        # CDATA opener: the bytes after '[' must spell "CDATA[".
+        t.pos += 1
+        for expected in codeunits("CDATA[")
+            _iseof(t) && _err("unterminated CDATA", start)
+            _peek(t) == expected || _err("invalid CDATA section", start)
+            t.pos += 1
+        end
+        t.state = _S_CDATA
+        return Token(TOKEN_CDATA_OPEN, SubString(t.data, start, t.pos - 1))
+    end
+
+    # Anything else is treated as a declaration such as <!DOCTYPE: take its keyword.
+    while !_iseof(t) && _is_name_byte(_peek(t))
+        t.pos += 1
+    end
+    t.state = _S_DOCTYPE
+    return Token(TOKEN_DOCTYPE_OPEN, SubString(t.data, start, t.pos - 1))
+end
+
+#-----------------------------------------------------------------------# <? (PI / XML declaration)
+function _read_pi_start!(t::Tokenizer, start::Int)
+    t.pos += 1  # step over '?'
+    target_start = t.pos
+    while !_iseof(t) && _is_name_byte(_peek(t))
+        t.pos += 1
+    end
+    target = SubString(t.data, target_start, t.pos - 1)
+    raw = SubString(t.data, start, t.pos - 1)  # "<?" plus the target
+
+    # Only a target of exactly "xml" (case-sensitive) opens an XML declaration, whose
+    # body is tokenized as pseudo-attributes; every other target is an ordinary PI
+    # whose body is read as one content token.
+    if target == "xml"
+        t.state = _S_XML_DECL
+        return Token(TOKEN_XML_DECL_OPEN, raw)
+    else
+        t.state = _S_PI
+        return Token(TOKEN_PI_OPEN, raw)
+    end
+end
+
+#-----------------------------------------------------------------------# Tags
+function _read_open_tag_start!(t::Tokenizer, start::Int)
+    while !_iseof(t) && _is_name_byte(_peek(t))  # name may be empty; not validated here
+        t.pos += 1
+    end
+    raw = SubString(t.data, start, t.pos - 1)  # '<' plus the name
+    t.state = _S_TAG; return Token(TOKEN_OPEN_TAG, raw)
+end
+
+function _read_close_tag_start!(t::Tokenizer, start::Int)
+    t.pos += 1  # step over '/'
+    while !_iseof(t) && _is_name_byte(_peek(t))  # name may be empty; not validated here
+        t.pos += 1
+    end
+    raw = SubString(t.data, start, t.pos - 1)  # "</" plus the name
+    t.state = _S_CLOSE_TAG; return Token(TOKEN_CLOSE_TAG, raw)
+end
+
+function _read_close_tag_end!(t::Tokenizer)  # after "</name": optional whitespace, then '>'
+    _skip_whitespace!(t)
+    gt = t.pos
+    _iseof(t) && _err("unterminated close tag", gt)
+    _peek(t) == UInt8('>') || _err("expected '>'", gt)
+    t.pos = gt + 1
+    t.state = _S_DEFAULT
+    return Token(TOKEN_TAG_CLOSE, SubString(t.data, gt, gt))
+end
+
+#-----------------------------------------------------------------------# Attributes (shared by S_TAG and S_XML_DECL)
+function _read_in_tag!(t::Tokenizer)
+    _skip_whitespace!(t)
+    _iseof(t) && _err("unterminated tag", t.pos)
+
+    in_decl = (t.state == _S_XML_DECL)
+    lead = _peek(t)
+    gt_follows = _canpeek(t, 1) && _peek(t, 1) == UInt8('>')
+
+    # End delimiters: "?>" closes an XML declaration; ">" or "/>" closes an element
+    # tag. Each one hands control back to normal content mode. A delimiter that
+    # does not belong to the current state falls through to the attribute path.
+    if in_decl
+        if lead == UInt8('?') && gt_follows
+            from = t.pos; t.pos += 2; t.state = _S_DEFAULT
+            return Token(TOKEN_XML_DECL_CLOSE, SubString(t.data, from, from + 1))
+        end
+    elseif lead == UInt8('>')
+        from = t.pos; t.pos += 1; t.state = _S_DEFAULT
+        return Token(TOKEN_TAG_CLOSE, SubString(t.data, from, from))
+    elseif lead == UInt8('/') && gt_follows
+        from = t.pos; t.pos += 2; t.state = _S_DEFAULT
+        return Token(TOKEN_SELF_CLOSE, SubString(t.data, from, from + 1))
+    end
+
+    # Otherwise an attribute (or pseudo-attribute) name must start here.
+    name_start = t.pos
+    while !_iseof(t) && _is_name_byte(_peek(t))
+        t.pos += 1
+    end
+    name = SubString(t.data, name_start, t.pos - 1)
+    isempty(name) && _err("expected attribute name or tag close", t.pos)
+
+    # '=' and the whitespace around it belong to no token; the value comes next.
+    _skip_whitespace!(t)
+    (!_iseof(t) && _peek(t) == UInt8('=')) || _err("expected '=' after attribute name", t.pos)
+    t.pos += 1
+    _skip_whitespace!(t)
+
+    t.state = in_decl ? _S_XML_DECL_VALUE : _S_TAG_VALUE
+    return Token(TOKEN_ATTR_NAME, name)
+end
+
+function _read_attr_value!(t::Tokenizer)
+    _iseof(t) && _err("expected attribute value", t.pos)
+
+    opening = t.pos
+    delim = _peek(t)  # whichever quote opens the value must also close it
+    delim in (UInt8('"'), UInt8('\'')) || _err("expected quoted attribute value", opening)
+
+    t.pos += 1
+    while !_iseof(t) && _peek(t) != delim  # no escape handling at this level
+        t.pos += 1
+    end
+    _iseof(t) && _err("unterminated attribute value", opening)
+    t.pos += 1  # step over the closing quote
+
+    t.state = (t.state == _S_XML_DECL_VALUE) ? _S_XML_DECL : _S_TAG
+    return Token(TOKEN_ATTR_VALUE, SubString(t.data, opening, t.pos - 1))
+end
+
+#-----------------------------------------------------------------------# Content bodies (comment, CDATA, PI, DOCTYPE)
+function _read_comment_body!(t::Tokenizer)
+    start = t.pos
+    closer = findnext("-->", t.data, start)  # byte range of the first terminator
+    if closer === nothing
+        t.pos = ncodeunits(t.data) + 1  # the whole remainder was scanned
+        _err("unterminated comment", start)
+    end
+
+    # One call yields two tokens: the content is returned now and the "-->" is
+    # parked in `t.pending` for the next call. The content may be empty, in which
+    # case the slice below ends before it starts.
+    content = SubString(t.data, start, prevind(t.data, first(closer)))
+    t.pos = last(closer) + 1
+    t.state = _S_DEFAULT
+    t.pending = Token(TOKEN_COMMENT_CLOSE, SubString(t.data, first(closer), last(closer)))
+    return Token(TOKEN_COMMENT_CONTENT, content)
+end
+
+function _read_cdata_body!(t::Tokenizer)
+    start = t.pos
+    closer = findnext("]]>", t.data, start)  # byte range of the first terminator
+    if closer === nothing
+        t.pos = ncodeunits(t.data) + 1  # the whole remainder was scanned
+        _err("unterminated CDATA section", start)
+    end
+
+    # One call yields two tokens: the content is returned now and the "]]>" is
+    # parked in `t.pending` for the next call. The content may be empty, in which
+    # case the slice below ends before it starts.
+    content = SubString(t.data, start, prevind(t.data, first(closer)))
+    t.pos = last(closer) + 1
+    t.state = _S_DEFAULT
+    t.pending = Token(TOKEN_CDATA_CLOSE, SubString(t.data, first(closer), last(closer)))
+    return Token(TOKEN_CDATA_CONTENT, content)
+end
+
+function _read_pi_body!(t::Tokenizer)
+    start = t.pos
+    closer = findnext("?>", t.data, start)  # byte range of the first terminator
+    if closer === nothing
+        t.pos = ncodeunits(t.data) + 1  # the whole remainder was scanned
+        _err("unterminated processing instruction", start)
+    end
+    # Content (everything after the target, possibly empty) is returned now; the
+    # "?>" is parked in `t.pending` for the next call.
+    content = SubString(t.data, start, prevind(t.data, first(closer)))
+    t.pos = last(closer) + 1
+    t.state = _S_DEFAULT
+    t.pending = Token(TOKEN_PI_CLOSE, SubString(t.data, first(closer), last(closer)))
+    return Token(TOKEN_PI_CONTENT, content)
+end
+
+function _read_doctype_body!(t::Tokenizer)
+    start = t.pos
+    depth = 0  # nesting level of '[' ... ']' seen so far
+    while !_iseof(t)
+        b = _peek(t)
+        if b == UInt8('-') && _canpeek(t, 1) && _peek(t, 1) == UInt8('-') &&
+                t.pos >= 3 &&  # both look-behind bytes (pos - 1 and pos - 2) must exist
+                codeunit(t.data, t.pos - 1) == UInt8('!') &&
+                codeunit(t.data, t.pos - 2) == UInt8('<')
+            # Inside a <!-- comment: skip until -->
+            t.pos += 2  # skip "--"
+            while !_iseof(t)
+                if _peek(t) == UInt8('-') && _canpeek(t, 1) && _peek(t, 1) == UInt8('-') &&
+                        _canpeek(t, 2) && _peek(t, 2) == UInt8('>')
+                    t.pos += 3  # skip "-->"
+                    break
+                end
+                t.pos += 1
+            end
+        elseif b == UInt8('"') || b == UInt8('\'')
+            _skip_quoted!(t)  # quoted text is skipped whole, so a '>' or bracket in it is ignored
+        elseif b == UInt8('[')
+            depth += 1
+            t.pos += 1
+        elseif b == UInt8(']')
+            depth -= 1
+            t.pos += 1
+        elseif b == UInt8('>') && depth == 0  # a '>' inside brackets does not end the DOCTYPE
+            content_end = prevind(t.data, t.pos)
+            close_start = t.pos
+            t.pos += 1
+            t.state = _S_DEFAULT
+            t.pending = Token(TOKEN_DOCTYPE_CLOSE, SubString(t.data, close_start, t.pos - 1))
+            return Token(TOKEN_DOCTYPE_CONTENT, SubString(t.data, start, content_end))
+        else
+            t.pos += 1
+        end
+    end
+    _err("unterminated DOCTYPE", start)  # also reached when a nested comment never closes
+end
+
+#-----------------------------------------------------------------------# Utility functions
+
+"""
+    tag_name(token::Token) -> SubString{String}
+
+Extract the element name from a `TOKEN_OPEN_TAG` or `TOKEN_CLOSE_TAG` token; any other kind throws `ArgumentError`.
+"""
+function tag_name(token::Token)
+    if token.kind == TOKEN_OPEN_TAG
+        SubString(token.raw, 2, lastindex(token.raw))  # skip '<'; lastindex is safe for multi-byte text
+    elseif token.kind == TOKEN_CLOSE_TAG
+        SubString(token.raw, 3, lastindex(token.raw))  # skip '</'
+    else
+        throw(ArgumentError("tag_name requires OPEN_TAG or CLOSE_TAG, got $(token.kind)"))
+    end
+end
+
+"""
+    attr_value(token::Token) -> SubString{String}
+
+Return the text of a `TOKEN_ATTR_VALUE` token without its first and last character (the quotes).
+"""
+function attr_value(token::Token)
+    kind, raw = token.kind, token.raw
+    kind == TOKEN_ATTR_VALUE || throw(ArgumentError("attr_value requires ATTR_VALUE, got $(kind)"))
+    return SubString(raw, 2, prevind(raw, lastindex(raw)))
+end
+
+"""
+    pi_target(token::Token) -> SubString{String}
+
+Extract the target name from a `TOKEN_PI_OPEN` or `TOKEN_XML_DECL_OPEN` token; any other kind throws `ArgumentError`.
+"""
+function pi_target(token::Token)
+    (token.kind == TOKEN_PI_OPEN || token.kind == TOKEN_XML_DECL_OPEN) ||
+        throw(ArgumentError("pi_target requires PI_OPEN or XML_DECL_OPEN, got $(token.kind)"))
+    SubString(token.raw, 3, lastindex(token.raw))  # skip '<?'; lastindex is safe for multi-byte text
+end
+
+end # module XMLTokenizer
diff --git a/src/xpath.jl b/src/xpath.jl
new file mode 100644
index 0000000..b0a9725
--- /dev/null
+++ b/src/xpath.jl
@@ -0,0 +1,300 @@
+#-----------------------------------------------------------------------------# XPath
+# A subset of XPath 1.0 for querying XML.Node trees.
+#
+# Supported syntax:
+#   /            root (absolute path)
+#   tag          child element by name
+#   *            any child element
+#   //           descendant-or-self (recursive)
+#   .            current node
+#   ..           parent node
+#   [n], [last()]  positional predicates (1-based; last() selects the final node)
+#   [@attr]      has-attribute predicate
+#   [@attr='v']  attribute-value predicate
+#   text()       text node children
+#   node()       all node children
+#   @attr        attribute value (returns strings)
+
+#-----------------------------------------------------------------------------# Token types
+
+@enum XPathTokenKind::UInt8 begin  # kind tag carried by every `XPathToken`
+    XPATH_ROOT           # / (leading: absolute path; elsewhere: step separator)
+    XPATH_DESCENDANT     # //
+    XPATH_NAME           # tag name
+    XPATH_WILDCARD       # *
+    XPATH_DOT            # .
+    XPATH_DOTDOT         # ..
+    XPATH_TEXT_FN        # text()
+    XPATH_NODE_FN        # node()
+    XPATH_PREDICATE      # [...] (value holds the text between the brackets)
+    XPATH_ATTRIBUTE      # @attr (in result position; value holds the name without '@')
+end
+
+struct XPathToken
+    kind::XPathTokenKind    # what the token is
+    value::String           # its text: tag name, attribute name, predicate body, or the literal symbol
+end
+
+#-----------------------------------------------------------------------------# Tokenizer
+
+function _xpath_tokenize(expr::AbstractString)  # split an XPath string into `XPathToken`s
+    tokens = XPathToken[]
+    s = String(expr)
+    i = 1  # byte index of the current character; always kept on a character boundary
+    n = ncodeunits(s)
+    # Multi-byte characters are stepped over with nextind/prevind, never with +1/-1.
+    while i <= n
+        c = s[i]
+
+        if c == '/'
+            if i < n && s[i+1] == '/'  # '/' is one byte, so i+1 is a valid index
+                push!(tokens, XPathToken(XPATH_DESCENDANT, "//"))
+                i += 2
+            else
+                push!(tokens, XPathToken(XPATH_ROOT, "/"))
+                i += 1
+            end
+
+        elseif c == '.'
+            if i < n && s[i+1] == '.'
+                push!(tokens, XPathToken(XPATH_DOTDOT, ".."))
+                i += 2
+            else
+                push!(tokens, XPathToken(XPATH_DOT, "."))
+                i += 1
+            end
+
+        elseif c == '*'
+            push!(tokens, XPathToken(XPATH_WILDCARD, "*"))
+            i += 1
+
+        elseif c == '['
+            j = findnext(']', s, i + 1)  # the first ']' ends the predicate; no nesting or quoting
+            isnothing(j) && error("Unterminated predicate in XPath: $(repr(s))")
+            push!(tokens, XPathToken(XPATH_PREDICATE, SubString(s, i + 1, prevind(s, j))))
+            i = j + 1
+
+        elseif c == '@'
+            j = i + 1
+            while j <= n && (isletter(s[j]) || s[j] == '-' || s[j] == '_' || s[j] == ':' || isdigit(s[j]))
+                j = nextind(s, j)
+            end
+            j == i + 1 && error("Empty attribute name after @ in XPath: $(repr(s))")
+            push!(tokens, XPathToken(XPATH_ATTRIBUTE, SubString(s, i + 1, prevind(s, j))))
+            i = j
+
+        elseif isletter(c) || c == '_'
+            j = nextind(s, i)  # the first letter may itself be multi-byte
+            while j <= n && (isletter(s[j]) || s[j] == '-' || s[j] == '_' || s[j] == ':' || isdigit(s[j]) || s[j] == '.')
+                j = nextind(s, j)
+            end
+            name = SubString(s, i, prevind(s, j))
+            # Check for function calls: text(), node()
+            if j <= n && s[j] == '('
+                j2 = findnext(')', s, j + 1)  # anything between the parentheses is ignored
+                isnothing(j2) && error("Unterminated function call in XPath: $(repr(s))")
+                if name == "text"
+                    push!(tokens, XPathToken(XPATH_TEXT_FN, "text()"))
+                elseif name == "node"
+                    push!(tokens, XPathToken(XPATH_NODE_FN, "node()"))
+                else
+                    error("Unknown XPath function: $name()")
+                end
+                i = j2 + 1
+            else
+                push!(tokens, XPathToken(XPATH_NAME, String(name)))
+                i = j
+            end
+
+        elseif isspace(c)
+            i = nextind(s, i)  # some whitespace characters are wider than one byte
+
+        else
+            error("Unexpected character '$(c)' in XPath: $(repr(s))")
+        end
+    end
+    tokens
+end
+
+#-----------------------------------------------------------------------------# Predicate evaluation
+
+function _eval_predicate(predicate::AbstractString, nodes::Vector{Node{S}}, root::Node{S}) where S
+    s = strip(predicate)  # `root` is accepted for signature parity but not used here
+
+    # Positional: [n] — indexes the whole current node list, not each parent's children
+    pos = tryparse(Int, s)
+    if !isnothing(pos)
+        1 <= pos <= length(nodes) || return Node{S}[]
+        return [nodes[pos]]
+    end
+
+    # last()
+    if s == "last()"
+        isempty(nodes) && return Node{S}[]
+        return [nodes[end]]
+    end
+
+    # [@attr] — has attribute
+    m = match(r"^@([A-Za-z_:][\w.\-:]*)$", s)
+    if !isnothing(m)
+        attr_name = m.captures[1]
+        return filter(n -> n.nodetype === Element && haskey(n, attr_name), nodes)
+    end
+
+    # [@attr='value'] or [@attr="value"] — quotes must pair up; the value may hold the other kind
+    m = match(r"^@([A-Za-z_:][\w.\-:]*)\s*=\s*(?:'([^']*)'|\"([^\"]*)\")$", s)
+    if !isnothing(m)
+        attr_name = m.captures[1]
+        attr_val = something(m.captures[2], m.captures[3])  # exactly one alternative matched
+        return filter(n -> n.nodetype === Element && get(n, attr_name, nothing) == attr_val, nodes)
+    end
+
+    error("Unsupported XPath predicate: [$predicate]")
+end
+
+#-----------------------------------------------------------------------------# Step evaluation
+
+function _xpath_step(nodes::Vector{Node{S}}, token::XPathToken, root::Node{S}) where S
+    result = Node{S}[]  # nodes selected by applying `token` to every node in `nodes`
+    k = token.kind
+
+    if k === XPATH_NAME
+        for n in nodes
+            for c in children(n)
+                c.nodetype === Element && c.tag == token.value && push!(result, c)
+            end
+        end
+
+    elseif k === XPATH_WILDCARD
+        for n in nodes
+            for c in children(n)
+                c.nodetype === Element && push!(result, c)
+            end
+        end
+
+    elseif k === XPATH_DOT
+        append!(result, nodes)
+
+    elseif k === XPATH_DOTDOT
+        for n in nodes
+            n === root && continue  # the search root has no parent within this query
+            p = _find_parent(n, root)
+            (isnothing(p) || any(x -> x === p, result)) || push!(result, p)  # siblings share a parent: keep it once
+        end
+
+    elseif k === XPATH_TEXT_FN
+        for n in nodes
+            for c in children(n)
+                c.nodetype === Text && push!(result, c)
+            end
+        end
+
+    elseif k === XPATH_NODE_FN
+        for n in nodes
+            append!(result, children(n))
+        end
+
+    elseif k === XPATH_DESCENDANT
+        # Handled by caller — collects all descendants before next step
+        error("XPATH_DESCENDANT should be handled by the evaluator, not _xpath_step")
+    end
+
+    result  # any other token kind falls through and yields an empty list
+end
+
+function _descendants!(out::Vector{Node{S}}, node::Node{S}) where S
+    # Pre-order walk: each child is recorded before its own subtree.
+    foreach(children(node)) do child
+        push!(out, child); _descendants!(out, child)
+    end
+end
+
+function _descendants(nodes::Vector{Node{S}}) where S
+    collected = Node{S}[]
+    for start in nodes
+        push!(collected, start)  # descendant-or-self: the start node comes first
+        _descendants!(collected, start)  # then its whole subtree, pre-order
+    end
+    return collected
+end
+
+#-----------------------------------------------------------------------------# Main evaluator
+
+"""
+    xpath(node::Node, expr::AbstractString) -> Vector{Node}
+
+Evaluate an XPath expression against a `Node` tree and return matching nodes.
+
+Supports a practical subset of XPath 1.0:
+- Absolute (`/root/child`) and relative (`child/sub`) paths
+- Recursive descent (`//tag`)
+- Wildcards (`*`), self (`.`), parent (`..`)
+- Positional predicates (`[1]`, `[last()]`)
+- Attribute predicates (`[@attr]`, `[@attr='value']`)
+- `text()` and `node()` functions
+- Attribute selection (`@attr`) — returns `Text` nodes containing attribute values
+
+# Notes
+- Evaluation starts at `node`; a leading `/` does not change that start context.
+- `..` is resolved by looking the parent up from `node`, and `node` itself is skipped,
+  so a query does not climb above the node it was given.
+- Predicates filter the whole current node list: `//a[1]` is the first match overall,
+  not the first `a` under each parent.
+- `@attr` produces one new `Text` node per element that has the attribute; nodes that
+  are not elements are skipped.
+- A trailing `//` returns every descendant-or-self node of the current context.
+- Expressions outside this subset raise an error (unknown function, unsupported
+  predicate, unexpected character, unterminated `[` or `(`).
+
+# Examples
+```julia
+doc = parse("<root><a x='1'/><a x='2'/><b/></root>", Node)
+xpath(doc, "/root/a")          # both <a> elements
+xpath(doc, "/root/a[1]")       # first <a>
+xpath(doc, "//a[@x='2']")      # <a x="2"/>
+xpath(doc, "/root/b/@x")       # attribute value as Text node (empty here)
+```
+"""
+function xpath(node::Node{S}, expr::AbstractString) where S
+    tokens = _xpath_tokenize(expr)
+    isempty(tokens) && return Node{S}[]
+
+    # `node` doubles as the search root for `..` navigation.
+    root = node
+
+    # The context always starts at `node` itself; a leading `/` selects the same
+    # start context and is simply skipped.
+    current = Node{S}[node]
+    i = tokens[1].kind === XPATH_ROOT ? 2 : 1
+
+    while i <= length(tokens)
+        tok = tokens[i]
+
+        # Each token either filters the current list (predicate), widens it (`//`),
+        # or maps it to a new list (every other step).
+        if tok.kind === XPATH_PREDICATE
+            current = _eval_predicate(tok.value, current, root)
+        elseif tok.kind === XPATH_DESCENDANT
+            # `//`: take every descendant-or-self node; the next step narrows it down
+            current = _descendants(current)
+        elseif tok.kind === XPATH_ROOT
+            # `/` between steps is only a separator
+        elseif tok.kind === XPATH_ATTRIBUTE
+            # @attr in result position — return attribute values as Text nodes.
+            # Only elements carry attributes, so other node kinds are skipped
+            # (the same guard the attribute predicates use).
+            result = Node{S}[]
+            for n in current
+                n.nodetype === Element || continue
+                v = get(n, tok.value, nothing)
+                !isnothing(v) && push!(result, Node{S}(Text, nothing, nothing, v, nothing))
+            end
+            current = result
+        else
+            current = _xpath_step(current, tok, root)
+        end
+        i += 1
+    end
+
+    current
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 89978eb..1304245 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,646 +1,2682 @@
 using XML
-using XML: Document, Element, Declaration, Comment, CData, DTD, ProcessingInstruction, Text, escape, unescape, OrderedDict, h
-using Downloads: download
+using XML: Document, Element, Declaration, Comment, CData, DTD, ProcessingInstruction, Text
+using XML: escape, unescape, h, parse_dtd
+using XML: ParsedDTD, ElementDecl, AttDecl, EntityDecl, NotationDecl
 using Test
-import AbstractTrees
-
-AbstractTrees.children(x::Node) = children(x)
-
-#-----------------------------------------------------------------------------# files
-xml_xsd = joinpath("data", "xml.xsd")
-kml_xsd = joinpath("data", "kml.xsd")
-books_xml = joinpath("data", "books.xml")
-example_kml = joinpath("data", "example.kml")
-simple_dtd = joinpath("data", "simple_dtd.xml")
-
-all_files = [xml_xsd, kml_xsd, books_xml, example_kml, simple_dtd]
-
-#-----------------------------------------------------------------------------# h
-@testset "h function" begin
-    @test h.tag == XML.Element("tag")
-    @test h.tag(id="id") == XML.Element("tag"; id="id")
-    @test h.tag(1, 2, a="a", b="b") == XML.Element("tag", 1, 2; a="a", b="b")
-end
-
-#-----------------------------------------------------------------------------# escaping/unescaping
-@testset "escaping/unescaping" begin
-    s = "This > string < has & some \" special ' characters"
-    @test escape(s) == "This &gt; string &lt; has &amp; some &quot; special &apos; characters"
-    @test escape(escape(s)) == escape(s)
-    @test s == unescape(escape(s))
-    @test s == unescape(unescape(escape(s)))
-
-    n = Element("tag", Text(s))
-    @test XML.simple_value(n) == s
-
-    XML.escape!(n)
-    @test XML.simple_value(n) == escape(s)
-
-    XML.unescape!(n)
-    @test XML.simple_value(n) == s
-end
-
-#-----------------------------------------------------------------------------# DTD
-# @testset "DTDBody and friends" begin
-#     s = read(simple_dtd, String)
-#     data = read(simple_dtd)
-
-#     dtd = XML.DTDBody(data)
-#     dtd2 = parse(s, XML.DTDBody)
-
-#     @test length(dtd.elements) == length(dtd2.elements) == 0
-#     @test length(dtd.attributes) == length(dtd2.attributes) == 0
-#     @test length(dtd.entities) == length(dtd2.entities) == 3
-
-#     o = read("data/tv.dtd", XML.DTDBody)
-# end
-
-#-----------------------------------------------------------------------------# Raw
-@testset "Raw tag/attributes/value" begin
-    examples = [
-        (xml = "<!DOCTYPE html>",
-            nodetype = DTD,
-            tag=nothing,
-            attributes=nothing,
-            value="html"),
-        (xml = "<?xml version=\"1.0\" key=\"value\"?>",
-            nodetype = Declaration,
-            tag=nothing,
-            attributes=Dict("version" => "1.0", "key" => "value"),
-            value=nothing),
-        (xml = "<tag _id=\"1\", x=\"abc\" />",
-            nodetype = Element,
-            tag="tag",
-            attributes=Dict("_id" => "1", "x" => "abc"),
-            value=nothing),
-        (xml = "<!-- comment -->",
-            nodetype = Comment,
-            tag=nothing,
-            attributes=nothing,
-            value=" comment "),
-        (xml = "<![CData[cdata test]]>",
-            nodetype = CData,
-            tag=nothing,
-            attributes=nothing,
-            value="cdata test"),
-    ]
-    for x in examples
-        # @info "Testing: $(x.xml)"
-        data = XML.next(XML.parse(x.xml, XML.Raw))
-        @test XML.nodetype(data) == x.nodetype
-        @test XML.tag(data) == x.tag
-        @test XML.attributes(data) == x.attributes
-        @test XML.value(data) == x.value
-    end
-end
-
-@testset "Raw with books.xml" begin
-    data = read(books_xml, XML.Raw)
-    doc = collect(data)
-    @test length(doc) > countlines(books_xml)
-    # Check that the first 5 lines are correct
-    first_5_lines = [
-        XML.RawDeclaration => """<?xml version="1.0"?>""",
-        XML.RawElementOpen => "<catalog>",
-        XML.RawElementOpen => "<book id=\"bk101\">",
-        XML.RawElementOpen => "<author>",
-        XML.RawText => "Gambardella, Matthew"
-    ]
-    for (i, (typ, str)) in enumerate(first_5_lines)
-        dt = doc[i]
-        @test dt.type == typ
-        @test String(dt) == str
-    end
-    # Check that the last line is correct
-    @test doc[end].type == XML.RawElementClose
-    @test String(doc[end]) == "</catalog>"
-
-    @testset "next and prev" begin
-        @test XML.prev(doc[1]) == data # can't use === here because prev returns a copy of ctx
-        @test prev(data) === nothing
-        @test XML.next(doc[end]) === nothing
-
-        n = length(doc)
-        next_res = [doc[1]]
-        foreach(_ -> push!(next_res, XML.next(next_res[end])), 1:n-1)
-
-        prev_res = [doc[end]]
-        foreach(_ -> pushfirst!(prev_res, XML.prev(prev_res[1])), 1:n-1)
-
-        idx = findall(next_res .!= prev_res)
-
-        for (a,b) in zip(next_res, prev_res)
-            @test a == b
-        end
-
-        lzxml = """<root><text>    </text><text2>  hello  </text2><text3 xml:space="preserve">  hello  <text3b>  preserve  </text3b></text3><text4 xml:space="preserve"></text4><text5/></root>"""
-        lz = XML.parse(XML.LazyNode, lzxml)
-        n=XML.next(lz)
-        n=XML.next(n)
-        text_content = XML.write(n)
-       @test text_content == "<text/>"
-        n=XML.next(n)
-        text_content = XML.write(n)
-        @test text_content == "<text2>hello</text2>"
-        n=XML.next(n)
-        text_content = XML.write(n)
-        @test text_content == "hello"
-        n=XML.next(n)
-        text_content = XML.write(n)
-        @test text_content == "<text3 xml:space=\"preserve\">  hello  <text3b>  preserve  </text3b></text3>"
-        n=XML.prev(n)
-        text_content = XML.write(n)
-        @test text_content == "hello"
-        n=XML.next(n)
-        text_content = XML.write(n)
-        @test text_content == "<text3 xml:space=\"preserve\">  hello  <text3b>  preserve  </text3b></text3>"
-        n=XML.next(n)
-        text_content = XML.write(n)
-        @test text_content == "  hello  "
-        n=XML.next(n)
-        text_content = XML.write(n)
-        @test text_content == "<text3b>  preserve  </text3b>"
-        n=XML.next(n)
-        text_content = XML.write(n)
-        @test text_content == "  preserve  "
-        n=XML.next(n)
-        text_content = XML.write(n)
-        @test text_content == "<text4 xml:space=\"preserve\"/>"
-        n=XML.next(n)
-        text_content = XML.write(n)
-        @test text_content == "<text5/>"
-        n=XML.prev(n)
-        text_content = XML.write(n)
-        @test text_content == "<text4 xml:space=\"preserve\"/>"
-        n=XML.prev(n)
-        text_content = XML.write(n)
-        @test text_content == "  preserve  "
-        n=XML.prev(n)
-        text_content = XML.write(n)
-        @test text_content == "<text3b>  preserve  </text3b>"
-        n=XML.prev(n)
-        text_content = XML.write(n)
-        @test text_content == "  hello  "
-        n=XML.prev(n)
-        text_content = XML.write(n)
-        @test text_content == "<text3 xml:space=\"preserve\">  hello  <text3b>  preserve  </text3b></text3>"
-        n=XML.next(n)
-        text_content = XML.write(n)
-        @test text_content == "  hello  "
-        n=XML.prev(n)
-        text_content = XML.write(n)
-        @test text_content == "<text3 xml:space=\"preserve\">  hello  <text3b>  preserve  </text3b></text3>"
-        n=XML.prev(n)
-        text_content = XML.write(n)
-        @test text_content == "hello"
-        n=XML.prev(n)
-        text_content = XML.write(n)
-        @test text_content == "<text2>hello</text2>"
-        n=XML.prev(n)
-        text_content = XML.write(n)
-        @test text_content == "<text/>"
-        n=XML.prev(n)
-        text_content = XML.write(n)
-        @test text_content == "<root>\n  <text/>\n  <text2>hello</text2>\n  <text3 xml:space=\"preserve\">  hello  <text3b>  preserve  </text3b></text3>\n  <text4 xml:space=\"preserve\"/>\n  <text5/>\n</root>"
-    end
-
-    @testset "depth and parent" begin
-        @test XML.depth(data) == 0
-        @test isnothing(XML.parent(data))
-        @test XML.depth(doc[1]) == 1
-        @test XML.parent(doc[1]) == data
-        @test XML.depth(doc[2]) == 1
-        @test XML.depth(doc[3]) == 2
-        @test XML.parent(doc[3]) == doc[2]
-        @test XML.depth(doc[end]) == 1
-        @test XML.parent(doc[end]) == data
-    end
-
-    @testset "tag/attributes/value" begin
-        x = doc[1]  # <?xml version="1.0"?>
-        @test XML.tag(x) === nothing
-        @test XML.attributes(x) == Dict("version" => "1.0")
-        @test XML.value(x) === nothing
-
-        x = XML.next(x)  # <catalog>
-        @test XML.tag(x) == "catalog"
-        @test XML.attributes(x) === nothing
-        @test XML.value(x) === nothing
-
-        x = XML.next(x)  # <book id="bk101">
-        @test XML.tag(x) == "book"
-        @test XML.attributes(x) == Dict("id" => "bk101")
-        @test XML.value(x) === nothing
-
-        x = XML.next(x)  # <author>
-        @test XML.tag(x) == "author"
-        @test XML.attributes(x) === nothing
-        @test XML.value(x) === nothing
-
-        x = XML.next(x)  # Gambardella, Matthew
-        @test XML.tag(x) === nothing
-        @test XML.attributes(x) === nothing
-        @test XML.value(x) == "Gambardella, Matthew"
-    end
-end
-
-#-----------------------------------------------------------------------------# Preserve whitespace
-@testset "xml:space" begin
-    @testset "Basic xml:space functionality" begin
-
-        # Test 1: xml:space="preserve" should preserve entirely empty whitespace
-        xml1 = """<root><text xml:space="preserve">   </text></root>"""
-        doc1 = parse(XML.Node, xml1)
-        text_content = XML.value(doc1[1][1][1])
-        @test text_content == "   "
-
-        # Test 2: xml:space="preserve" should preserve leading and trailing whitespace
-        xml2 = """<root><text xml:space="preserve">  leading and trailing spaces  </text></root>"""
-        doc2 = parse(XML.Node, xml2)
-        text_content = XML.value(doc2[1][1][1])
-        @test text_content == "  leading and trailing spaces  "
-        
-        # Test 3: Entirely empty tags with and without xml:space="preserve" become self-closing
-        xml3 = """<root><text>    </text><text2 xml:space="preserve">    </text2><text3 xml:space="preserve"></text3><text4/></root>"""
-        doc3 = XML.parse(XML.Node, xml3)
-        text_content = XML.write(doc3[1][1])
-        @test text_content == "<text/>" # without xml:space="preserve", empty text becomes self-closing
-        text_content = XML.value(doc3[1][2][1])
-        @test text_content == "    " # with xml:space, whitespace is preserved
-        text_content = XML.write(doc3[1][3])
-        @test text_content == "<text3 xml:space=\"preserve\"/>" # with xml:space="preserve", empty text becomes self-closing
-
-        # Test 4: Without xml:space, whitespace should be normalized
-        xml4 = """<root><text>  gets normalized  </text></root>"""
-        doc4 = XML.parse(XML.Node, xml4)
-        text_content = XML.value(doc4[1][1][1])
-        @test text_content == "gets normalized"
-        
-        # Test 5: xml:space="default" should normalize even with preserve_xml_space=true
-        xml5 = """<root><text xml:space="default">  gets normalized  </text></root>"""
-        doc5 = XML.parse(XML.Node, xml5)
-        text_content = XML.value(doc5[1][1][1])
-        @test text_content == "gets normalized"
-    end
-    
-    @testset "xml:space inheritance" begin
-        # Test 6: Children inherit parent's xml:space="preserve"
-        xml6 = """<root xml:space="preserve">
-            <parent>  parent text  
-                <child>  child text  </child>
-            </parent>
-        </root>"""
-        doc6 = XML.parse(XML.Node, xml6)
-        # Both parent and child should preserve whitespace
-        @test contains(XML.value(doc6[1][2][1]), "parent text  \n")
-        @test XML.value(doc6[1][2][2][1]) == "  child text  "
-        
-        # Test 7: xml:space="default" overrides parent's "preserve"
-        xml7 = """<root xml:space="preserve">
-            <child xml:space="default">  normalized despite parent  </child>
-        </root>"""
-        doc7 = XML.parse(XML.Node, xml7)
-        @test XML.value(doc7[1][2][1]) == "normalized despite parent"
-    end
-    
-    @testset "Nesting scenarios" begin
-        # Test 8: Multiple levels of xml:space changes
-        xml8 = """<root xml:space="preserve">
-            <level1>  preserved  
-                <level2 xml:space="default">  normalized  
-                    <level3 xml:space="preserve">  preserved again  </level3>
-                </level2>
-            </level1>
-        </root>"""
-        doc8 = XML.parse(XML.Node, xml8)
-        
-        # level1 should preserve (inherits from root)
-        level1_text = XML.value(doc8[1][2][1])
-        @test level1_text == "  preserved  \n        "
-        
-        # level2 should normalize (explicit xml:space="default")
-        level2_text = XML.value(doc8[1][2][2][1])
-        @test level2_text == "normalized"
-        
-        # level3 should preserve (explicit xml:space="preserve")
-        level3_text = XML.value(doc8[1][2][2][2][1])
-        @test level3_text == "  preserved again  "
-
-        # Test 9: repeated multiple levels of xml:space changes
-        xml9 = """<root xml:space="preserve">
-            <level1>  preserved  
-                <level2 xml:space="default">  normalized  
-                    <level3 xml:space="preserve">  preserved again  </level3>
-                </level2>
-            </level1>  
-            <level1b>  preserved b  
-                <level2b xml:space="default">  normalized b 
-                    <level3b xml:space="preserve">  preserved again b  </level3b>
-                </level2b>
-            </level1b>
-        </root>"""
-        doc9 = XML.parse(XML.Node, xml9)
-
-        # level1b should preserve (inherits from root)
-        level1b_text = XML.value(doc9[1][4][1])
-        @test level1b_text == "  preserved b  \n        "
-        
-        # level2 should normalize (explicit xml:space="default")
-        level2b_text = XML.value(doc9[1][4][2][1])
-        @test level2b_text == "normalized b"
-        
-        # level3 should preserve (explicit xml:space="preserve")
-        level3b_text = XML.value(doc9[1][4][2][2][1])
-        @test level3b_text == "  preserved again b  "
-
-        # Test 10: futher repeated multiple levels of xml:space changes
-        xml10 = """<root>
-            <level1>  normalized  
-                <level2>  normalized b  
-                    <level3 xml:space="preserve">  preserved   </level3>
-                </level2>
-            </level1>  
-            <level1b>  normalized c  
-                <level2b xml:space="preserve">  preserved b 
-                    <level3b xml:space="default">  normalized again b  </level3b>
-                    <level3c>  preserved c 
-                    </level3c>
-                </level2b>
-            </level1b>
-            <level1c>  normalized d   </level1c>
-        </root>"""
-        doc10 = XML.parse(XML.Node, xml10)
-        
-        # level1 should normalize (as root)
-        level1_text = XML.value(doc10[end][1][1])
-        @test level1_text == "normalized"
-        
-        # level2 should normalize (as root and level1)
-        level2_text = XML.value(doc10[end][1][2][1])
-        @test level2_text == "normalized b"
-        
-        # level3 should preserve (explicit xml:space="preserve")
-        level3_text = XML.value(doc10[end][1][2][2][1])
-        @test level3_text == "  preserved   "
-        
-        # level1b should normalize (as root)
-        level1b_text = XML.value(doc10[end][2][1])
-        @test level1b_text == "normalized c"
-        
-        # level2b should preserve (explicit xml:space="preserve")
-        level2b_text = XML.value(doc10[end][2][2][1])
-        @test level2b_text == "  preserved b \n            "
-        
-        # level3 should normalize (explicit xml:space="default")
-        level3b_text = XML.value(doc10[end][2][2][2][1])
-        @test level3b_text == "normalized again b"
-        
-        # level3c should preserve (inherited from level2b)
-        level3c_text = XML.value(doc10[end][2][2][4][1])
-        @test level3c_text == "  preserved c \n            "
-        
-        # level1c should normalize (as root)
-        level1c_text = XML.value(doc10[end][3][1])
-        @test level1c_text == "normalized d"
-    end
-    @testset "inter-element gap semantics" begin
-        # Default parent: gap between siblings should be dropped
-        s1 = """<root><a> x </a>
-                <b> y </b></root>"""
-        d1 = XML.parse(XML.Node, s1)
-        @test length(d1[1]) == 2
-        @test XML.value(d1[1][1][1]) == "x"
-        @test XML.value(d1[1][2][1]) == "y"
-
-        # Preserve parent, default child ends: gap after default child dropped
-        s2 = """<root xml:space="preserve">
-                  <p> keep  </p>
-                  <q xml:space="default">  norm  </q>
-                  <r>  after default gap  </r>
-                </root>"""
-        d2 = XML.parse(XML.Node, s2)
-        @test length(d2[1]) == 7
-        @test XML.value(d2[1][1]) == "\n  "
-        @test XML.value(d2[1][2][1]) == " keep  "
-        @test XML.value(d2[1][3]) == "\n  "
-        @test XML.value(d2[1][4][1]) == "norm"
-        @test XML.value(d2[1][5]) == "\n  "
-        @test XML.value(d2[1][6][1]) == "  after default gap  "
-        @test XML.value(d2[1][7]) == "\n"
-    end
-    @testset "XML whitespace vs Unicode whitespace" begin
+
+#==============================================================================#
+#                              ESCAPE / UNESCAPE                               #
+#==============================================================================#
+@testset "escape / unescape" begin
+    # (literal => entity reference) pairs for the five predefined entities
+    ents = ("&" => "&amp;", "<" => "&lt;", ">" => "&gt;", "'" => "&apos;", "\"" => "&quot;")
+
+    @testset "all five predefined entities" begin
+        for (literal, entity) in ents
+            @test escape(literal) == entity
+        end
+    end
+
+    @testset "unescape reverses escape" begin
+        for (literal, entity) in ents
+            @test unescape(entity) == literal
+        end
+    end
+
+    @testset "roundtrip on mixed strings" begin
+        mixed = "This > string < has & some \" special ' characters"
+        @test unescape(escape(mixed)) == mixed
+    end
+
+    @testset "idempotent unescape" begin
+        plain = "plain text with no entities"
+        @test unescape(plain) == plain
+    end
+
+    @testset "multiple entities in one string" begin
+        raw, escaped = "a < b & c > d", "a &lt; b &amp; c &gt; d"
+        @test escape(raw) == escaped
+        @test unescape(escaped) == raw
+    end
+
+    @testset "empty string" begin
+        @test escape("") == ""
+        @test unescape("") == ""
+    end
+end
+
+#==============================================================================#
+#              XML 1.0 SPEC SECTION 2.1: Well-Formed XML Documents             #
+#==============================================================================#
+@testset "Spec 2.1: Well-Formed XML Documents" begin
+    # Minimal document from the spec, written here on a single line:
+    #   <?xml version="1.0"?>
+    #   <greeting>Hello, world!</greeting>
+    src = """<?xml version="1.0"?><greeting>Hello, world!</greeting>"""
+    parsed = parse(src, Node)
+    @test nodetype(parsed) == Document
+    @test length(parsed) == 2  # the declaration, then the root element
+    @test nodetype(parsed[1]) == Declaration
+    @test nodetype(parsed[2]) == Element
+    @test tag(parsed[2]) == "greeting"
+    @test simple_value(parsed[2]) == "Hello, world!"
+end
+
+#==============================================================================#
+#         XML 1.0 SPEC SECTION 2.4: Character Data and Markup                  #
+#==============================================================================#
+@testset "Spec 2.4: Character Data and Markup" begin
+    @testset "text content between tags" begin
+        parsed = parse("<root>Hello</root>", Node)
+        @test simple_value(parsed[1]) == "Hello"
+    end
+
+    @testset "entity references in text are unescaped" begin
+        parsed = parse("<root>&amp; &lt; &gt; &apos; &quot;</root>", Node)
+        @test simple_value(parsed[1]) == "& < > ' \""
+    end
+
+    @testset "mixed text and child elements" begin
+        # <p> holds three children: leading text, the <b> element, trailing text
+        para = parse("<p>Hello <b>world</b>!</p>", Node)[1]
+        @test length(para) == 3
+        @test nodetype(para[1]) == Text
+        @test value(para[1]) == "Hello "
+        @test nodetype(para[2]) == Element
+        @test tag(para[2]) == "b"
+        @test simple_value(para[2]) == "world"
+        @test nodetype(para[3]) == Text
+        @test value(para[3]) == "!"
+    end
+
+    @testset "empty element has no text" begin
+        parsed = parse("<empty/>", Node)
+        @test length(children(parsed[1])) == 0
+    end
+end
+
+#==============================================================================#
+#                    XML 1.0 SPEC SECTION 2.5: Comments                        #
+#==============================================================================#
+@testset "Spec 2.5: Comments" begin
+    @testset "basic comment (spec example)" begin
+        # Modelled on the spec example; the comment body must come back untouched
+        src = "<root><!-- declarations for <head> &amp; <body> --></root>"
+        cmt = parse(src, Node)[1][1]
+        @test nodetype(cmt) == Comment
+        @test value(cmt) == " declarations for <head> &amp; <body> "
+    end
+
+    @testset "empty comment" begin
+        # Delimiters with nothing in between still produce a Comment node
+        cmt = parse("<root><!----></root>", Node)[1][1]
+        @test nodetype(cmt) == Comment
+        @test value(cmt) == ""
+    end
+
+    @testset "comment before root element" begin
+        parsed = parse("<!-- before --><root/>", Node)
+        @test nodetype(parsed[1]) == Comment
+        @test value(parsed[1]) == " before "
+        @test nodetype(parsed[2]) == Element
+    end
+
+    @testset "comment after root element" begin
+        parsed = parse("<root/><!-- after -->", Node)
+        @test nodetype(parsed[1]) == Element
+        @test nodetype(parsed[2]) == Comment
+    end
+
+    @testset "comment with markup-like content preserved verbatim" begin
+        parsed = parse("<root><!-- <b>not</b> a tag --></root>", Node)
+        @test value(parsed[1][1]) == " <b>not</b> a tag "
+    end
+
+    @testset "multiple comments" begin
+        parsed = parse("<root><!-- A --><!-- B --></root>", Node)
+        @test length(parsed[1]) == 2
+        @test value(parsed[1][1]) == " A "
+        @test value(parsed[1][2]) == " B "
+    end
+end
+
+#==============================================================================#
+#             XML 1.0 SPEC SECTION 2.6: Processing Instructions                #
+#==============================================================================#
+@testset "Spec 2.6: Processing Instructions" begin
+    @testset "xml-stylesheet PI (spec example)" begin
+        src = """<?xml-stylesheet type="text/xsl" href="style.xsl"?><root/>"""
+        instr = parse(src, Node)[1]
+        @test nodetype(instr) == ProcessingInstruction
+        @test tag(instr) == "xml-stylesheet"
+        @test occursin("type=\"text/xsl\"", value(instr))
+    end
+
+    @testset "PI with no content" begin
+        # A bare target carries no data, so the node's value is `nothing`
+        instr = parse("<?target?><root/>", Node)[1]
+        @test nodetype(instr) == ProcessingInstruction
+        @test tag(instr) == "target"
+        @test isnothing(value(instr))
+    end
+
+    @testset "PI inside element" begin
+        # The PI is the first child of <root>
+        instr = parse("<root><?mypi some data?></root>", Node)[1][1]
+        @test nodetype(instr) == ProcessingInstruction
+        @test tag(instr) == "mypi"
+        @test value(instr) == "some data"
+    end
+
+    @testset "PI after root element" begin
+        parsed = parse("<root/><?post-process?>", Node)
+        @test nodetype(parsed[2]) == ProcessingInstruction
+        @test tag(parsed[2]) == "post-process"
+    end
+end
+
+#==============================================================================#
+#                XML 1.0 SPEC SECTION 2.7: CDATA Sections                      #
+#==============================================================================#
+@testset "Spec 2.7: CDATA Sections" begin
+    @testset "CDATA preserves markup characters" begin
+        # Spec example: markup inside the section is returned as plain characters
+        src = "<root><![CDATA[<greeting>Hello, world!</greeting>]]></root>"
+        section = parse(src, Node)[1][1]
+        @test nodetype(section) == CData
+        @test value(section) == "<greeting>Hello, world!</greeting>"
+    end
+
+    @testset "empty CDATA" begin
+        # Nothing between the delimiters still yields a CData node
+        section = parse("<root><![CDATA[]]></root>", Node)[1][1]
+        @test nodetype(section) == CData
+        @test value(section) == ""
+    end
+
+    @testset "CDATA with ampersands and less-thans" begin
+        parsed = parse("<root><![CDATA[a < b && c > d]]></root>", Node)
+        @test value(parsed[1][1]) == "a < b && c > d"
+    end
+
+    @testset "CDATA with special characters" begin
+        parsed = parse("<root><![CDATA[line1\nline2\ttab]]></root>", Node)
+        @test value(parsed[1][1]) == "line1\nline2\ttab"
+    end
+
+    @testset "CDATA mixed with text" begin
+        parsed = parse("<root>before<![CDATA[inside]]>after</root>", Node)
+        @test length(parsed[1]) == 3
+        @test nodetype(parsed[1][1]) == Text
+        @test value(parsed[1][1]) == "before"
+        @test nodetype(parsed[1][2]) == CData
+        @test value(parsed[1][2]) == "inside"
+        @test nodetype(parsed[1][3]) == Text
+        @test value(parsed[1][3]) == "after"
+    end
+end
+
+#==============================================================================#
+#        XML 1.0 SPEC SECTION 2.8: Prolog and Document Type Declaration        #
+#==============================================================================#
+@testset "Spec 2.8: Prolog and Document Type Declaration" begin
+    @testset "XML declaration - version only" begin
+        # The declaration is the first child of the parsed document
+        declnode = parse("""<?xml version="1.0"?><root/>""", Node)[1]
+        @test nodetype(declnode) == Declaration
+        @test declnode["version"] == "1.0"
+    end
+
+    @testset "XML declaration - version and encoding" begin
+        src = """<?xml version="1.0" encoding="UTF-8"?><root/>"""
+        declnode = parse(src, Node)[1]
+        @test declnode["version"] == "1.0"
+        @test declnode["encoding"] == "UTF-8"
+    end
+
+    @testset "XML declaration - all three pseudo-attributes" begin
+        src = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?><root/>"""
+        declnode = parse(src, Node)[1]
+        @test declnode["version"] == "1.0"
+        @test declnode["encoding"] == "UTF-8"
+        @test declnode["standalone"] == "yes"
+    end
+
+    @testset "XML declaration with single quotes" begin
+        parsed = parse("<?xml version='1.0'?><root/>", Node)
+        @test parsed[1]["version"] == "1.0"
+    end
+
+    @testset "no XML declaration" begin
+        parsed = parse("<root/>", Node)
+        @test length(parsed) == 1
+        @test nodetype(parsed[1]) == Element
+    end
+
+    @testset "DOCTYPE - SYSTEM" begin
+        # Spec example
+        src = """<!DOCTYPE greeting SYSTEM "hello.dtd"><greeting/>"""
+        dtdnode = parse(src, Node)[1]
+        @test nodetype(dtdnode) == DTD
+        for fragment in ("greeting", "SYSTEM", "hello.dtd")
+            @test occursin(fragment, value(dtdnode))
+        end
+    end
+
+    @testset "DOCTYPE - with internal subset" begin
+        src = """<!DOCTYPE greeting [
+  <!ELEMENT greeting (#PCDATA)>
+]><greeting>Hello, world!</greeting>"""
+        dtdnode = parse(src, Node)[1]
+        @test nodetype(dtdnode) == DTD
+        for fragment in ("greeting", "<!ELEMENT")
+            @test occursin(fragment, value(dtdnode))
+        end
+    end
+
+    @testset "DOCTYPE with entities (spec-like)" begin
+        src = """<!DOCTYPE note [
+<!ENTITY nbsp "&#xA0;">
+<!ENTITY writer "Writer: Donald Duck.">
+<!ENTITY copyright "Copyright: W3Schools.">
+]><note/>"""
+        parsed = parse(src, Node)
+        @test nodetype(parsed[1]) == DTD
+        @test occursin("ENTITY", value(parsed[1]))
+    end
+
+    @testset "full prolog: declaration + DOCTYPE" begin
+        src = """<?xml version="1.0"?><!DOCTYPE root SYSTEM "root.dtd"><root/>"""
+        parsed = parse(src, Node)
+        @test nodetype(parsed[1]) == Declaration
+        @test nodetype(parsed[2]) == DTD
+        @test nodetype(parsed[3]) == Element
+    end
+end
+
+#==============================================================================#
+#          XML 1.0 SPEC SECTION 2.9: Standalone Document Declaration           #
+#==============================================================================#
+@testset "Spec 2.9: Standalone Document Declaration" begin
+    standalone_doc = parse("""<?xml version="1.0" standalone="yes"?><root/>""", Node)
+    @test standalone_doc[1]["standalone"] == "yes"
+
+    dependent_doc = parse("""<?xml version="1.0" standalone="no"?><root/>""", Node)
+    @test dependent_doc[1]["standalone"] == "no"
+end
+
+#==============================================================================#
+#              XML 1.0 SPEC SECTION 2.10: White Space Handling                 #
+#==============================================================================#
+@testset "Spec 2.10: White Space Handling" begin
+    @testset "parser preserves all text content verbatim" begin
+        doc = parse("<root>  hello  </root>", Node)
+        @test simple_value(doc[1]) == "  hello  "
+    end
+
+    @testset "parser preserves whitespace-only text" begin
+        doc = parse("<root>   </root>", Node)
+        @test simple_value(doc[1]) == "   "
+    end
+
+    @testset "parser preserves inter-element whitespace as Text nodes" begin
+        xml = "<root><a>x</a>\n  <b>y</b></root>"
+        doc = parse(xml, Node)
+        @test length(doc[1]) == 3
+        @test value(doc[1][1][1]) == "x"
+        @test nodetype(doc[1][2]) == Text
+        @test value(doc[1][2]) == "\n  "
+        @test value(doc[1][3][1]) == "y"
+    end
+
+    @testset "xml:space attribute is preserved during parsing" begin
+        doc = parse("""<root xml:space="preserve"><child>  text  </child></root>""", Node)
+        @test doc[1]["xml:space"] == "preserve"
+        @test value(doc[1][1][1]) == "  text  "
+    end
+
+    @testset "xml:space='preserve' affects write formatting" begin
+        # When xml:space="preserve", writer doesn't add indentation
+        el = Element("s", XML.Text(" pre "), Element("t"), XML.Text(" post "); var"xml:space"="preserve")
+        @test XML.write(el) == "<s xml:space=\"preserve\"> pre <t/> post </s>"
+    end
+
+    @testset "write formats with indentation by default" begin
+        el = Element("root", Element("a"), Element("b"))
+        s = XML.write(el)
+        @test contains(s, "  <a/>")  # indented
+        @test contains(s, "  <b/>")  # indented
+    end
+
+    @testset "Unicode non-breaking space is NOT XML whitespace" begin
         nbsp = "\u00A0"
-        s = """<root>
-                 <a>  x\t\n  </a>
-                 <b>$(nbsp) y $(nbsp)</b>
-                 <c xml:space="default">$(nbsp)  z  $(nbsp)</c>
-               </root>"""
-        d = XML.parse(XML.Node, s)
-        @test XML.value(d[1][1][1]) == "x"
-        @test XML.value(d[1][2][1]) == "$(nbsp) y $(nbsp)"
-        @test XML.value(d[1][3][1]) == "$(nbsp)  z  $(nbsp)"
-    end
-
-    @testset "CDATA/Comment/PI boundaries" begin
-        s = """<root>
-                 <a xml:space="default">  pre  <![CDATA[  mid  ]]>  post  </a>
-                 <b xml:space="preserve">  pre  <!-- cmt -->  post  </b>
-                 <?xml-stylesheet type="text/css" href="style.css"?>
-               </root>"""
-        d = XML.parse(XML.Node, s)
-        @test XML.value(d[1][1][1]) == "pre"
-        @test nodetype(d[1][1][2]) == XML.CData
-        @test XML.value(d[1][1][3]) == "post"
-        @test XML.value(d[1][2][1]) == "  pre  "
-        @test nodetype(d[1][2][2]) == XML.Comment
-        @test XML.value(d[1][2][3]) == "  post  "
-        @test nodetype(d[1][3]) == XML.ProcessingInstruction
-    end
-
-    @testset "nested toggles and sibling sequences" begin
-        s = """<root xml:space="preserve">
-                 <x>  a  
-                   <y xml:space="default">  b  
-                     <z xml:space="preserve">  c  </z>
-                   </y>
-                   <y2 xml:space="default">  d  </y2>
-                   <w>  e  </w>
-                 </x>
-               </root>"""
-        d = XML.parse(XML.Node, s)
-        @test XML.value(d[1][2][1]) == "  a  \n    "
-        @test XML.value(d[1][2][2][1]) == "b"
-        @test XML.value(d[1][2][2][2][1]) == "  c  "
-        @test d[1][2][4].tag == "y2"
-        @test XML.value(d[1][2][4][1]) == "d"
-        @test d[1][2][6].tag == "w"
-        @test XML.value(d[1][2][6][1]) == "  e  "
-    end
-
-    @testset "root/document boundaries" begin
-        s = "\n  \n<root>  a  </root>\n \t "
-        d = XML.parse(XML.Node, s)
-        @test length(d) == 1
-        @test XML.value(d[1][1]) == "a"
-    end
-
-    @testset "entities expanding to whitespace" begin
-        chr1="\u0020"
-        chr2="\u000A"
-        chr3="\u00A0"
-        
-        s = """<root>
-                 <a> $(chr1) a $(chr2) </a>
-                 <b xml:space="preserve">$(chr1) b $(chr2)</b>
-                 <c>$(chr3)c$(chr3)</c>
-               </root>"""
-        d = XML.parse(XML.Node, s)
-        @test XML.value(d[1][1][1]) == "a"
-        @test XML.value(d[1][2][1]) == "  b \n"
-        @test XML.value(d[1][3][1]) == "$(chr3)c$(chr3)"
-    end
-
-    @testset "invalid values and placement" begin
-        s_bad = """<root><x xml:space="weird"> t </x></root>"""
-        @test_throws ErrorException XML.parse(XML.Node, s_bad)
-
-        s_pi = """<?pi xml:space="preserve"?><root> t </root>"""
-        d = XML.parse(XML.Node, s_pi)
-        @test XML.value(d[end][1]) == "t"
-
-        s_dup = """<root><x xml:space="preserve" xml:space="default">  t  </x></root>"""
-#        @test_throws ErrorException XML.parse(XML.Node, s_dup)
-    end
-
-    @testset "prev()/next() symmetry" begin
-        xml = """<root xml:space="preserve">
-                    <a>  a  <b xml:space="default">  b  </b>  <c>  c  </c>  </a>
-                    <d xml:space="default">  d  <e xml:space="preserve">  e  </e>  f  </d>
-                    <g><h/><i xml:space="preserve">  i  </i><j/></g>
-                 </root>"""
-        r = XML.parse(XML.LazyNode, xml).raw
-        toks=XML.Raw[]
-        while true
-            n = XML.next(r)
-            n === nothing && break
-            push!(toks, n)
-            r=n
-        end
-        back = XML.Raw[]
-        r = toks[end]
-        while true
-            p = XML.prev(r)
-            p === nothing && break
-            push!(back, p)
-            r = p
-        end
-        @test reverse(back)[2:end] == toks[1:end-1]
-    end
-
-    @testset "write/read roundtrip extremes" begin
-        xml = """<root>
-                   <p xml:space="preserve">    </p>
-                   <q>   </q>
-                   <r xml:space="default">  r  </r>
-                   <s xml:space="preserve"> pre <t/> post </s>
-                 </root>"""
-        n = XML.parse(XML.Node, xml)
-        io = IOBuffer(); XML.write(io, n)
-        n2 = XML.parse(XML.Node, String(take!(io)))
-        @test n == n2
-        @test XML.write(n2[1][1]) == "<p xml:space=\"preserve\">    </p>"
-        @test XML.write(n2[1][2]) == "<q/>"
-        @test XML.value(n2[1][3][1]) == "r"
-        @test XML.write(n2[1][4]) == "<s xml:space=\"preserve\"> pre <t/> post </s>"
-   end
-
-    @testset "self-closing/empty/whitespace-only children" begin
-        s = """<root>
-                 <a xml:space="default">    </a>
-                 <b xml:space="preserve"></b>
-                 <c xml:space="preserve">   </c>
-                 <d><e/></d>
-                 <f> x <g/> y </f>
-               </root>"""
-        d = XML.parse(XML.Node, s)
-        @test XML.write(d[1][1]) == "<a xml:space=\"default\"/>"
-        @test XML.write(d[1][2]) == "<b xml:space=\"preserve\"/>"
-        @test XML.value(d[1][3][1]) == "   "
-        @test XML.value(d[1][5][1]) == "x"
-        @test XML.value(d[1][5][3]) == "y"
-    end
-
-    @testset "allocation guard: small xml:space doc" begin
-        xml = "<root><a xml:space=\"default\"> x </a><b xml:space=\"preserve\"> y </b></root>"
-        f() = XML.parse(XML.Node, xml)
-        a = @allocated f()
-        @test a < 500_000  # tune for CI
-    end
-
-end
-
-#-----------------------------------------------------------------------------# roundtrip
-@testset "read/write/read roundtrip" begin
+        xml = "<root>$(nbsp) y $(nbsp)</root>"  # NBSP (U+00A0) on each side of " y "
+        doc = parse(xml, Node)
+        @test simple_value(doc[1]) == "$(nbsp) y $(nbsp)"  # the NBSPs must come back untouched
+    end
+end
+
+#==============================================================================#
+#       XML 1.0 SPEC SECTION 3.1: Start-Tags, End-Tags, Empty-Element Tags     #
+#==============================================================================#
+@testset "Spec 3.1: Start-Tags, End-Tags, Empty-Element Tags" begin
+    @testset "element with attributes (spec example)" begin
+        # Start-tag example from the spec:
+        #   <termdef id="dt-dog" term="dog">
+        termdef = parse("""<termdef id="dt-dog" term="dog">A dog.</termdef>""", Node)[1]
+        @test tag(termdef) == "termdef"
+        @test termdef["id"] == "dt-dog"
+        @test termdef["term"] == "dog"
+        @test value(termdef[1]) == "A dog."
+    end
+
+    @testset "self-closing tag (spec example)" begin
+        # Empty-element tag example from the spec:
+        #   <IMG align="left" src="http://www.w3.org/Icons/WWW/w3c_home"/>
+        img = parse("""<IMG align="left" src="http://www.w3.org/Icons/WWW/w3c_home"/>""", Node)[1]
+        @test tag(img) == "IMG"
+        @test img["align"] == "left"
+        @test img["src"] == "http://www.w3.org/Icons/WWW/w3c_home"
+        @test length(children(img)) == 0
+    end
+
+    @testset "simple self-closing tag" begin
+        parsed = parse("<br/>", Node)
+        @test tag(parsed[1]) == "br"
+        @test length(children(parsed[1])) == 0  # an empty-element tag has no children
+    end
+
+    @testset "self-closing tag with space before />" begin
+        parsed = parse("<br />", Node)
+        @test tag(parsed[1]) == "br"  # the space before "/>" is not part of the name
+    end
+
+    @testset "empty element with start and end tag" begin
+        # <empty></empty> has no content; the test pins the `children` field to `nothing`.
+        empty_el = parse("<empty></empty>", Node)[1]
+        @test tag(empty_el) == "empty"
+        @test isnothing(empty_el.children)
+    end
+
+    @testset "nested elements" begin
+        parsed = parse("<a><b><c/></b></a>", Node)
+        @test tag(parsed[1]) == "a"
+        @test tag(parsed[1][1]) == "b"
+        @test tag(parsed[1][1][1]) == "c"
+    end
+
+    @testset "sibling elements" begin
+        parsed = parse("<root><a/><b/><c/></root>", Node)
+        @test length(parsed[1]) == 3
+        @test tag(parsed[1][1]) == "a"
+        @test tag(parsed[1][2]) == "b"
+        @test tag(parsed[1][3]) == "c"
+    end
+
+    @testset "attributes with single quotes" begin
+        parsed = parse("<x a='val'/>", Node)
+        @test parsed[1]["a"] == "val"  # quotes are delimiters, not part of the value
+    end
+
+    @testset "attributes with double quotes" begin
+        parsed = parse("""<x a="val"/>""", Node)
+        @test parsed[1]["a"] == "val"  # quotes are delimiters, not part of the value
+    end
+
+    @testset "mixed quote styles in attributes" begin
+        parsed = parse("""<x a="1" b='2'/>""", Node)
+        @test parsed[1]["a"] == "1"
+        @test parsed[1]["b"] == "2"
+    end
+
+    @testset "attribute with > in value" begin
+        parsed = parse("""<x a="1>2"/>""", Node)
+        @test parsed[1]["a"] == "1>2"  # a '>' inside quotes must not end the tag
+    end
+
+    @testset "attribute with entity reference" begin
+        parsed = parse("""<x a="a&amp;b"/>""", Node)
+        @test parsed[1]["a"] == "a&b"  # &amp; is decoded on attribute access
+    end
+
+    @testset "multiple attributes accessible via attributes()" begin
+        parsed = parse("""<x first="1" second="2" third="3"/>""", Node)
+        attr_map = attributes(parsed[1])
+        @test attr_map isa Dict
+        @test attr_map["first"] == "1"
+        @test attr_map["second"] == "2"
+        @test attr_map["third"] == "3"
+    end
+
+    @testset "whitespace around = in attributes" begin
+        parsed = parse("""<x a = "1" />""", Node)
+        @test parsed[1]["a"] == "1"  # spaces around '=' are tolerated
+    end
+end
+
+#==============================================================================#
+#                  XML 1.0 SPEC SECTION 4.1: Entity References                 #
+#==============================================================================#
+@testset "Spec 4.1: Character and Entity References" begin
+    @testset "predefined entity references in text" begin
+        # The five predefined entities, each checked as the sole content of <root>,
+        # in the order: lt, gt, amp, apos, quot.
+        predefined = [
+            "&lt;" => "<",
+            "&gt;" => ">",
+            "&amp;" => "&",
+            "&apos;" => "'",
+            "&quot;" => "\"",
+        ]
+
+        for (ref, literal) in predefined
+            doc = parse("<root>$(ref)</root>", Node)
+            @test simple_value(doc[1]) == literal
+        end
+    end
+
+    @testset "predefined entities in attribute values" begin
+        parsed = parse("""<x a="&lt;&gt;&amp;&apos;&quot;"/>""", Node)
+        @test parsed[1]["a"] == "<>&'\""  # all five decoded, in source order
+    end
+
+    @testset "multiple entity references in one text node" begin
+        parsed = parse("<root>&lt;tag&gt; &amp; &quot;value&quot;</root>", Node)
+        @test simple_value(parsed[1]) == "<tag> & \"value\""  # literal text between them is kept
+    end
+end
+
+#==============================================================================#
+#                  NAMESPACES (Colon in Tag and Attribute Names)                #
+#==============================================================================#
+@testset "Namespaces" begin
+    @testset "namespaced element" begin
+        parsed = parse("""<ns:root xmlns:ns="http://example.com"><ns:child/></ns:root>""", Node)
+        @test tag(parsed[1]) == "ns:root"  # the prefix stays part of the tag name
+        @test parsed[1]["xmlns:ns"] == "http://example.com"
+        @test tag(parsed[1][1]) == "ns:child"
+    end
+
+    @testset "default namespace" begin
+        parsed = parse("""<root xmlns="http://example.com"/>""", Node)
+        @test parsed[1]["xmlns"] == "http://example.com"  # readable like any other attribute
+    end
+
+    @testset "multiple namespace prefixes" begin
+        src = """<root xmlns:a="http://a.com" xmlns:b="http://b.com"><a:x/><b:y/></root>"""
+        parsed = parse(src, Node)
+        @test tag(parsed[1][1]) == "a:x"
+        @test tag(parsed[1][2]) == "b:y"
+    end
+end
+
+#==============================================================================#
+#                           NODE CONSTRUCTORS                                  #
+#==============================================================================#
+@testset "Node Constructors" begin
+    @testset "Text" begin
+        txt = Text("hello")
+        @test nodetype(txt) == Text
+        @test value(txt) == "hello"
+        @test tag(txt) === nothing  # a Text node has no tag
+        @test attributes(txt) === nothing  # ...and no attributes
+    end
+
+    @testset "Comment" begin
+        cmt = Comment(" a comment ")
+        @test nodetype(cmt) == Comment
+        @test value(cmt) == " a comment "  # surrounding spaces are kept
+    end
+
+    @testset "CData" begin
+        cdata = CData("raw <data>")  # deliberately not named `cd`, which would shadow Base.cd
+        @test nodetype(cdata) == CData
+        @test value(cdata) == "raw <data>"
+    end
+
+    @testset "DTD" begin
+        dtd = DTD("html")
+        @test nodetype(dtd) == DTD
+        @test value(dtd) == "html"
+    end
+
+    @testset "Declaration" begin
+        decl = Declaration(; version="1.0", encoding="UTF-8")  # keyword args become attributes
+        @test nodetype(decl) == Declaration
+        @test decl["version"] == "1.0"
+        @test decl["encoding"] == "UTF-8"
+    end
+
+    @testset "Declaration with no attributes" begin
+        decl = Declaration()
+        @test nodetype(decl) == Declaration
+        @test attributes(decl) === nothing  # no keywords given, so no attribute container
+    end
+
+    @testset "ProcessingInstruction with content" begin
+        instr = ProcessingInstruction("target", "data here")  # not `pi`: avoids shadowing Base.pi
+        @test nodetype(instr) == ProcessingInstruction
+        @test tag(instr) == "target"
+        @test value(instr) == "data here"
+    end
+
+    @testset "ProcessingInstruction without content" begin
+        instr = ProcessingInstruction("target")
+        @test nodetype(instr) == ProcessingInstruction
+        @test tag(instr) == "target"
+        @test value(instr) === nothing  # no content argument, so no value
+    end
+
+    @testset "Element with tag only" begin
+        el = Element("div")
+        @test nodetype(el) == Element
+        @test tag(el) == "div"
+        @test length(children(el)) == 0
+    end
+
+    @testset "Element with children" begin
+        el = Element("div", Text("hello"), Element("span"))  # positional args after the tag are children
+        @test length(el) == 2
+        @test nodetype(el[1]) == Text
+        @test nodetype(el[2]) == Element
+    end
+
+    @testset "Element with attributes" begin
+        el = Element("div"; class="main", id="content")  # keyword args are attributes
+        @test el["class"] == "main"
+        @test el["id"] == "content"
+    end
+
+    @testset "Element with children and attributes" begin
+        el = Element("a", "click here"; href="http://example.com")
+        @test tag(el) == "a"
+        @test el["href"] == "http://example.com"
+        @test value(el[1]) == "click here"
+    end
+
+    @testset "Element auto-converts non-Node children to Text" begin
+        el = Element("p", 42)
+        @test nodetype(el[1]) == Text
+        @test value(el[1]) == "42"  # the integer is stringified
+    end
+
+    @testset "Document" begin
+        # A Document built from two top-level nodes keeps them in the order given.
+        decl = Declaration(; version="1.0")
+        root = Element("root")
+        doc = Document(decl, root)
+        @test nodetype(doc) == Document
+        @test length(doc) == 2
+        @test nodetype(doc[1]) == Declaration
+        @test nodetype(doc[2]) == Element
+    end
+
+    @testset "Document with all node types" begin
+        doc = Document(
+            Declaration(; version="1.0"),
+            DTD("root"),
+            Comment("comment"),
+            ProcessingInstruction("pi", "data"),
+            Element("root", CData("cdata"), Text("text"))
+        )
+        @test map(nodetype, children(doc)) == [Declaration, DTD, Comment, ProcessingInstruction, Element]
+        @test length(doc[end]) == 2  # the root element holds the CData and the Text
+        @test nodetype(doc[end][1]) == CData
+        @test value(doc[end][1]) == "cdata"
+        @test nodetype(doc[end][2]) == Text
+        @test value(doc[end][2]) == "text"
+    end
+
+    @testset "invalid constructions" begin
+        @test_throws Exception Text("a", "b")               # too many args
+        @test_throws Exception Comment("a"; x="1")           # no attrs
+        @test_throws Exception CData("a"; x="1")             # no attrs
+        @test_throws Exception DTD("a"; x="1")               # no attrs
+        @test_throws Exception Element()                      # need tag
+        @test_throws Exception Declaration("bad")             # no positional args
+        @test_throws Exception Document(; x="1")              # no attrs
+        @test_throws Exception ProcessingInstruction()        # need target
+        @test_throws Exception ProcessingInstruction("a", "b", "c")  # too many args
+    end
+end
+
+#==============================================================================#
+#                        h CONSTRUCTOR                                         #
+#==============================================================================#
+@testset "h constructor" begin
+    @testset "h(tag)" begin
+        node = h("div")
+        @test nodetype(node) == Element
+        @test tag(node) == "div"
+    end
+
+    @testset "h(tag, children...)" begin
+        node = h("div", "hello")
+        @test simple_value(node) == "hello"
+    end
+
+    @testset "h(tag; attrs...)" begin
+        node = h("div"; class="main")
+        @test node["class"] == "main"
+    end
+
+    @testset "h(tag, children...; attrs...)" begin
+        node = h("div", "hello"; class="main")
+        @test node["class"] == "main"
+        @test value(node[1]) == "hello"
+    end
+
+    @testset "h.tag syntax" begin
+        node = h.div("hello"; class="main")  # property name becomes the tag
+        @test tag(node) == "div"
+        @test node["class"] == "main"
+        @test value(node[1]) == "hello"
+    end
+
+    @testset "h.tag with no args" begin
+        node = h.br()
+        @test tag(node) == "br"
+        @test length(children(node)) == 0
+    end
+
+    @testset "h.tag with only attrs" begin
+        node = h.img(; src="image.png")
+        @test tag(node) == "img"
+        @test node["src"] == "image.png"
+    end
+
+    @testset "nested h constructors" begin
+        # Build the two children first, then wrap them in a <div>.
+        heading = h.h1("Title")
+        para = h.p("Paragraph")
+        page = h.div(heading, para)
+        @test tag(page) == "div"
+        @test length(page) == 2
+        @test tag(page[1]) == "h1"
+        @test tag(page[2]) == "p"
+    end
+
+    @testset "h with symbol tag" begin
+        node = h(:div)
+        @test tag(node) == "div"  # a Symbol tag reads back as a String
+    end
+end
+
+#==============================================================================#
+#                        NODE INTERFACE                                        #
+#==============================================================================#
+@testset "Node Interface" begin
+    tree = parse("""<?xml version="1.0"?><root attr="val"><child>text</child></root>""", Node)
+
+    @testset "nodetype" begin
+        @test nodetype(tree) == Document
+        @test nodetype(tree[1]) == Declaration
+        @test nodetype(tree[2]) == Element
+    end
+
+    @testset "tag" begin
+        @test tag(tree) === nothing  # the Document itself has no tag
+        @test tag(tree[2]) == "root"
+        @test tag(tree[2][1]) == "child"
+    end
+
+    @testset "attributes" begin
+        @test attributes(tree) === nothing  # the Document itself has no attributes
+        @test attributes(tree[2])["attr"] == "val"
+    end
+
+    @testset "value" begin
+        @test value(tree) === nothing  # the Document itself has no value
+        @test value(tree[2][1][1]) == "text"
+    end
+
+    @testset "children" begin
+        @test length(children(tree)) == 2  # declaration + root
+        @test length(children(tree[2])) == 1  # just <child>
+    end
+
+    @testset "is_simple" begin
+        @test is_simple(tree[2][1]) == true  # <child> holds a single text node
+        @test is_simple(tree[2]) == false  # <root> holds an element
+    end
+
+    @testset "simple_value" begin
+        @test simple_value(tree[2][1]) == "text"
+        @test_throws ErrorException simple_value(tree[2])
+    end
+
+    @testset "simple_value for CData child" begin
+        wrapper = Element("x", CData("data"))
+        @test is_simple(wrapper)
+        @test simple_value(wrapper) == "data"
+    end
+end
+
+#==============================================================================#
+#                        NODE INDEXING                                          #
+#==============================================================================#
+@testset "Node Indexing" begin
+    doc = parse("<root><a/><b/><c/></root>", Node)
+    root = doc[1]  # shared fixture: <root> with children <a/>, <b/>, <c/>
+
+    @testset "integer indexing" begin
+        @test tag(root[1]) == "a"
+        @test tag(root[2]) == "b"
+        @test tag(root[3]) == "c"
+    end
+
+    @testset "colon indexing" begin
+        kids = root[:]  # not named `all`: that would shadow Base.all
+        @test length(kids) == 3
+    end
+
+    @testset "lastindex" begin
+        @test tag(root[end]) == "c"  # `end` resolves to the last child
+    end
+
+    @testset "only" begin
+        single = parse("<root><only/></root>", Node)
+        @test tag(only(single[1])) == "only"
+    end
+
+    @testset "length" begin
+        @test length(root) == 3  # length counts children
+    end
+
+    @testset "attribute indexing" begin
+        el = parse("""<x a="1" b="2"/>""", Node)[1]
+        @test el["a"] == "1"  # String keys index attributes, Int keys index children
+        @test el["b"] == "2"
+        @test_throws KeyError el["nonexistent"]
+    end
+
+    @testset "haskey" begin
+        el = parse("""<x a="1"/>""", Node)[1]
+        @test haskey(el, "a") == true
+        @test haskey(el, "b") == false
+    end
+
+    @testset "keys" begin
+        el = parse("""<x a="1" b="2"/>""", Node)[1]
+        @test collect(keys(el)) == ["a", "b"]  # pins the key order to the source order
+    end
+
+    @testset "keys on element with no attributes" begin
+        el = parse("<x/>", Node)[1]
+        @test isempty(keys(el))
+    end
+end
+
+#==============================================================================#
+#                        NODE MUTATION                                         #
+#==============================================================================#
+@testset "Node Mutation" begin
+    @testset "setindex! child" begin
+        node = Element("root", Element("old"))
+        node[1] = Element("new")
+        @test tag(node[1]) == "new"
+    end
+
+    @testset "setindex! child with auto-conversion" begin
+        node = Element("root", Text("old"))
+        node[1] = "new text"  # a plain string is stored as a node with that value
+        @test value(node[1]) == "new text"
+    end
+
+    @testset "setindex! attribute" begin
+        node = Element("root"; a="1")
+        node["a"] = "2"  # overwrite an existing attribute
+        @test node["a"] == "2"
+    end
+
+    @testset "setindex! new attribute" begin
+        node = Element("root"; a="1")
+        node["b"] = "2"  # add an attribute that was not there before
+        @test node["b"] == "2"
+    end
+
+    @testset "push! child" begin
+        node = Element("root")
+        push!(node, Element("child"))
+        @test length(node) == 1
+        @test tag(node[1]) == "child"
+    end
+
+    @testset "push! with auto-conversion" begin
+        node = Element("root")
+        push!(node, "text")
+        @test nodetype(node[1]) == Text
+        @test value(node[1]) == "text"
+    end
+
+    @testset "pushfirst! child" begin
+        node = Element("root", Element("second"))
+        pushfirst!(node, Element("first"))
+        @test tag(node[1]) == "first"
+        @test tag(node[2]) == "second"
+    end
+
+    @testset "push! on non-container node errors" begin
+        leaf = Text("hello")
+        @test_throws ErrorException push!(leaf, "more")
+    end
+end
+
+#==============================================================================#
+#                        NODE EQUALITY                                         #
+#==============================================================================#
+@testset "Node Equality" begin
+    @testset "identical elements are equal" begin
+        lhs = Element("div", Text("hello"); class="main")
+        rhs = Element("div", Text("hello"); class="main")
+        @test lhs == rhs  # two separately built, structurally equal trees
+    end
+
+    @testset "different tag names are not equal" begin
+        @test Element("a") != Element("b")
+    end
+
+    @testset "different attributes are not equal" begin
+        @test Element("a"; x="1") != Element("a"; x="2")
+    end
+
+    @testset "different children are not equal" begin
+        @test Element("a", Text("x")) != Element("a", Text("y"))
+    end
+
+    @testset "different node types are not equal" begin
+        @test Text("x") != Comment("x")
+    end
+
+    @testset "empty attributes vs nothing" begin
+        lhs = Element("a")  # NOTE(review): both sides are built identically; presumably one
+        rhs = Element("a")  # was meant to carry an empty attribute container — confirm
+        @test lhs == rhs
+    end
+
+    @testset "parse equality" begin
+        src = "<root><child>text</child></root>"
+        @test parse(src, Node) == parse(src, Node)
+    end
+end
+
+#==============================================================================#
+#                        XML WRITING                                           #
+#==============================================================================#
+@testset "XML Writing" begin
+    @testset "write Text" begin
+        el = Element("p", "hello & goodbye")
+        @test XML.write(el) == "<p>hello &amp; goodbye</p>"  # '&' is escaped on output
+    end
+
+    @testset "write Element with attributes" begin
+        el = Element("div"; class="main", id="content")
+        s = XML.write(el)
+        @test contains(s, "<div")
+        @test contains(s, "class=\"main\"")
+        @test contains(s, "id=\"content\"")
+        @test contains(s, "/>")  # no children, so the tag self-closes
+    end
+
+    @testset "write self-closing element" begin
+        @test XML.write(Element("br")) == "<br/>"
+    end
+
+    @testset "write element with single text child (inline)" begin
+        @test XML.write(Element("p", "hello")) == "<p>hello</p>"
+    end
+
+    @testset "write element with multiple children (indented)" begin
+        el = Element("div", Element("a"), Element("b"))
+        s = XML.write(el)
+        @test contains(s, "<div>")
+        @test contains(s, "  <a/>")  # children are indented under the parent
+        @test contains(s, "  <b/>")
+        @test contains(s, "</div>")
+    end
+
+    @testset "write Comment" begin
+        el = Element("root", Comment(" comment "))
+        @test contains(XML.write(el), "<!-- comment -->")
+    end
+
+    @testset "write CData" begin
+        el = Element("root", CData("raw <data>"))
+        @test contains(XML.write(el), "<![CDATA[raw <data>]]>")  # content is not escaped
+    end
+
+    @testset "write ProcessingInstruction with content" begin
+        instr = ProcessingInstruction("target", "data")  # not `pi`: avoids shadowing Base.pi
+        @test XML.write(instr) == "<?target data?>"
+    end
+
+    @testset "write ProcessingInstruction without content" begin
+        instr = ProcessingInstruction("target")
+        @test XML.write(instr) == "<?target?>"  # no trailing space when there is no content
+    end
+
+    @testset "write Declaration" begin
+        decl = Declaration(; version="1.0", encoding="UTF-8")
+        s = XML.write(decl)
+        @test contains(s, "<?xml")
+        @test contains(s, "version=\"1.0\"")
+        @test contains(s, "encoding=\"UTF-8\"")
+        @test contains(s, "?>")
+    end
+
+    @testset "write DTD" begin
+        dtd = DTD("html")
+        @test XML.write(dtd) == "<!DOCTYPE html>"
+    end
+
+    @testset "write Document" begin
+        doc = Document(Declaration(; version="1.0"), Element("root"))
+        s = XML.write(doc)
+        @test startswith(s, "<?xml")  # the declaration is written first
+        @test contains(s, "<root/>")
+    end
+
+    @testset "write escapes special characters in text" begin
+        el = Element("p", "a < b & c > d")
+        @test XML.write(el) == "<p>a &lt; b &amp; c &gt; d</p>"
+    end
+
+    @testset "write escapes special characters in attribute values" begin
+        el = Element("x"; a="a\"b")
+        @test contains(XML.write(el), "a=\"a&quot;b\"")  # the embedded quote is escaped
+    end
+
+    @testset "indentsize parameter" begin
+        el = Element("root", Element("child"))
+        s2 = XML.write(el; indentsize=2)
+        s4 = XML.write(el; indentsize=4)
+        @test contains(s2, "  <child/>")
+        @test contains(s4, "    <child/>")
+    end
+
+    @testset "write xml:space='preserve' respects whitespace" begin
+        el = Element("root", Element("p", Text("  hello  "); var"xml:space"="preserve"))
+        s = XML.write(el)
+        @test contains(s, ">  hello  </p>")  # padding inside <p> is written verbatim
+    end
+end
+
+#==============================================================================#
+#                 WRITE TO FILE / READ FROM FILE                               #
+#==============================================================================#
+@testset "File I/O" begin
+    @testset "write and read back" begin
+        doc = Document(Declaration(; version="1.0"), Element("root", Element("child", "text")))
+        # mktempdir's do-block form removes the directory (and the file in it) even when
+        # XML.write or read throws, so an erroring run does not leave a temp file behind.
+        mktempdir() do dir
+            temp = joinpath(dir, "roundtrip.xml")
+            XML.write(temp, doc)
+            content = read(temp, String)
+            @test contains(content, "<?xml")
+            @test contains(content, "<root>")
+            @test contains(content, "<child>text</child>")
+            doc2 = read(temp, Node)
+            @test nodetype(doc2) == Document
+            # Walk down to <child> via the first Element child at each level.
+            root = first(filter(x -> nodetype(x) == Element, children(doc2)))
+            child = first(filter(x -> nodetype(x) == Element, children(root)))
+            @test tag(child) == "child"
+            @test simple_value(child) == "text"
+        end
+    end
+
+    @testset "read from IO" begin
+        xml = """<?xml version="1.0"?><root>hello</root>"""
+        doc = read(IOBuffer(xml), Node)  # reads from an in-memory stream instead of a path
+        @test nodetype(doc) == Document
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        @test simple_value(root) == "hello"
+    end
+end
+
+#==============================================================================#
+#                        PARSE → WRITE → PARSE ROUNDTRIP                       #
+#==============================================================================#
+@testset "Roundtrip: parse → write preserves semantics" begin
+    # Shared helpers. `reparse` runs parse → write → parse; every check below is made
+    # against the re-parsed document it returns.
+    reparse(src) = parse(XML.write(parse(src, Node)), Node)
+    # Direct children of `n` that are Elements (other node types are skipped).
+    elements(n) = filter(x -> nodetype(x) == Element, children(n))
+
+    @testset "declaration and root" begin
+        # Exactly one declaration and one root element must come back.
+        xml = """<?xml version="1.0"?><root/>"""
+        doc2 = reparse(xml)
+        decls = filter(x -> nodetype(x) == Declaration, children(doc2))
+        @test length(decls) == 1
+        @test decls[1]["version"] == "1.0"
+        roots = elements(doc2)
+        @test length(roots) == 1
+        @test tag(roots[1]) == "root"
+    end
+
+    @testset "element with attributes and text" begin
+        # <child> must keep its tag, its attribute and a Text child equal to "text".
+        xml = """<root><child attr="val">text</child></root>"""
+        doc2 = reparse(xml)
+        root = first(elements(doc2))
+        child = first(elements(root))
+        @test tag(child) == "child"
+        @test child["attr"] == "val"
+        text_children = filter(x -> nodetype(x) == Text, children(child))
+        @test any(t -> value(t) == "text", text_children)
+    end
+
+    @testset "all special node types survive roundtrip" begin
+        # Only presence is checked; Text children are filtered out first.
+        xml = """<root><!-- comment --><![CDATA[data]]><?pi content?></root>"""
+        doc2 = reparse(xml)
+        root = first(elements(doc2))
+        types = map(nodetype, filter(x -> nodetype(x) != Text, children(root)))
+        @test Comment in types
+        @test CData in types
+        @test ProcessingInstruction in types
+    end
+
+    @testset "DOCTYPE survives roundtrip" begin
+        # The DTD node must appear once, with its value unchanged.
+        xml = """<!DOCTYPE html><html><body/></html>"""
+        doc2 = reparse(xml)
+        dtds = filter(x -> nodetype(x) == DTD, children(doc2))
+        @test length(dtds) == 1
+        @test value(dtds[1]) == "html"
+    end
+
+    @testset "namespace attributes survive roundtrip" begin
+        # Both the xmlns:ns declaration and the prefixed child tag must be intact.
+        xml = """<root xmlns:ns="http://example.com"><ns:child/></root>"""
+        doc2 = reparse(xml)
+        root = first(elements(doc2))
+        @test root["xmlns:ns"] == "http://example.com"
+        child = first(elements(root))
+        @test tag(child) == "ns:child"
+    end
+
+    @testset "mixed content survives roundtrip" begin
+        # Whitespace-only Text nodes are dropped before the text and <b> checks.
+        xml = """<p>Hello <b>world</b>!</p>"""
+        doc2 = reparse(xml)
+        root = first(elements(doc2))
+        non_ws = filter(x -> !(nodetype(x) == Text && isempty(strip(value(x)))), children(root))
+        texts = [value(x) for x in non_ws if nodetype(x) == Text]
+        @test any(t -> contains(t, "Hello"), texts)
+        @test any(t -> contains(t, "!"), texts)
+        bolds = filter(x -> nodetype(x) == Element && tag(x) == "b", non_ws)
+        @test length(bolds) == 1
+        @test simple_value(bolds[1]) == "world"
+    end
+end
+
+@testset "Roundtrip: file-based semantic preservation" begin
+    all_files = filter(isfile, [
+        joinpath(@__DIR__, "data", "xml.xsd"),
+        joinpath(@__DIR__, "data", "kml.xsd"),
+        joinpath(@__DIR__, "data", "books.xml"),
+        # example.kml uses invalid <![CData[...]]> (lowercase), skip it
+        joinpath(@__DIR__, "data", "simple_dtd.xml"),
+    ])
+    count_elements(n) = count(c -> nodetype(c) == Element, children(n))  # direct Element children
     for path in all_files
         node = read(path, Node)
-        temp = tempname() * ".xml"
-        XML.write(temp, node)
-        node2 = read(temp, Node)
-        @test node == node2
+        # mktempdir's do-block removes the directory (and file) even if write/read throws.
+        mktempdir() do dir
+            temp = joinpath(dir, "roundtrip.xml")
+            XML.write(temp, node)
+            node2 = read(temp, Node)
+            # Structural check only: same node type, same number of direct Element children.
+            @test nodetype(node) == nodetype(node2)
+            @test count_elements(node) == count_elements(node2)
+        end
+    end
+end
+
+#==============================================================================#
+#                       PARSE Node{SubString{String}}                          #
+#==============================================================================#
+@testset "Parse with SubString{String}" begin
+    xml = """<?xml version="1.0"?><root attr="val"><child>text</child></root>"""
+    doc = parse(xml, Node{SubString{String}})
+    @test nodetype(doc) == Document
+    @test tag(doc[2]) == "root"
+    @test doc[2]["attr"] == "val"
+    # SubString values
+    @test value(doc[2][1][1]) isa SubString{String}
+end
+
+#==============================================================================#
+#                       COMPLEX DOCUMENT PARSING                               #
+#==============================================================================#
+@testset "Complex Document Parsing" begin
+    @testset "books.xml" begin
+        path = joinpath(@__DIR__, "data", "books.xml")
+        if isfile(path)  # not `|| return`: that would exit the enclosing testset too
+            doc = read(path, Node)
+            @test nodetype(doc) == Document
+
+            # Should have declaration + catalog
+            decl_nodes = filter(x -> nodetype(x) == Declaration, children(doc))
+            @test length(decl_nodes) == 1
+            @test decl_nodes[1]["version"] == "1.0"
+
+            el_nodes = filter(x -> nodetype(x) == Element, children(doc))
+            @test length(el_nodes) == 1
+            catalog = el_nodes[1]
+            @test tag(catalog) == "catalog"
+
+            # Catalog has 12 books
+            books = filter(x -> nodetype(x) == Element, children(catalog))
+            @test length(books) == 12
+
+            # First book
+            book1 = books[1]
+            @test book1["id"] == "bk101"
+
+            # The first book must contain each of these child elements
+            book_children = filter(x -> nodetype(x) == Element, children(book1))
+            book_tags = map(tag, book_children)
+            @test "author" in book_tags
+            @test "title" in book_tags
+            @test "genre" in book_tags
+            @test "price" in book_tags
+            @test "publish_date" in book_tags
+            @test "description" in book_tags
+            author = first(filter(x -> tag(x) == "author", book_children))
+            @test simple_value(author) == "Gambardella, Matthew"
+        end
+    end
+
+    @testset "simple_dtd.xml" begin
+        path = joinpath(@__DIR__, "data", "simple_dtd.xml")
+        if isfile(path)  # not `|| return`: that would exit the enclosing testset too
+            doc = read(path, Node)
+            @test nodetype(doc) == Document
+            dtd_nodes = filter(x -> nodetype(x) == DTD, children(doc))
+            @test length(dtd_nodes) == 1
+            @test contains(value(dtd_nodes[1]), "ENTITY")
+        end
+    end
+end
+
+#==============================================================================#
+#                        DTD PARSING (parse_dtd)                               #
+#==============================================================================#
+@testset "DTD Parsing (parse_dtd)" begin
+    @testset "simple DTD with entities" begin
+        path = joinpath(@__DIR__, "data", "simple_dtd.xml")
+        if isfile(path)  # not `|| return`: that would skip every later parse_dtd testset
+            doc = read(path, Node)
+            pd = parse_dtd(first(filter(x -> nodetype(x) == DTD, children(doc))))
+            @test pd.root == "note"
+            @test length(pd.entities) == 3
+            @test pd.entities[1].name == "nbsp"
+            @test pd.entities[2].name == "writer"
+            @test pd.entities[3].name == "copyright"
+            @test pd.entities[2].value == "Writer: Donald Duck."
+        end
+    end
+
+    @testset "DTD with SYSTEM external ID" begin
+        pd = parse_dtd("""root SYSTEM "root.dtd\"""")
+        @test pd.root == "root"
+        @test pd.system_id == "root.dtd"
+        @test pd.public_id === nothing
+    end
+
+    @testset "DTD with PUBLIC external ID" begin
+        pd = parse_dtd("""root PUBLIC "-//W3C//DTD XHTML 1.0//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"""")
+        @test pd.root == "root"
+        @test pd.public_id == "-//W3C//DTD XHTML 1.0//EN"
+        @test pd.system_id == "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
+    end
+
+    @testset "DTD with ELEMENT declarations" begin
+        pd = parse_dtd("""root [
+<!ELEMENT root (child)>
+<!ELEMENT child (#PCDATA)>
+<!ELEMENT empty EMPTY>
+<!ELEMENT any ANY>
+]""")
+        @test pd.root == "root"
+        @test length(pd.elements) == 4
+        @test pd.elements[1].name == "root"
+        @test pd.elements[1].content == "(child)"
+        @test pd.elements[2].name == "child"
+        @test pd.elements[2].content == "(#PCDATA)"
+        @test pd.elements[3].name == "empty"
+        @test pd.elements[3].content == "EMPTY"
+        @test pd.elements[4].name == "any"
+        @test pd.elements[4].content == "ANY"
+    end
+
+    @testset "DTD with ATTLIST declarations (spec examples)" begin
+        pd = parse_dtd("""root [
+<!ATTLIST termdef id ID #REQUIRED name CDATA #IMPLIED>
+<!ATTLIST list type (bullets|ordered|glossary) "ordered">
+<!ATTLIST form method CDATA #FIXED "POST">
+]""")
+        @test length(pd.attributes) == 4
+        @test pd.attributes[1].element == "termdef"
+        @test pd.attributes[1].name == "id"
+        @test pd.attributes[1].type == "ID"
+        @test pd.attributes[1].default == "#REQUIRED"
+        @test pd.attributes[2].name == "name"
+        @test pd.attributes[2].type == "CDATA"
+        @test pd.attributes[2].default == "#IMPLIED"
+        @test pd.attributes[3].element == "list"
+        @test pd.attributes[3].name == "type"
+        @test pd.attributes[3].default == "\"ordered\""
+        @test pd.attributes[4].element == "form"
+        @test pd.attributes[4].name == "method"
+        @test pd.attributes[4].default == "#FIXED \"POST\""
+    end
+
+    @testset "DTD with ENTITY declarations (spec examples)" begin
+        pd = parse_dtd("""root [
+<!ENTITY Pub-Status "This is a pre-release of the specification.">
+<!ENTITY open-hatch SYSTEM "http://www.textuality.com/boilerplate/OpenHatch.xml">
+<!ENTITY open-hatch2 PUBLIC "-//Textuality//TEXT Standard open-hatch boilerplate//EN" "http://www.textuality.com/boilerplate/OpenHatch.xml">
+<!ENTITY % YN '"Yes"'>
+]""")
+        @test length(pd.entities) == 4
+        @test pd.entities[1].name == "Pub-Status"
+        @test pd.entities[1].value == "This is a pre-release of the specification."
+        @test pd.entities[1].parameter == false
+
+        @test pd.entities[2].name == "open-hatch"
+        @test pd.entities[2].value === nothing
+        @test contains(pd.entities[2].external_id, "SYSTEM")
+
+        @test pd.entities[3].name == "open-hatch2"
+        @test contains(pd.entities[3].external_id, "PUBLIC")
+
+        @test pd.entities[4].name == "YN"
+        @test pd.entities[4].parameter == true
+    end
+
+    @testset "DTD with NOTATION declarations (spec example)" begin
+        pd = parse_dtd("""root [
+<!NOTATION vrml PUBLIC "VRML 1.0">
+<!NOTATION jpeg SYSTEM "image/jpeg">
+]""")
+        @test length(pd.notations) == 2
+        @test pd.notations[1].name == "vrml"
+        @test contains(pd.notations[1].external_id, "PUBLIC")
+        @test pd.notations[2].name == "jpeg"
+        @test contains(pd.notations[2].external_id, "SYSTEM")
+    end
+
+    @testset "parse_dtd from Node" begin
+        dtd = DTD("root [<!ELEMENT root (#PCDATA)>]")
+        pd = parse_dtd(dtd)
+        @test pd.root == "root"
+        @test length(pd.elements) == 1
+    end
+
+    @testset "parse_dtd errors on non-DTD node" begin
+        @test_throws ErrorException parse_dtd(Element("x"))
+    end
+
+    @testset "complex DTD file (structure test)" begin
+        # complex_dtd.xml uses parameter entity references (%text;) which parse_dtd
+        # does not expand, so we just verify parsing the XML document itself works
+        path = joinpath(@__DIR__, "data", "complex_dtd.xml")
+        if isfile(path)  # not `|| return`: that would exit the enclosing testset too
+            dtd_node = first(filter(x -> nodetype(x) == DTD, children(read(path, Node))))
+            @test nodetype(dtd_node) == DTD
+            @test contains(value(dtd_node), "test")
+            @test contains(value(dtd_node), "ELEMENT")
+            @test contains(value(dtd_node), "ATTLIST")
+            @test contains(value(dtd_node), "NOTATION")
+            @test contains(value(dtd_node), "ENTITY")
+        end
+    end
+end
+
+#==============================================================================#
+#         XML 1.0 SPEC: ELEMENT TYPE DECLARATIONS (Section 3.2)                #
+#==============================================================================#
+@testset "Spec 3.2: Element Type Declarations" begin
+    @testset "EMPTY content model" begin
+        pd = parse_dtd("root [<!ELEMENT br EMPTY>]")
+        @test pd.elements[1].content == "EMPTY"
+    end
+
+    @testset "ANY content model" begin
+        pd = parse_dtd("root [<!ELEMENT container ANY>]")
+        @test pd.elements[1].content == "ANY"
+    end
+
+    @testset "#PCDATA content model" begin
+        pd = parse_dtd("root [<!ELEMENT text (#PCDATA)>]")
+        @test pd.elements[1].content == "(#PCDATA)"
+    end
+
+    @testset "mixed content model" begin
+        pd = parse_dtd("root [<!ELEMENT p (#PCDATA|emph)*>]")
+        @test pd.elements[1].content == "(#PCDATA|emph)*"
+    end
+
+    @testset "sequence content model" begin
+        pd = parse_dtd("root [<!ELEMENT spec (front, body, back?)>]")
+        @test pd.elements[1].content == "(front, body, back?)"
+    end
+
+    @testset "choice content model" begin
+        pd = parse_dtd("root [<!ELEMENT div1 (head, (p | list | note)*, div2*)>]")
+        @test pd.elements[1].content == "(head, (p | list | note)*, div2*)"
+    end
+end
+
+#==============================================================================#
+#       XML 1.0 SPEC: ATTRIBUTE-LIST DECLARATIONS (Section 3.3)                #
+#==============================================================================#
+@testset "Spec 3.3: Attribute-List Declarations" begin
+    @testset "ID attribute" begin
+        pd = parse_dtd("root [<!ATTLIST el id ID #REQUIRED>]")
+        @test pd.attributes[1].type == "ID"
+        @test pd.attributes[1].default == "#REQUIRED"
+    end
+
+    @testset "CDATA attribute with default" begin
+        pd = parse_dtd("""root [<!ATTLIST el name CDATA "default">]""")
+        @test pd.attributes[1].type == "CDATA"
+        @test pd.attributes[1].default == "\"default\""
+    end
+
+    @testset "enumerated attribute" begin
+        pd = parse_dtd("""root [<!ATTLIST list type (bullets|ordered|glossary) "ordered">]""")
+        @test contains(pd.attributes[1].type, "bullets")
+        @test pd.attributes[1].default == "\"ordered\""
+    end
+
+    @testset "#IMPLIED attribute" begin
+        pd = parse_dtd("root [<!ATTLIST el opt CDATA #IMPLIED>]")
+        @test pd.attributes[1].default == "#IMPLIED"
+    end
+
+    @testset "#FIXED attribute" begin
+        pd = parse_dtd("""root [<!ATTLIST el method CDATA #FIXED "POST">]""")
+        @test pd.attributes[1].default == "#FIXED \"POST\""
+    end
+
+    @testset "NOTATION attribute type" begin
+        pd = parse_dtd("root [<!ATTLIST fig notation NOTATION (jpeg|png) #IMPLIED>]")
+        @test contains(pd.attributes[1].type, "NOTATION")
+    end
+
+    @testset "multiple attributes in one ATTLIST" begin
+        pd = parse_dtd("""root [<!ATTLIST book
+  id ID #REQUIRED
+  isbn CDATA #IMPLIED
+  format (hardcover|paperback|ebook) "paperback">]""")
+        @test length(pd.attributes) == 3
+        @test pd.attributes[1].name == "id"
+        @test pd.attributes[2].name == "isbn"
+        @test pd.attributes[3].name == "format"
+    end
+end
+
+#==============================================================================#
+#          XML 1.0 SPEC: ENTITY DECLARATIONS (Section 4.2)                     #
+#==============================================================================#
+@testset "Spec 4.2: Entity Declarations" begin
+    @testset "internal general entity (spec example)" begin
+        pd = parse_dtd("""root [<!ENTITY Pub-Status "This is a pre-release of the specification.">]""")
+        @test pd.entities[1].name == "Pub-Status"
+        @test pd.entities[1].value == "This is a pre-release of the specification."
+        @test pd.entities[1].external_id === nothing
+        @test pd.entities[1].parameter == false
+    end
+
+    @testset "external entity with SYSTEM (spec example)" begin
+        pd = parse_dtd("""root [<!ENTITY open-hatch SYSTEM "http://www.textuality.com/boilerplate/OpenHatch.xml">]""")
+        @test pd.entities[1].name == "open-hatch"
+        @test pd.entities[1].value === nothing
+        @test contains(pd.entities[1].external_id, "SYSTEM")
+        @test contains(pd.entities[1].external_id, "http://www.textuality.com/boilerplate/OpenHatch.xml")
+    end
+
+    @testset "external entity with PUBLIC (spec example)" begin
+        pd = parse_dtd("""root [<!ENTITY open-hatch PUBLIC "-//Textuality//TEXT Standard open-hatch boilerplate//EN" "http://www.textuality.com/boilerplate/OpenHatch.xml">]""")
+        @test pd.entities[1].name == "open-hatch"
+        @test contains(pd.entities[1].external_id, "PUBLIC")
+    end
+
+    @testset "parameter entity" begin
+        pd = parse_dtd("""root [<!ENTITY % YN '"Yes"'>]""")
+        @test pd.entities[1].name == "YN"
+        @test pd.entities[1].parameter == true
+    end
+end
+
+#==============================================================================#
+#         XML 1.0 SPEC: NOTATION DECLARATIONS (Section 4.7)                    #
+#==============================================================================#
+@testset "Spec 4.7: Notation Declarations" begin
+    @testset "NOTATION with PUBLIC (spec example)" begin
+        pd = parse_dtd("""root [<!NOTATION vrml PUBLIC "VRML 1.0">]""")
+        @test pd.notations[1].name == "vrml"
+        @test contains(pd.notations[1].external_id, "PUBLIC")
+        @test contains(pd.notations[1].external_id, "VRML 1.0")
+    end
+
+    @testset "NOTATION with SYSTEM" begin
+        pd = parse_dtd("""root [<!NOTATION jpeg SYSTEM "image/jpeg">]""")
+        @test pd.notations[1].name == "jpeg"
+        @test contains(pd.notations[1].external_id, "SYSTEM")
+    end
+end
+
+#==============================================================================#
+#                        ERROR HANDLING                                        #
+#==============================================================================#
+@testset "Error Handling" begin
+    @testset "mismatched tags" begin
+        @test_throws ErrorException parse("<a></b>", Node)
+    end
+
+    @testset "unclosed tag" begin
+        @test_throws ErrorException parse("<a><b></a>", Node)
+    end
+
+    @testset "closing tag with no open tag" begin
+        @test_throws ErrorException parse("</a>", Node)
+    end
+
+    @testset "unclosed root element" begin
+        @test_throws ErrorException parse("<root>", Node)
+    end
+
+    @testset "unterminated comment" begin
+        @test_throws Exception parse("<root><!-- no end", Node)
+    end
+
+    @testset "unterminated CDATA" begin
+        @test_throws Exception parse("<root><![CDATA[no end", Node)
+    end
+
+    @testset "unterminated PI" begin
+        @test_throws Exception parse("<?pi no end", Node)
+    end
+
+    @testset "unterminated attribute value" begin
+        @test_throws Exception parse("""<a b="no end""", Node)
+    end
+end
+
+#==============================================================================#
+#                     ILL-FORMED XML (must error)                              #
+#==============================================================================#
+@testset "Ill-Formed XML" begin
+    # ---- Tag structure ----
+    @testset "mismatched close tag" begin
+        @test_throws Exception parse("<a></b>", Node)
+    end
+
+    @testset "overlapping elements" begin
+        @test_throws Exception parse("<a><b></a></b>", Node)
+    end
+
+    @testset "deeply mismatched nesting" begin
+        @test_throws Exception parse("<a><b><c></b></c></a>", Node)
+    end
+
+    @testset "multiple unclosed tags" begin
+        @test_throws Exception parse("<a><b><c>", Node)
+    end
+
+    @testset "close tag without open" begin
+        @test_throws Exception parse("</a>", Node)
+    end
+
+    @testset "close tag after self-closing" begin
+        @test_throws Exception parse("<a/></a>", Node)
+    end
+
+    @testset "nested close tag without open" begin
+        @test_throws Exception parse("<root></inner></root>", Node)
+    end
+
+    # ---- Unterminated constructs ----
+    @testset "unterminated open tag at EOF" begin
+        @test_throws Exception parse("<root><unclosed", Node)
+    end
+
+    @testset "unterminated attribute value (double quote)" begin
+        @test_throws Exception parse("""<a x="no end""", Node)
+    end
+
+    @testset "unterminated attribute value (single quote)" begin
+        @test_throws Exception parse("<a x='no end", Node)
+    end
+
+    @testset "unterminated comment" begin
+        @test_throws Exception parse("<!-- no end", Node)
+    end
+
+    @testset "unterminated CDATA" begin
+        @test_throws Exception parse("<![CDATA[no end", Node)
+    end
+
+    @testset "unterminated processing instruction" begin
+        @test_throws Exception parse("<?pi no end", Node)
+    end
+
+    @testset "unterminated DOCTYPE" begin
+        @test_throws Exception parse("<!DOCTYPE x", Node)
+    end
+
+    # ---- Attribute errors ----
+    @testset "duplicate attribute on element" begin
+        @test_throws Exception parse("""<a x="1" x="2"/>""", Node)
+    end
+
+    @testset "duplicate attribute (different values)" begin
+        @test_throws Exception parse("""<root attr="a" attr="b"></root>""", Node)
+    end
+
+    @testset "duplicate attribute in declaration" begin
+        @test_throws Exception parse("""<?xml version="1.0" version="1.1"?><a/>""", Node)
+    end
+
+    @testset "attribute without value" begin
+        @test_throws Exception parse("<a disabled/>", Node)
+    end
+
+    @testset "attribute with unquoted value" begin
+        @test_throws Exception parse("<a x=hello/>", Node)
+    end
+
+    # ---- Tokenizer-level errors ----
+    @testset "lone <" begin
+        @test_throws Exception parse("<", Node)
+    end
+
+    @testset "lone < in text content" begin
+        @test_throws Exception parse("<root>a < b</root>", Node)
+    end
+
+    @testset "tag with space before name" begin
+        @test_throws Exception parse("< root/>", Node)
+    end
+end
+
+#==============================================================================#
+#                        UNICODE SUPPORT                                       #
+#==============================================================================#
+@testset "Unicode Support" begin
+    @testset "Unicode in text content" begin
+        doc = parse("<root>caf\u00e9 \u00f1 \u65e5\u672c\u8a9e</root>", Node)
+        @test simple_value(doc[1]) == "caf\u00e9 \u00f1 \u65e5\u672c\u8a9e"
+    end
+
+    @testset "Unicode in attribute values" begin
+        doc = parse("<root name=\"\u00fcber\"/>", Node)
+        @test doc[1]["name"] == "\u00fcber"
+    end
+
+    @testset "Unicode in comments" begin
+        doc = parse("<root><!-- h\u00e9llo --></root>", Node)
+        @test value(doc[1][1]) == " h\u00e9llo "
+    end
+
+    @testset "CJK characters" begin
+        doc = parse("<root>\u4e2d\u6587</root>", Node)
+        @test simple_value(doc[1]) == "\u4e2d\u6587"
+    end
+
+    @testset "emoji in text" begin
+        doc = parse("<root>\U0001f600\U0001f680</root>", Node)
+        @test simple_value(doc[1]) == "\U0001f600\U0001f680"
+    end
+
+    @testset "Cyrillic characters" begin
+        doc = parse("<root>\u041f\u0440\u0438\u0432\u0435\u0442</root>", Node)
+        @test simple_value(doc[1]) == "\u041f\u0440\u0438\u0432\u0435\u0442"
+    end
+
+    @testset "Arabic characters" begin
+        doc = parse("<root>\u0645\u0631\u062d\u0628\u0627</root>", Node)
+        @test simple_value(doc[1]) == "\u0645\u0631\u062d\u0628\u0627"
+    end
+end
+
+#==============================================================================#
+#                        EDGE CASES                                            #
+#==============================================================================#
+@testset "Edge Cases" begin
+    @testset "document with only whitespace around root" begin
+        doc = parse("  \n  <root/>\n  ", Node)
+        # Parser preserves whitespace as Text nodes
+        els = filter(x -> nodetype(x) == Element, children(doc))
+        @test length(els) == 1
+        @test tag(els[1]) == "root"
+    end
+
+    @testset "deeply nested elements" begin
+        xml = "<a><b><c><d><e><f>deep</f></e></d></c></b></a>"
+        doc = parse(xml, Node)
+        @test simple_value(doc[1][1][1][1][1][1]) == "deep"
+    end
+
+    @testset "many siblings" begin
+        nitems = 100  # number of <item> children generated under <root>
+        body = join("<item>$i</item>" for i in 1:nitems)
+        root_el = parse("<root>" * body * "</root>", Node)[1]
+        @test length(root_el) == nitems
+        @test simple_value(root_el[1]) == "1"
+        @test simple_value(root_el[nitems]) == "100"
+    end
+
+    @testset "element with hyphens and dots in name" begin
+        doc = parse("<my-element.name/>", Node)
+        @test tag(doc[1]) == "my-element.name"
+    end
+
+    @testset "element with underscore in name" begin
+        doc = parse("<_private/>", Node)
+        @test tag(doc[1]) == "_private"
+    end
+
+    @testset "attribute with numeric value" begin
+        doc = parse("""<x count="42"/>""", Node)
+        @test doc[1]["count"] == "42"
+    end
+
+    @testset "empty text content" begin
+        doc = parse("<root></root>", Node)
+        @test isnothing(doc[1].children)
+    end
+
+    @testset "adjacent CDATA and text" begin
+        doc = parse("<root>text<![CDATA[cdata]]>more</root>", Node)
+        @test length(doc[1]) == 3
+        @test value(doc[1][1]) == "text"
+        @test value(doc[1][2]) == "cdata"
+        @test value(doc[1][3]) == "more"
+    end
+
+    @testset "multiple CDATA sections" begin
+        doc = parse("<root><![CDATA[a]]><![CDATA[b]]></root>", Node)
+        @test length(doc[1]) == 2
+        @test value(doc[1][1]) == "a"
+        @test value(doc[1][2]) == "b"
+    end
+
+    @testset "comment between elements" begin
+        doc = parse("<root><a/><!-- between --><b/></root>", Node)
+        @test length(doc[1]) == 3
+        @test nodetype(doc[1][2]) == Comment
+    end
+
+    @testset "PI between elements" begin
+        doc = parse("<root><a/><?pi data?><b/></root>", Node)
+        @test length(doc[1]) == 3
+        @test nodetype(doc[1][2]) == ProcessingInstruction
+    end
+
+    @testset "all node types in one document" begin
+        xml = """<?xml version="1.0"?>
+<!DOCTYPE root SYSTEM "root.dtd">
+<!-- comment -->
+<?pi data?>
+<root>
+  text
+  <child attr="val"/>
+  <!-- inner comment -->
+  <![CDATA[cdata]]>
+  <?inner-pi inner data?>
+</root>"""
+        doc = parse(xml, Node)
+        types = map(nodetype, children(doc))
+        @test Declaration in types
+        @test DTD in types
+        @test Comment in types
+        @test ProcessingInstruction in types
+        @test Element in types
+    end
+
+    @testset "very long attribute value" begin
+        long_val = repeat("a", 10000)
+        doc = parse("""<x attr="$(long_val)"/>""", Node)
+        @test doc[1]["attr"] == long_val
+    end
+
+    @testset "very long text content" begin
+        long_text = repeat("hello ", 10000)
+        doc = parse("<root>$(long_text)</root>", Node)
+        @test simple_value(doc[1]) == long_text
+    end
+
+    @testset "CDATA with ]] but not followed by >" begin
+        doc = parse("<root><![CDATA[a]]b]]></root>", Node)
+        @test value(doc[1][1]) == "a]]b"
+    end
+end
+
+#==============================================================================#
+#                  SPEC EXAMPLES: FULL DOCUMENTS                               #
+#==============================================================================#
+@testset "Full Spec-Like Documents" begin
+    @testset "spec section 2.1: minimal document" begin
+        xml = """<?xml version="1.0"?>
+<greeting>Hello, world!</greeting>"""
+        doc = parse(xml, Node)
+        @test nodetype(doc) == Document
+        @test simple_value(doc[end]) == "Hello, world!"
+    end
+
+    @testset "spec section 2.8: document with external DTD" begin
+        xml = """<?xml version="1.0"?>
+<!DOCTYPE greeting SYSTEM "hello.dtd">
+<greeting>Hello, world!</greeting>"""
+        doc = parse(xml, Node)
+        # Filter out whitespace text nodes to check structure
+        typed = filter(x -> nodetype(x) != Text, children(doc))
+        @test length(typed) == 3
+        @test nodetype(typed[1]) == Declaration
+        @test nodetype(typed[2]) == DTD
+        @test nodetype(typed[3]) == Element
+    end
+
+    @testset "spec: document with internal subset" begin
+        xml = """<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE greeting [
+  <!ELEMENT greeting (#PCDATA)>
+]>
+<greeting>Hello, world!</greeting>"""
+        doc = parse(xml, Node)
+        typed = filter(x -> nodetype(x) != Text, children(doc))
+        @test typed[1]["encoding"] == "UTF-8"
+        @test nodetype(typed[2]) == DTD
+        pd = parse_dtd(typed[2])
+        @test pd.root == "greeting"
+        @test length(pd.elements) == 1
+        @test pd.elements[1].name == "greeting"
+        @test pd.elements[1].content == "(#PCDATA)"
+        @test simple_value(typed[3]) == "Hello, world!"
+    end
 
-        #For debugging:
-        for (a,b) in zip(AbstractTrees.Leaves(node), AbstractTrees.Leaves(node2))
-            if a != b
-                @info path
-                @info a
-                @info b
-                error()
-            end
-        end
+    @testset "typical HTML5-like doctype" begin
+        xml = """<!DOCTYPE html><html><head><title>Test</title></head><body><p>Content</p></body></html>"""
+        doc = parse(xml, Node)
+        @test nodetype(doc[1]) == DTD
+        @test value(doc[1]) == "html"
+        @test tag(doc[2]) == "html"
+    end
+
+    @testset "SVG document" begin
+        xml = """<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" width="100" height="100">
+  <circle cx="50" cy="50" r="40" fill="red"/>
+  <text x="50" y="50">Hello SVG</text>
+</svg>"""
+        doc = parse(xml, Node)
+        svg = doc[end]
+        @test tag(svg) == "svg"
+        @test svg["xmlns"] == "http://www.w3.org/2000/svg"
+        @test svg["width"] == "100"
+
+        elements = filter(x -> nodetype(x) == Element, children(svg))
+        @test length(elements) == 2
+        @test tag(elements[1]) == "circle"
+        @test elements[1]["fill"] == "red"
+        @test tag(elements[2]) == "text"
+        @test value(elements[2][1]) == "Hello SVG"
+    end
+
+    @testset "SOAP-like envelope" begin
+        xml = """<?xml version="1.0" encoding="UTF-8"?>
+<soap:Envelope xmlns:soap="http://www.w3.org/2003/05/soap-envelope">
+  <soap:Header/>
+  <soap:Body>
+    <m:GetPrice xmlns:m="http://www.example.org/stock">
+      <m:StockName>IBM</m:StockName>
+    </m:GetPrice>
+  </soap:Body>
+</soap:Envelope>"""
+        doc = parse(xml, Node)
+        env = doc[end]
+        @test tag(env) == "soap:Envelope"
+        elements = filter(x -> nodetype(x) == Element, children(env))
+        @test tag(elements[1]) == "soap:Header"
+        @test tag(elements[2]) == "soap:Body"
+    end
+
+    @testset "RSS-like feed" begin
+        xml = """<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+  <channel>
+    <title>Example Feed</title>
+    <link>http://example.com</link>
+    <description>An example RSS feed</description>
+    <item>
+      <title>Item 1</title>
+      <link>http://example.com/1</link>
+    </item>
+    <item>
+      <title>Item 2</title>
+      <link>http://example.com/2</link>
+    </item>
+  </channel>
+</rss>"""
+        doc = parse(xml, Node)
+        rss = doc[end]
+        @test tag(rss) == "rss"
+        @test rss["version"] == "2.0"
+        channel = first(filter(x -> nodetype(x) == Element, children(rss)))
+        @test tag(channel) == "channel"
+        items = filter(x -> nodetype(x) == Element && tag(x) == "item", children(channel))
+        @test length(items) == 2
+    end
+
+    @testset "Atom-like feed" begin
+        xml = """<?xml version="1.0" encoding="UTF-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+  <title>Example Feed</title>
+  <entry>
+    <title>Atom-Powered Robots Run Amok</title>
+    <link href="http://example.org/2003/12/13/atom03"/>
+    <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+    <updated>2003-12-13T18:30:02Z</updated>
+    <summary>Some text.</summary>
+  </entry>
+</feed>"""
+        doc = parse(xml, Node)
+        feed = doc[end]
+        @test tag(feed) == "feed"
+        @test feed["xmlns"] == "http://www.w3.org/2005/Atom"
+        entries = filter(x -> nodetype(x) == Element && tag(x) == "entry", children(feed))
+        @test length(entries) == 1
+    end
+
+    @testset "MathML-like document" begin
+        xml = """<math xmlns="http://www.w3.org/1998/Math/MathML">
+  <mrow>
+    <msup>
+      <mi>x</mi>
+      <mn>2</mn>
+    </msup>
+    <mo>+</mo>
+    <mn>1</mn>
+  </mrow>
+</math>"""
+        doc = parse(xml, Node)
+        math = doc[1]
+        @test tag(math) == "math"
+        @test math["xmlns"] == "http://www.w3.org/1998/Math/MathML"
+    end
+
+    @testset "document with processing instructions and comments mixed" begin
+        xml = """<?xml version="1.0"?>
+<!-- This is a comment before the root -->
+<?xml-stylesheet type="text/css" href="style.css"?>
+<root>
+  <!-- inner comment -->
+  <child/>
+  <?pi-inside data?>
+</root>
+<!-- trailing comment -->"""
+        doc = parse(xml, Node)
+        types = map(nodetype, children(doc))
+        @test count(==(Comment), types) == 2
+        @test count(==(ProcessingInstruction), types) >= 1
+        @test count(==(Element), types) == 1
+    end
+end
+
+#==============================================================================#
+#                        SHOW / DISPLAY                                        #
+#==============================================================================#
+@testset "Show (REPL display)" begin
+    @testset "show Text" begin
+        t = Text("hello")
+        s = sprint(show, t)
+        @test contains(s, "Text")
+        @test contains(s, "hello")
+    end
+
+    @testset "show Element" begin
+        el = Element("div"; class="main")
+        s = sprint(show, el)
+        @test contains(s, "Element")
+        @test contains(s, "<div")
+        @test contains(s, "class")
+    end
+
+    @testset "show Comment" begin
+        c = Comment(" test ")
+        s = sprint(show, c)
+        @test contains(s, "Comment")
+        @test contains(s, "<!--")
+    end
+
+    @testset "show CData" begin
+        cdata = CData("data")  # named `cdata` so the local does not shadow `Base.cd`
+        s = sprint(show, cdata)
+        @test contains(s, "CData")
+        @test contains(s, "<![CDATA[")
+    end
+
+    @testset "show DTD" begin
+        d = DTD("html")
+        s = sprint(show, d)
+        @test contains(s, "DTD")
+        @test contains(s, "<!DOCTYPE")
+    end
+
+    @testset "show Declaration" begin
+        decl = Declaration(; version="1.0")
+        s = sprint(show, decl)
+        @test contains(s, "Declaration")
+        @test contains(s, "<?xml")
+    end
+
+    @testset "show ProcessingInstruction" begin
+        pinode = ProcessingInstruction("target", "data")  # avoid shadowing the constant `pi`
+        s = sprint(show, pinode)
+        @test contains(s, "ProcessingInstruction")
+        @test contains(s, "<?target")
+    end
+
+    @testset "show Document" begin
+        doc = Document(Element("root"))
+        s = sprint(show, doc)
+        @test contains(s, "Document")
+        @test contains(s, "1 child")
+    end
+
+    @testset "show Element with children count" begin
+        el = Element("div", Element("a"), Element("b"), Element("c"))
+        s = sprint(show, el)
+        @test contains(s, "3 children")
+    end
+
+    @testset "text/xml MIME" begin
+        el = Element("p", "hello")
+        s = sprint(show, MIME("text/xml"), el)
+        @test s == "<p>hello</p>"
     end
 end
 
-#-----------------------------------------------------------------------------# Node writing
-@testset "Node writing" begin
+#==============================================================================#
+#                    SHOW (text/xml MIME) ROUNDTRIP                             #
+#==============================================================================#
+@testset "text/xml MIME output" begin
     doc = Document(
-        DTD("root_tag"),
-        Declaration(version=1.0),
-        Comment("comment"),
-        ProcessingInstruction("xml-stylesheet", href="mystyle.css", type="text/css"),
-        Element("root_tag", CData("cdata"), Text("text"))
+        Declaration(; version="1.0"),
+        Element("root", Element("child", "text"))
     )
-    @test map(nodetype, children(doc)) == [DTD,Declaration,Comment,ProcessingInstruction,Element]
-    @test length(children(doc[end])) == 2
-    @test nodetype(doc[end][1]) == XML.CData
-    @test nodetype(doc[end][2]) == XML.Text
-    @test value(doc[end][1]) == "cdata"
-    @test value(doc[end][2]) == "text"
-
-    #set/get index for attributes
-    o = doc[end]
-    @test isempty(keys(o))
-    o["id"] = 1
-    @test o["id"] == "1"
-    @test keys(o) == keys(Dict("id" => "1"))
-end
-
-#-----------------------------------------------------------------------------# Issues
-@testset "Issues" begin
-    # https://github.com/JuliaComputing/XML.jl/issues/12: DTD content was cut short
-    s = """
-    <!DOCTYPE note [
-    <!ENTITY nbsp "&#xA0;">
-    <!ENTITY writer "Writer: Donald Duck.">
-    <!ENTITY copyright "Copyright: W3Schools.">
-    ]>
-    """
-
-    doc = parse(Node, s)
-    @test value(only(doc)) == s[11:end-2]  # note [...]
-
-    # https://github.com/JuliaComputing/XML.jl/issues/14 (Sorted Attributes)
-    kw = NamedTuple(OrderedDict(Symbol(k) => Int(k) for k in 'a':'z'))
-    xyz  = XML.Element("point"; kw...)
-    @test collect(keys(attributes(xyz))) == string.(collect('a':'z'))
+    xml_str = sprint(show, MIME("text/xml"), doc)
+    @test contains(xml_str, "<?xml")
+    @test contains(xml_str, "<root>")
+    @test contains(xml_str, "<child>text</child>")
+    # Verify it's parseable
+    doc2 = parse(xml_str, Node)
+    @test nodetype(doc2) == Document
+    root = first(filter(x -> nodetype(x) == Element, children(doc2)))
+    @test tag(root) == "root"
+    child = first(filter(x -> nodetype(x) == Element, children(root)))
+    @test simple_value(child) == "text"
+end
+
+#==============================================================================#
+#                    CONSTRUCTION → WRITE → PARSE ROUNDTRIP                    #
+#==============================================================================#
+@testset "Construction → Write → Parse" begin
+    @testset "simple element: write then parse preserves semantics" begin
+        el = Element("greeting", "Hello, world!")
+        xml = XML.write(Document(el))
+        doc2 = parse(xml, Node)
+        @test simple_value(doc2[1]) == "Hello, world!"
+    end
+
+    @testset "element with attributes: write then parse preserves attributes" begin
+        el = Element("item"; id="1", class="active")
+        xml = XML.write(Document(el))
+        doc2 = parse(xml, Node)
+        @test doc2[1]["id"] == "1"
+        @test doc2[1]["class"] == "active"
+    end
+
+    @testset "single-child text elements roundtrip" begin
+        doc = Document(Element("root", "text"))
+        xml = XML.write(doc)
+        doc2 = parse(xml, Node)
+        @test doc == doc2
+    end
+
+    @testset "self-closing elements roundtrip" begin
+        doc = Document(Element("root"))
+        xml = XML.write(doc)
+        doc2 = parse(xml, Node)
+        @test doc == doc2
+    end
+
+    @testset "all node types survive write → parse" begin
+        # Build a tree holding every node kind, then serialize it and parse it back.
+        root_in = Element("root",
+            Element("child", "text"),
+            CData("raw <data>"),
+            Comment(" inner "),
+            ProcessingInstruction("pi", "content")
+        )
+        doc = Document(
+            Declaration(; version="1.0"), Comment(" header "), root_in
+        )
+        xml = XML.write(doc)
+        doc2 = parse(xml, Node)
+        # Text nodes are filtered out before counting the remaining node types.
+        typed = filter(x -> nodetype(x) != Text, children(doc2))
+        top_types = map(nodetype, typed)
+        @test count(==(Declaration), top_types) == 1
+        @test count(==(Comment), top_types) == 1
+        @test count(==(Element), top_types) == 1
+        root = first(x for x in typed if nodetype(x) == Element)
+        inner_types = [nodetype(x) for x in children(root) if nodetype(x) != Text]
+        for T in (Element, CData, Comment, ProcessingInstruction)
+            @test T in inner_types
+        end
+    end
+
+    @testset "special characters in text roundtrip" begin
+        el = Element("p", "a < b & c > d ' e \" f")
+        xml = XML.write(Document(el))
+        doc2 = parse(xml, Node)
+        @test simple_value(doc2[1]) == "a < b & c > d ' e \" f"
+    end
+
+    @testset "special characters in attributes roundtrip" begin
+        el = Element("x"; data="a&b<c>d'e\"f")
+        xml = XML.write(Document(el))
+        doc2 = parse(xml, Node)
+        @test doc2[1]["data"] == "a&b<c>d'e\"f"
+    end
+end
+
+#==============================================================================#
+#                        KML-LIKE DOCUMENT                                     #
+#==============================================================================#
+@testset "KML-like Document" begin
+    xml = """<?xml version="1.0" encoding="UTF-8"?>
+<kml xmlns="http://www.opengis.net/kml/2.2">
+  <Document>
+    <name>KML Sample</name>
+    <Placemark>
+      <name>Simple placemark</name>
+      <description>Attached to the ground.</description>
+      <Point>
+        <coordinates>-122.0822035,37.4220033612141,0</coordinates>
+      </Point>
+    </Placemark>
+  </Document>
+</kml>"""
+    doc = parse(xml, Node)
+    kml = doc[end]
+    @test tag(kml) == "kml"
+    @test kml["xmlns"] == "http://www.opengis.net/kml/2.2"
+
+    document = first(filter(x -> nodetype(x) == Element, children(kml)))
+    @test tag(document) == "Document"
+
+    name = first(filter(x -> nodetype(x) == Element && tag(x) == "name", children(document)))
+    @test simple_value(name) == "KML Sample"
+
+    pm = first(filter(x -> nodetype(x) == Element && tag(x) == "Placemark", children(document)))
+    pm_name = first(filter(x -> nodetype(x) == Element && tag(x) == "name", children(pm)))
+    @test simple_value(pm_name) == "Simple placemark"
+end
+
+#==============================================================================#
+#                        XHTML-LIKE DOCUMENT                                   #
+#==============================================================================#
+@testset "XHTML-like Document" begin
+    xml = """<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+  <head>
+    <title>XHTML Test</title>
+    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
+  </head>
+  <body>
+    <h1>Hello World</h1>
+    <p>This is a <strong>test</strong> of XHTML.</p>
+    <br/>
+    <img src="image.png" alt="An image"/>
+  </body>
+</html>"""
+    doc = parse(xml, Node)
+    typed = filter(x -> nodetype(x) != Text, children(doc))
+    @test nodetype(typed[1]) == Declaration
+    @test nodetype(typed[2]) == DTD
+    @test contains(value(typed[2]), "PUBLIC")
+
+    html = first(filter(x -> nodetype(x) == Element, children(doc)))
+    @test tag(html) == "html"
+    @test html["xmlns"] == "http://www.w3.org/1999/xhtml"
+
+    head_el = first(filter(x -> nodetype(x) == Element && tag(x) == "head", children(html)))
+    title_el = first(filter(x -> nodetype(x) == Element && tag(x) == "title", children(head_el)))
+    @test simple_value(title_el) == "XHTML Test"
+
+    body_el = first(filter(x -> nodetype(x) == Element && tag(x) == "body", children(html)))
+    h1_el = first(filter(x -> nodetype(x) == Element && tag(x) == "h1", children(body_el)))
+    @test simple_value(h1_el) == "Hello World"
+
+    # Verify write produces valid XML that can be re-parsed
+    xml2 = XML.write(doc)
+    doc2 = parse(xml2, Node)
+    @test nodetype(doc2) == Document
+end
+
+#==============================================================================#
+#                    PLIST-LIKE DOCUMENT                                        #
+#==============================================================================#
+@testset "plist-like Document" begin
+    xml = """<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+  <dict>
+    <key>CFBundleName</key>
+    <string>MyApp</string>
+    <key>CFBundleVersion</key>
+    <string>1.0</string>
+    <key>LSRequiresIPhoneOS</key>
+    <true/>
+  </dict>
+</plist>"""
+    doc = parse(xml, Node)
+    plist = doc[end]
+    @test tag(plist) == "plist"
+    @test plist["version"] == "1.0"
+
+    dict = first(filter(x -> nodetype(x) == Element, children(plist)))
+    @test tag(dict) == "dict"
+
+    elements = filter(x -> nodetype(x) == Element, children(dict))
+    keys_found = [simple_value(e) for e in elements if tag(e) == "key"]
+    @test "CFBundleName" in keys_found
+    @test "CFBundleVersion" in keys_found
 end
 
+#==============================================================================#
+#                    MAVEN POM-LIKE DOCUMENT                                   #
+#==============================================================================#
+@testset "Maven POM-like Document" begin
+    xml = """<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>com.example</groupId>
+  <artifactId>my-app</artifactId>
+  <version>1.0-SNAPSHOT</version>
+  <dependencies>
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <version>4.13.2</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+</project>"""
+    doc = parse(xml, Node)
+    project = doc[end]
+    @test tag(project) == "project"
+
+    elements = filter(x -> nodetype(x) == Element, children(project))
+    version = first(filter(x -> tag(x) == "version", elements))
+    @test simple_value(version) == "1.0-SNAPSHOT"
+
+    deps = first(filter(x -> tag(x) == "dependencies", elements))
+    dep_list = filter(x -> nodetype(x) == Element, children(deps))
+    @test length(dep_list) == 1
+    @test tag(dep_list[1]) == "dependency"
+end
+
+#==============================================================================#
+#                    GITHUB ISSUES REGRESSION TESTS                            #
+#==============================================================================#
+@testset "GitHub Issues" begin
+
+    #--- Issue #7: attribute order should not affect equality ---
+    @testset "#7: attribute-order-insensitive ==" begin
+        a = Element("x"; first="1", second="2")
+        b = Element("x"; second="2", first="1")
+        @test a == b
+
+        # Same attrs same order still works
+        c = Element("x"; a="1", b="2")
+        d = Element("x"; a="1", b="2")
+        @test c == d
+
+        # Different values are still not equal
+        @test Element("x"; a="1") != Element("x"; a="2")
+
+        # Different attr names are not equal
+        @test Element("x"; a="1") != Element("x"; b="1")
+
+        # Different number of attrs
+        @test Element("x"; a="1") != Element("x"; a="1", b="2")
+
+        # Parsed elements with same attrs in different order
+        doc1 = parse("""<x a="1" b="2"/>""", Node)
+        doc2 = parse("""<x b="2" a="1"/>""", Node)
+        @test doc1[1] == doc2[1]
+
+        # Attribute-less elements built the same way are equal (see #33 for built vs parsed)
+        @test Element("x") == Element("x")
+    end
+
+    #--- Issue #17: numeric character references ---
+    @testset "#17: numeric character references (&#decimal; and &#xHex;)" begin
+        # Decimal character references
+        @test unescape("&#60;") == "<"
+        @test unescape("&#62;") == ">"
+        @test unescape("&#38;") == "&"
+        @test unescape("&#39;") == "'"
+        @test unescape("&#34;") == "\""
+
+        # Hex character references (lowercase x; hex digits in either case)
+        @test unescape("&#x3c;") == "<"
+        @test unescape("&#x3C;") == "<"
+        @test unescape("&#x3e;") == ">"
+        @test unescape("&#x26;") == "&"
+        @test unescape("&#x27;") == "'"
+        @test unescape("&#x22;") == "\""
+
+        # Uppercase X is accepted too (lenient: XML 1.0 CharRef only defines lowercase "&#x")
+        @test unescape("&#X41;") == "A"
+
+        # Unicode character references
+        @test unescape("&#x41;") == "A"
+        @test unescape("&#65;") == "A"
+        @test unescape("&#x00e9;") == "\u00e9"  # é
+        @test unescape("&#233;") == "\u00e9"     # é
+        @test unescape("&#x4e2d;") == "\u4e2d"   # 中
+        @test unescape("&#x1f600;") == "\U0001f600"  # 😀
+
+        # Mixed with named entities
+        @test unescape("&amp;&#60;&lt;") == "&<<"
+        @test unescape("&#60;tag&#62;") == "<tag>"
+
+        # In parsed XML text
+        doc = parse("<root>&#60;hello&#62;</root>", Node)
+        @test simple_value(doc[1]) == "<hello>"
+
+        # In parsed XML attributes
+        doc = parse("""<x a="&#60;&#62;"/>""", Node)
+        @test doc[1]["a"] == "<>"
+
+        # Non-breaking space
+        @test unescape("&#xA0;") == "\u00a0"
+        @test unescape("&#160;") == "\u00a0"
+
+        # Invalid numeric reference preserved verbatim
+        @test unescape("&#xZZZ;") == "&#xZZZ;"
+
+        # Named entity references that aren't predefined are preserved verbatim
+        @test unescape("&foo;") == "&foo;"
+
+        # Ampersand without semicolon is preserved
+        @test unescape("a & b") == "a & b"
+    end
+
+    #--- Issue #33: empty attributes consistency ---
+    @testset "#33: empty attributes [] vs nothing" begin
+        # Constructed elements have empty Vector for attrs
+        a = Element("x")
+        # Parsed elements with no attrs have nothing
+        b = parse("<x/>", Node)[1]
+        # They should compare equal via _eq / _attrs_eq
+        @test a == b
+    end
+
+    #--- Issue #35: write → parse preserves structure ---
+    @testset "#35: write then parse preserves structure" begin
+        doc = Document(
+            Declaration(; version="1.0"),
+            Element("root",
+                Element("child", "text"),
+                Element("empty")
+            )
+        )
+        xml = XML.write(doc)
+        doc2 = parse(xml, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc2)))
+        child_elements = filter(x -> nodetype(x) == Element, children(root))
+        @test length(child_elements) == 2
+        @test tag(child_elements[1]) == "child"
+        @test tag(child_elements[2]) == "empty"
+    end
+
+    #--- Issue #50: Base.get with default ---
+    @testset "#50: Base.get(node, key, default)" begin
+        el = parse("""<x a="1" b="2"/>""", Node)[1]
+
+        # Existing keys return their values
+        @test get(el, "a", "default") == "1"
+        @test get(el, "b", "default") == "2"
+
+        # Non-existing key returns default
+        @test get(el, "c", "default") == "default"
+        @test get(el, "c", nothing) === nothing
+
+        # Works on elements with no attributes
+        el2 = parse("<x/>", Node)[1]
+        @test get(el2, "a", "nope") == "nope"
+
+        # Works on constructed elements
+        el3 = Element("x"; foo="bar")
+        @test get(el3, "foo", "default") == "bar"
+        @test get(el3, "baz", "default") == "default"
+    end
+
+    #--- Issue #52: escape double-escapes (expected behavior) ---
+    @testset "#52: escape is not idempotent (by design)" begin
+        @test escape("&") == "&amp;"
+        @test escape("&amp;") == "&amp;amp;"  # double-escaping is correct
+    end
+
+    #--- Issue #53: unescape works correctly ---
+    @testset "#53: unescape works correctly on parsed content" begin
+        doc = parse("<root>&amp;</root>", Node)
+        @test simple_value(doc[1]) == "&"
+        doc = parse("<root>&lt;tag&gt;</root>", Node)
+        @test simple_value(doc[1]) == "<tag>"
+    end
+end
+
+#==============================================================================#
+#                        TREE NAVIGATION: parent, depth, siblings              #
+#==============================================================================#
+@testset "Tree Navigation" begin
+    doc = parse("<root><a><a1/><a2/></a><b/><c><c1><c1a/></c1></c></root>", Node)
+    root = doc[1]
+    a = root[1]
+    a1 = a[1]
+    a2 = a[2]
+    b = root[2]
+    c = root[3]
+    c1 = c[1]
+    c1a = c1[1]
+
+    @testset "parent" begin
+        @test parent(root, doc) === doc
+        @test parent(a, doc) === root
+        @test parent(a1, doc) === a
+        @test parent(c1a, doc) === c1
+        @test parent(b, root) === root  # second argument may be a subtree, not only the Document
+        @test_throws ErrorException parent(doc, doc)  # the tree argument itself has no parent
+        @test_throws ErrorException parent(Element("x"), doc)  # not in tree
+    end
+
+    @testset "depth" begin
+        @test depth(doc, doc) == 0
+        @test depth(root, doc) == 1
+        @test depth(a, doc) == 2
+        @test depth(a1, doc) == 3
+        @test depth(c1a, doc) == 4
+        @test depth(b, root) == 1
+        @test_throws ErrorException depth(Element("x"), doc)
+    end
+
+    @testset "siblings" begin
+        @test siblings(a, doc) == [b, c]
+        @test siblings(b, doc) == [a, c]
+        @test siblings(a1, doc) == [a2]
+        @test siblings(a2, doc) == [a1]
+        @test isempty(siblings(c1, doc))
+        @test_throws ErrorException siblings(doc, doc)  # the tree argument itself has no parent
+    end
+
+    @testset "1-arg parent/depth errors" begin
+        @test_throws ErrorException parent(a)
+        @test_throws ErrorException depth(a)
+    end
+end
+
+#==============================================================================#
+#                              XPATH                                           #
+#==============================================================================#
+@testset "XPath" begin
+    doc = parse("""<root>
+        <users>
+            <user id="1" role="admin"><name>Alice</name></user>
+            <user id="2" role="user"><name>Bob</name></user>
+            <user id="3" role="admin"><name>Carol</name></user>
+        </users>
+        <settings><theme>dark</theme></settings>
+    </root>""", Node)
+
+    @testset "absolute path" begin
+        results = xpath(doc, "/root/users/user")
+        @test length(results) == 3
+        @test all(n -> tag(n) == "user", results)
+    end
+
+    @testset "single child" begin
+        results = xpath(doc, "/root/settings/theme")
+        @test length(results) == 1
+        @test tag(results[1]) == "theme"
+    end
+
+    @testset "positional predicate [n]" begin
+        results = xpath(doc, "/root/users/user[1]")
+        @test length(results) == 1
+        @test results[1]["id"] == "1"
+
+        results = xpath(doc, "/root/users/user[3]")
+        @test length(results) == 1
+        @test results[1]["id"] == "3"
+    end
+
+    @testset "[last()]" begin
+        results = xpath(doc, "/root/users/user[last()]")
+        @test length(results) == 1
+        @test results[1]["id"] == "3"
+    end
+
+    @testset "out of bounds predicate" begin
+        results = xpath(doc, "/root/users/user[99]")
+        @test isempty(results)
+    end
+
+    @testset "has-attribute predicate [@attr]" begin
+        results = xpath(doc, "/root/users/user[@role]")
+        @test length(results) == 3
+    end
+
+    @testset "attribute-value predicate [@attr='v']" begin
+        results = xpath(doc, "/root/users/user[@role='admin']")
+        @test length(results) == 2
+        ids = sort([n["id"] for n in results])
+        @test ids == ["1", "3"]
+    end
+
+    @testset "attribute-value with double quotes" begin
+        results = xpath(doc, """/root/users/user[@id="2"]""")
+        @test length(results) == 1
+        @test results[1]["id"] == "2"
+    end
+
+    @testset "descendant //" begin
+        results = xpath(doc, "//name")
+        @test length(results) == 3
+        @test all(n -> tag(n) == "name", results)
+    end
+
+    @testset "// with predicate" begin
+        results = xpath(doc, "//user[@role='admin']/name")
+        @test length(results) == 2
+    end
+
+    @testset "wildcard *" begin
+        results = xpath(doc, "/root/*")
+        @test length(results) == 2
+        @test Set(tag.(results)) == Set(["users", "settings"])
+    end
+
+    @testset "text()" begin
+        results = xpath(doc, "/root/settings/theme/text()")
+        @test length(results) == 1
+        @test value(results[1]) == "dark"
+    end
+
+    @testset "node()" begin
+        results = xpath(doc, "/root/users/user[1]/node()")
+        @test length(results) >= 1
+    end
+
+    @testset "attribute selection @attr" begin
+        results = xpath(doc, "//user/@id")
+        @test length(results) == 3
+        vals = sort([value(n) for n in results])
+        @test vals == ["1", "2", "3"]
+    end
+
+    @testset "self ." begin
+        results = xpath(doc, ".")
+        @test length(results) == 1
+        @test results[1] === doc
+    end
+
+    @testset "no match returns empty" begin
+        @test isempty(xpath(doc, "/root/nonexistent"))
+        @test isempty(xpath(doc, "//nonexistent"))
+    end
+
+    @testset "empty expression" begin
+        @test isempty(xpath(doc, ""))
+    end
+
+    @testset "deep // with path" begin
+        results = xpath(doc, "//theme/text()")
+        @test length(results) == 1
+        @test value(results[1]) == "dark"
+    end
+
+    @testset "error: unterminated predicate" begin
+        @test_throws ErrorException xpath(doc, "/root/user[1")
+    end
+
+    @testset "error: unsupported predicate" begin
+        @test_throws ErrorException xpath(doc, "/root/user[position()>1]")
+    end
+
+    @testset "self-closing elements" begin
+        doc2 = parse("<root><a/><b/><c/></root>", Node)
+        @test length(xpath(doc2, "/root/*")) == 3
+    end
+
+    @testset "relative path" begin
+        root = xpath(doc, "/root")[1]
+        results = xpath(root, "users/user")
+        @test length(results) == 3
+    end
+
+    @testset ".. parent navigation" begin
+        # /root/users/user[1]/.. goes back to <users>
+        results = xpath(doc, "/root/users/user[1]/..")
+        @test length(results) == 1
+        @test tag(results[1]) == "users"
+    end
+
+    @testset ".. in mid-path" begin
+        # /root/users/.. should go back to root
+        results = xpath(doc, "/root/users/..")
+        @test length(results) == 1
+        @test tag(results[1]) == "root"
+    end
+
+    @testset "// mid-path" begin
+        # /root//name finds all <name> elements anywhere under root
+        results = xpath(doc, "/root//name")
+        @test length(results) == 3
+        @test all(n -> tag(n) == "name", results)
+    end
+
+    @testset "// with wildcard //*" begin
+        doc2 = parse("<r><a><b/></a><c/></r>", Node)
+        results = xpath(doc2, "//*")
+        tags = [tag(n) for n in results if nodetype(n) === Element]
+        @test "r" in tags
+        @test "a" in tags
+        @test "b" in tags
+        @test "c" in tags
+    end
+
+    @testset "// with text()" begin
+        results = xpath(doc, "//text()")
+        vals = [value(n) for n in results]
+        # All three user names and the theme value must be found; any other
+        # text nodes in the result only raise the count, hence a lower bound.
+        @test length(results) >= 4
+        @test issubset(["Alice", "Bob", "Carol", "dark"], vals)
+    end
+
+    @testset "multiple // segments" begin
+        results = xpath(doc, "//users//name")
+        @test length(results) == 3
+        @test all(n -> tag(n) == "name", results)
+    end
+
+    @testset "chained predicates" begin
+        results = xpath(doc, "/root/users/user[@role='admin'][1]")
+        @test length(results) == 1
+        @test results[1]["id"] == "1"
+    end
+
+    @testset "@attr with no match" begin
+        results = xpath(doc, "//user/@nonexistent")
+        @test isempty(results)
+    end
+
+    @testset "namespaced tag" begin
+        doc2 = parse("""<root xmlns:ns="http://example.com"><ns:item>val</ns:item></root>""", Node)
+        results = xpath(doc2, "/root/ns:item")
+        @test length(results) == 1
+        @test tag(results[1]) == "ns:item"
+    end
+
+    @testset "whitespace in expression" begin
+        results = xpath(doc, " / root / users / user ")
+        @test length(results) == 3
+    end
+
+    @testset "error: empty @" begin
+        @test_throws ErrorException xpath(doc, "/root/@")
+    end
+
+    @testset "error: unknown function" begin
+        @test_throws ErrorException xpath(doc, "/root/foo()")
+    end
+
+    @testset "error: unexpected character" begin
+        @test_throws ErrorException xpath(doc, "/root/!bad")
+    end
+
+    @testset "deep nesting" begin
+        doc2 = parse("<a><b><c><d><e>deep</e></d></c></b></a>", Node)
+        results = xpath(doc2, "//e/text()")
+        @test length(results) == 1
+        @test value(results[1]) == "deep"
+    end
+
+    @testset "wildcard with predicate" begin
+        doc2 = parse("""<r><a x="1"/><b x="2"/><c/></r>""", Node)
+        results = xpath(doc2, "/r/*[@x]")
+        @test length(results) == 2
+    end
+
+    @testset "// from non-document node" begin
+        root = xpath(doc, "/root")[1]
+        results = xpath(root, "//name")
+        @test length(results) == 3
+    end
+end
diff --git a/test/test_libxml2_testcases.jl b/test/test_libxml2_testcases.jl
new file mode 100644
index 0000000..0b8a89a
--- /dev/null
+++ b/test/test_libxml2_testcases.jl
@@ -0,0 +1,1578 @@
+# Test cases borrowed from the libxml2 test suite (https://github.com/GNOME/libxml2).
+#
+# libxml2 is Copyright (C) the GNOME Project and contributors, licensed under the MIT License.
+# These test cases are adapted for the XML.jl Julia package.
+#
+# Categories mirror the libxml2 test/ directory structure:
+#   - CDATA handling
+#   - Comments
+#   - Processing instructions
+#   - Attributes (normalization, entities, quoting)
+#   - Namespaces
+#   - DTD / internal subset
+#   - Entity references (character refs, predefined, internal general)
+#   - Whitespace / blank handling
+#   - Well-formedness (boundaries, big names, mixed content)
+#   - Error cases (must fail to parse)
+
+using XML
+using XML: Document, Element, Declaration, Comment, CData, DTD, ProcessingInstruction, Text
+using XML: escape, unescape
+using Test
+
+@testset "libxml2 test cases" begin
+
+#==============================================================================#
+#                            CDATA SECTIONS                                    #
+#   From: test/cdata, test/cdata2, test/adjacent-cdata.xml,                   #
+#         test/emptycdata.xml, test/cdata-*-byte-UTF-8.xml                    #
+#==============================================================================#
+@testset "CDATA" begin
+    @testset "cdata: basic CDATA with markup characters" begin
+        # libxml2 test/cdata: markup inside a CDATA section must come back verbatim
+        xml = """<doc>\n<![CDATA[<greeting>Hello, world!</greeting>]]>\n</doc>"""
+        doc = parse(xml, Node)
+        root = doc[1]
+        cdata_nodes = filter(x -> nodetype(x) == CData, children(root))
+        @test length(cdata_nodes) == 1  # the input holds exactly one CDATA section
+        @test value(cdata_nodes[1]) == "<greeting>Hello, world!</greeting>"
+    end
+
+    @testset "cdata2: nested CDATA-like content" begin
+        # libxml2 test/cdata2 - the ]]> escaping pattern; only the root tag is asserted here
+        xml = """<?xml version="1.0" encoding="UTF-8"?>
+<collection>
+  <test><![CDATA[
+    <![CDATA[abc]]]>]&gt;<![CDATA[
+  ]]></test>
+</collection>"""
+        doc = parse(xml, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        @test tag(root) == "collection"
+    end
+
+    @testset "adjacent-cdata: three adjacent CDATA sections" begin
+        # libxml2 test/adjacent-cdata.xml: back-to-back sections must stay separate nodes
+        xml = "<doc><![CDATA[abc]]><![CDATA[def]]><![CDATA[ghi]]></doc>"
+        doc = parse(xml, Node)
+        root = doc[1]
+        cdata_nodes = filter(x -> nodetype(x) == CData, children(root))
+        @test length(cdata_nodes) == 3
+        @test value(cdata_nodes[1]) == "abc"
+        @test value(cdata_nodes[2]) == "def"
+        @test value(cdata_nodes[3]) == "ghi"
+    end
+
+    @testset "emptycdata: empty CDATA section in namespaced doc" begin
+        # libxml2 test/emptycdata.xml: an empty section still yields a node, with value ""
+        xml = """<?xml version="1.0"?>
+<html xmlns="http://www.w3.org/1999/xhtml">
+<![CDATA[]]>
+</html>"""
+        doc = parse(xml, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        @test tag(root) == "html"
+        cdata_nodes = filter(x -> nodetype(x) == CData, children(root))
+        @test length(cdata_nodes) == 1  # the input holds exactly one CDATA section
+        @test value(cdata_nodes[1]) == ""
+    end
+
+    @testset "cdata-2-byte-UTF-8: two-byte chars across buffer boundary" begin
+        # libxml2 test/cdata-2-byte-UTF-8.xml - tests Č (U+010C, 2 bytes in UTF-8), 400 copies
+        long_c = repeat("Č", 400)
+        xml = """<?xml version="1.0" encoding="UTF-8"?>\n<doc>\n<p><![CDATA[$(long_c)]]></p>\n</doc>"""
+        doc = parse(xml, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        p = first(filter(x -> nodetype(x) == Element, children(root)))
+        cdata = first(filter(x -> nodetype(x) == CData, children(p)))
+        @test value(cdata) == long_c
+    end
+
+    @testset "cdata-3-byte-UTF-8: three-byte chars across buffer boundary" begin
+        # libxml2 test/cdata-3-byte-UTF-8.xml - tests 牛 (U+725B, 3 bytes in UTF-8), 400 copies
+        long_cow = repeat("牛", 400)
+        xml = """<?xml version="1.0" encoding="UTF-8"?>\n<doc>\n<p><![CDATA[$(long_cow)]]></p>\n</doc>"""
+        doc = parse(xml, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        p = first(filter(x -> nodetype(x) == Element, children(root)))
+        cdata = first(filter(x -> nodetype(x) == CData, children(p)))
+        @test value(cdata) == long_cow
+    end
+
+    @testset "cdata-4-byte-UTF-8: four-byte chars across buffer boundary" begin
+        # libxml2 test/cdata-4-byte-UTF-8.xml - tests 🍦 (U+1F366, 4 bytes in UTF-8), 334 copies
+        long_ice = repeat("🍦", 334)
+        xml = """<?xml version="1.0" encoding="UTF-8"?>\n<doc>\n<p><![CDATA[$(long_ice)]]></p>\n</doc>"""
+        doc = parse(xml, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        p = first(filter(x -> nodetype(x) == Element, children(root)))
+        cdata = first(filter(x -> nodetype(x) == CData, children(p)))
+        @test value(cdata) == long_ice
+    end
+end
+
+#==============================================================================#
+#                              COMMENTS                                        #
+#   From: test/comment.xml through test/comment6.xml, test/badcomment.xml      #
+#==============================================================================#
+@testset "Comments" begin
+    @testset "comment: comments inside element" begin
+        # libxml2 test/comment.xml: two comments that are children of the root element
+        xml = """<?xml version="1.0"?>
+<doc>
+<!-- document start -->
+<empty/>
+<!-- document end -->
+</doc>"""
+        doc = parse(xml, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        comments = filter(x -> nodetype(x) == Comment, children(root))
+        @test length(comments) == 2
+        @test contains(value(comments[1]), "document start")
+        @test contains(value(comments[2]), "document end")
+    end
+
+    @testset "comment2: comments outside root element" begin
+        # libxml2 test/comment2.xml: comments before and after the root element
+        xml = """<?xml version="1.0"?>
+<!-- document start -->
+<doc>
+<empty/>
+</doc>
+<!-- document end -->"""
+        doc = parse(xml, Node)
+        top_comments = filter(x -> nodetype(x) == Comment, children(doc))
+        @test length(top_comments) == 2
+        @test contains(value(top_comments[1]), "document start")
+        @test contains(value(top_comments[2]), "document end")
+    end
+
+    @testset "comment3: very long comment (buffer boundary test)" begin
+        # libxml2 test/comment3.xml - 150 lines of 50 digits each, joined by newlines
+        lines = join(fill("01234567890123456789012345678901234567890123456789", 150), "\n")
+        comment_text = " test of very very long comments and buffer limits\n" * lines * "\n"
+        xml = """<?xml version="1.0"?>\n<!--$(comment_text)-->\n<doc/>"""
+        doc = parse(xml, Node)
+        comments = filter(x -> nodetype(x) == Comment, children(doc))
+        @test length(comments) == 1  # the document holds exactly one comment
+        @test length(value(comments[1])) > 7000
+    end
+
+    @testset "comment5: hyphens and line breaks in comments" begin
+        # libxml2 test/comment5.xml: single hyphens inside a comment must not end it
+        xml = """<?xml version="1.0"?>
+<!-- test of hyphen and line break handling
+     some text - interrupted -
+- - - - - - - - - - - - - - - - - - - - - -
+                      this should stop here^
+
+
+-->
+<doc/>"""
+        doc = parse(xml, Node)
+        comments = filter(x -> nodetype(x) == Comment, children(doc))
+        @test length(comments) == 1
+        @test contains(value(comments[1]), "hyphen")
+        @test contains(value(comments[1]), "- - -")
+    end
+
+    @testset "comment6: comment before DOCTYPE" begin
+        # libxml2 test/comment6.xml: node order must be Comment, DTD, Element
+        xml = """<!--
+long comment long comment long comment long comment long comment long comment
+long comment long comment long comment long comment long comment long comment
+long comment long comment long comment long comment long comment long comment
+-->
+<!DOCTYPE a [
+<!ELEMENT a EMPTY>
+]>
+<a/>"""
+        doc = parse(xml, Node)
+        typed = filter(x -> nodetype(x) != Text, children(doc))  # ignore any Text nodes between them
+        @test nodetype(typed[1]) == Comment
+        @test nodetype(typed[2]) == DTD
+        @test nodetype(typed[3]) == Element
+    end
+
+    @testset "badcomment: comment with markup-like content" begin
+        # libxml2 test/badcomment.xml - note: libxml2 considers this valid XML
+        xml = """<?xml version="1.0" encoding="UTF-8"?>
+
+<foo>
+<!-- def='NT-Char'-->
+</foo>"""
+        doc = parse(xml, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        @test tag(root) == "foo"
+        comments = filter(x -> nodetype(x) == Comment, children(root))
+        @test length(comments) == 1  # <foo> holds exactly one comment
+    end
+
+    @testset "comment4: non-ASCII characters in comments" begin
+        # libxml2 test/comment4.xml (adapted from ISO-8859-1 to UTF-8)
+        xml = """<?xml version="1.0"?>
+<!-- test of non ascii comments like là et très -->
+<!--à another one -->
+<!-- another one à-->
+<doc/>"""
+        doc = parse(xml, Node)
+        comments = filter(x -> nodetype(x) == Comment, children(doc))
+        @test length(comments) == 3
+        @test contains(value(comments[1]), "là")
+        @test all(c -> contains(value(c), "à"), comments[2:3])  # à at the start and at the end of a comment
+    end
+end
+
+#==============================================================================#
+#                        PROCESSING INSTRUCTIONS                               #
+#   From: test/pi.xml, test/pi2.xml                                           #
+#==============================================================================#
+@testset "Processing Instructions" begin
+    @testset "pi: PIs inside root element" begin
+        # libxml2 test/pi.xml: two PIs appear as children of <doc>
+        src = """<?xml version="1.0"?>
+<doc>
+<?document-start doc?>
+<empty/>
+<?document-end doc?>
+</doc>"""
+        doc = parse(src, Node)
+        top = first(filter(==(Element) ∘ nodetype, children(doc)))
+        instrs = filter(==(ProcessingInstruction) ∘ nodetype, children(top))
+        @test length(instrs) == 2
+        @test tag(instrs[1]) == "document-start"  # tag is the PI target
+        @test value(instrs[1]) == "doc"  # value is the PI content
+        @test tag(instrs[2]) == "document-end"
+        @test value(instrs[2]) == "doc"
+    end
+
+    @testset "pi2: PIs outside root element" begin
+        # libxml2 test/pi2.xml: one PI before and one after the root element
+        src = """<?xml version="1.0"?>
+<?document-start doc?>
+<doc>
+<empty/>
+</doc>
+<?document-end doc?>"""
+        doc = parse(src, Node)
+        outer = filter(==(ProcessingInstruction) ∘ nodetype, children(doc))
+        @test length(outer) == 2
+        @test tag(outer[1]) == "document-start"
+        @test tag(outer[2]) == "document-end"
+    end
+end
+
+#==============================================================================#
+#                            ATTRIBUTES                                        #
+#   From: test/att1 through test/att11, test/attrib.xml,                       #
+#         test/def-xml-attr.xml, test/defattr.xml                              #
+#==============================================================================#
+@testset "Attributes" begin
+    @testset "att1: attribute with newlines (whitespace normalization)" begin
+        # libxml2 test/att1: only presence of the attribute is checked, not its normalized value
+        src = "<doc attr=\"to normalize\nwith a    space\"/>"
+        doc = parse(src, Node)
+        @test tag(doc[1]) == "doc"
+        @test haskey(doc[1], "attr")
+    end
+
+    @testset "att2: attribute with multiple spaces" begin
+        # libxml2 test/att2: the double space inside the value is expected to survive
+        src = """<doc attr="to normalize  with a space"/>"""
+        doc = parse(src, Node)
+        @test doc[1]["attr"] == "to normalize  with a space"
+    end
+
+    @testset "att3: attribute with character references" begin
+        # libxml2 test/att3: &#10; and &#160; inside an attribute; only presence is checked
+        src = """<select onclick="aaaa&#10;      bbbb&#160;">f&#160;oo</select>"""
+        doc = parse(src, Node)
+        @test tag(doc[1]) == "select"
+        @test haskey(doc[1], "onclick")
+    end
+
+    @testset "att4: complex document with many attributes" begin
+        # Adapted from libxml2 test/att4 (electroxml document); checks the root element only
+        src = """<?xml version="1.0" encoding="UTF-8"?>
+<electroxml modified="20021216T072726">
+  <data from="20021031T22" to="20021130T22">
+    <select>
+      <device serialnumb="E00003562">
+        <par memind="113400" h="3dc1a8de">
+          <val o="0" v="53"/>
+          <val o="e08" v="53"/>
+        </par>
+      </device>
+    </select>
+  </data>
+</electroxml>"""
+        doc = parse(src, Node)
+        top = first(filter(==(Element) ∘ nodetype, children(doc)))
+        @test tag(top) == "electroxml"
+        @test top["modified"] == "20021216T072726"
+    end
+
+    @testset "attrib: attribute with entities and char refs" begin
+        # libxml2 test/attrib.xml: the entity-bearing title is not asserted, only its neighbours
+        src = """<item title="Warning: &apos;test&apos;&#160;&#160;" url="http://example.com/" first_time="985034339" visits="1"/>"""
+        doc = parse(src, Node)
+        @test tag(doc[1]) == "item"
+        @test doc[1]["url"] == "http://example.com/"
+        @test doc[1]["visits"] == "1"
+    end
+
+    @testset "att5: attribute with empty value" begin
+        # Adapted from libxml2 test/att5: an empty value next to a non-empty one
+        src = """<?xml version="1.0"?>
+<doc a="" b="val"/>"""
+        doc = parse(src, Node)
+        top = first(filter(==(Element) ∘ nodetype, children(doc)))
+        @test top["a"] == ""
+        @test top["b"] == "val"
+    end
+
+    @testset "att9: attribute with single quotes in double-quoted value" begin
+        # libxml2 test/att9 pattern: an apostrophe does not end a double-quoted value
+        src = """<doc attr="it's a test"/>"""
+        doc = parse(src, Node)
+        @test doc[1]["attr"] == "it's a test"
+    end
+
+    @testset "att10: attribute with double quotes in single-quoted value" begin  # mirror image of att9
+        src = """<doc attr='he said "hello"'/>"""
+        doc = parse(src, Node)
+        @test doc[1]["attr"] == "he said \"hello\""
+    end
+
+    @testset "att11: attribute values with entity refs" begin  # predefined entities come back decoded
+        src = """<doc a="&lt;tag&gt;" b="a&amp;b"/>"""
+        doc = parse(src, Node)
+        @test doc[1]["a"] == "<tag>"
+        @test doc[1]["b"] == "a&b"
+    end
+
+    @testset "def-xml-attr: xml:lang default attribute in DTD" begin
+        # libxml2 test/def-xml-attr.xml (smoke test: parsing must succeed and find <root>)
+        src = """<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE root [
+  <!ATTLIST foo xml:lang CDATA "eng">
+  <!ATTLIST foo bar CDATA "&lt;&gt;&quot;">
+]>
+<root>
+  <foo/>
+</root>"""
+        doc = parse(src, Node)
+        top = first(filter(==(Element) ∘ nodetype, children(doc)))
+        @test tag(top) == "root"
+    end
+end
+
+#==============================================================================#
+#                            NAMESPACES                                        #
+#   From: test/ns through test/ns7, test/namespaces/err_*.xml,                #
+#         test/nsclean.xml, test/entity-in-ns-uri.xml                          #
+#==============================================================================#
+@testset "Namespaces" begin
+    @testset "ns: namespace with prefix on element and attribute" begin
+        # libxml2 test/ns: prefixed names are compared as written, prefix included
+        src = """<?xml version="1.0"?>
+<dia:diagram xmlns:dia="http://www.lysator.liu.se/~alla/dia/">
+  <dia:diagramdata dia:testattr="test"/>
+</dia:diagram>"""
+        doc = parse(src, Node)
+        top = first(filter(==(Element) ∘ nodetype, children(doc)))
+        @test tag(top) == "dia:diagram"
+        @test top["xmlns:dia"] == "http://www.lysator.liu.se/~alla/dia/"
+        inner = first(filter(==(Element) ∘ nodetype, children(top)))
+        @test tag(inner) == "dia:diagramdata"
+        @test inner["dia:testattr"] == "test"
+    end
+
+    @testset "ns2: namespace on self-closing element" begin
+        # libxml2 test/ns2: declaration and prefixed attribute on one self-closing tag
+        src = """<?xml version="1.0"?>
+<dia:diagram xmlns:dia="http://www.lysator.liu.se/~alla/dia/"
+             dia:testattr="test"/>"""
+        doc = parse(src, Node)
+        top = first(filter(==(Element) ∘ nodetype, children(doc)))
+        @test tag(top) == "dia:diagram"
+        @test top["dia:testattr"] == "test"
+    end
+
+    @testset "ns3: xmlns declared after prefixed attribute" begin
+        # libxml2 test/ns3: the prefix is used before the xmlns attribute that declares it
+        src = """<?xml version="1.0"?>
+<dia:diagram dia:testattr="test"
+             xmlns:dia="http://www.lysator.liu.se/~alla/dia/"/>"""
+        doc = parse(src, Node)
+        top = first(filter(==(Element) ∘ nodetype, children(doc)))
+        @test top["dia:testattr"] == "test"
+        @test top["xmlns:dia"] == "http://www.lysator.liu.se/~alla/dia/"
+    end
+
+    @testset "ns4: xml:lang, xml:link, xml:space built-in attributes" begin
+        # libxml2 test/ns4: xml:-prefixed attributes without any declaration (xml:link is not asserted)
+        src = """<?xml version="1.0"?>
+<diagram testattr="test" xml:lang="en" xml:link="simple" xml:space="preserve"/>"""
+        doc = parse(src, Node)
+        top = first(filter(==(Element) ∘ nodetype, children(doc)))
+        @test top["xml:lang"] == "en"
+        @test top["xml:space"] == "preserve"
+    end
+
+    @testset "ns5: default namespace on element with prefix on another" begin
+        # libxml2 test/ns5: a default and a prefixed declaration bound to the same URI
+        src = """<element name="foo" xmlns:rng="http://example.org/ns/1" xmlns="http://example.org/ns/1">
+  <empty/>
+</element>"""
+        doc = parse(src, Node)
+        top = doc[1]
+        @test top["xmlns"] == "http://example.org/ns/1"
+        @test top["xmlns:rng"] == "http://example.org/ns/1"
+        @test top["name"] == "foo"
+    end
+
+    @testset "ns6: default namespace on child, not on sibling" begin
+        # libxml2 test/ns6: xmlns is written on <foo> only
+        src = """<root>
+  <foo xmlns="http://abc" />
+  <bar />
+</root>"""
+        doc = parse(src, Node)
+        top = doc[1]
+        kids = filter(==(Element) ∘ nodetype, children(top))
+        @test tag(kids[1]) == "foo"
+        @test kids[1]["xmlns"] == "http://abc"
+        @test tag(kids[2]) == "bar"
+    end
+
+    @testset "ns7: xml: prefix element (built-in)" begin
+        # libxml2 test/ns7: an element named with the xml: prefix and no declaration
+        src = "<xml:test/>"
+        doc = parse(src, Node)
+        @test tag(doc[1]) == "xml:test"
+    end
+
+    @testset "multiple namespace prefixes" begin  # two prefixes declared on the root, one used per child
+        src = """<root xmlns:a="http://a.com" xmlns:b="http://b.com">
+  <a:child a:attr="1"/>
+  <b:child b:attr="2"/>
+</root>"""
+        doc = parse(src, Node)
+        top = doc[1]
+        kids = filter(==(Element) ∘ nodetype, children(top))
+        @test tag(kids[1]) == "a:child"
+        @test kids[1]["a:attr"] == "1"
+        @test tag(kids[2]) == "b:child"
+        @test kids[2]["b:attr"] == "2"
+    end
+
+    @testset "namespace redeclaration on nested element" begin  # the child's own xmlns:a value is what it reports
+        src = """<root xmlns:a="http://first.com">
+  <child xmlns:a="http://second.com">
+    <a:leaf/>
+  </child>
+</root>"""
+        doc = parse(src, Node)
+        top = doc[1]
+        inner = first(filter(==(Element) ∘ nodetype, children(top)))
+        @test inner["xmlns:a"] == "http://second.com"
+    end
+end
+
+#==============================================================================#
+#                    DTD / INTERNAL SUBSET                                     #
+#   From: test/dtd1 through test/dtd13, test/intsubset.xml,                   #
+#         test/intsubset2.xml                                                  #
+#==============================================================================#
+@testset "DTD / Internal Subset" begin
+    @testset "dtd1: DOCTYPE with PUBLIC id" begin
+        # libxml2 test/dtd1: DOCTYPE that names a PUBLIC identifier and a system URL
+        src = """<?xml version="1.0"?>
+<!DOCTYPE MEMO PUBLIC "-//SGMLSOURCE//DTD MEMO//EN"
+                      "http://www.sgmlsource.com/dtds/memo.dtd">
+<MEMO>
+</MEMO>"""
+        doc = parse(src, Node)
+        doctype = first(filter(==(DTD) ∘ nodetype, children(doc)))
+        @test occursin("MEMO", value(doctype))
+        @test occursin("PUBLIC", value(doctype))
+    end
+
+    @testset "dtd2: simple internal subset with ELEMENT declaration" begin
+        # libxml2 test/dtd2: the DTD node keeps the declaration text; the root keeps its text
+        src = """<!DOCTYPE doc [
+<!ELEMENT doc (#PCDATA)>
+]>
+<doc>This is a valid document !</doc>"""
+        doc = parse(src, Node)
+        doctype = first(filter(==(DTD) ∘ nodetype, children(doc)))
+        @test occursin("ELEMENT", value(doctype))
+        top = first(filter(==(Element) ∘ nodetype, children(doc)))
+        @test simple_value(top) == "This is a valid document !"
+    end
+
+    @testset "dtd3: ANY content model" begin
+        # libxml2 test/dtd3: the ANY keyword is retained in the DTD node's value
+        src = """<!DOCTYPE doc [
+<!ELEMENT doc ANY>
+]>
+<doc>This is a valid document !</doc>"""
+        doc = parse(src, Node)
+        doctype = first(filter(==(DTD) ∘ nodetype, children(doc)))
+        @test occursin("ANY", value(doctype))
+    end
+
+    @testset "dtd4: EMPTY content model" begin
+        # libxml2 test/dtd4: the closing "]>" shares a line with the last declaration
+        src = """<?xml version="1.0"?>
+<!DOCTYPE doc [
+<!ELEMENT doc EMPTY>]>
+<doc/>"""
+        doc = parse(src, Node)
+        top = first(filter(==(Element) ∘ nodetype, children(doc)))
+        @test tag(top) == "doc"
+        @test isempty(children(top))
+    end
+
+    @testset "dtd5: mixed content model" begin
+        # libxml2 test/dtd5: text interleaved with <a> and <b>; only the elements are counted
+        src = """<!DOCTYPE doc [
+<!ELEMENT doc (#PCDATA | a | b)*>
+<!ELEMENT a (#PCDATA)>
+<!ELEMENT b (#PCDATA)>
+]>
+<doc><a>This</a> is a <b>valid</b> document</doc>"""
+        doc = parse(src, Node)
+        top = first(filter(==(Element) ∘ nodetype, children(doc)))
+        @test tag(top) == "doc"
+        kids = filter(==(Element) ∘ nodetype, children(top))
+        @test length(kids) == 2
+        @test tag(kids[1]) == "a"
+        @test tag(kids[2]) == "b"
+    end
+
+    @testset "dtd6: choice content model" begin
+        # libxml2 test/dtd6: three element children written as <a>, <b>, <a>
+        src = """<!DOCTYPE doc [
+<!ELEMENT doc (a | b)*>
+<!ELEMENT a (#PCDATA)>
+<!ELEMENT b (#PCDATA)>
+]>
+<doc><a>This</a><b> is a valid</b><a> document</a></doc>"""
+        doc = parse(src, Node)
+        top = first(filter(==(Element) ∘ nodetype, children(doc)))
+        kids = filter(==(Element) ∘ nodetype, children(top))
+        @test length(kids) == 3
+    end
+
+    @testset "dtd7: sequence content model" begin
+        # libxml2 test/dtd7: <a> followed by <b>, in document order
+        src = """<!DOCTYPE doc [
+<!ELEMENT doc (a , b)*>
+<!ELEMENT a (#PCDATA)>
+<!ELEMENT b (#PCDATA)>
+]>
+<doc><a>This</a><b> is a valid document</b></doc>"""
+        doc = parse(src, Node)
+        top = first(filter(==(Element) ∘ nodetype, children(doc)))
+        kids = filter(==(Element) ∘ nodetype, children(top))
+        @test length(kids) == 2
+        @test tag(kids[1]) == "a"
+        @test tag(kids[2]) == "b"
+    end
+
+    @testset "dtd8: nested choice and sequence" begin
+        # libxml2 test/dtd8: parenthesised groups inside the content model
+        src = """<!DOCTYPE doc [
+<!ELEMENT doc ((a | b) , (c | d))+>
+<!ELEMENT a (#PCDATA)>
+<!ELEMENT b (#PCDATA)>
+<!ELEMENT c (#PCDATA)>
+<!ELEMENT d (#PCDATA)>
+]>
+<doc><b>This</b><c> is a valid document</c></doc>"""
+        doc = parse(src, Node)
+        top = first(filter(==(Element) ∘ nodetype, children(doc)))
+        kids = filter(==(Element) ∘ nodetype, children(top))
+        @test tag(kids[1]) == "b"
+        @test tag(kids[2]) == "c"
+    end
+
+    @testset "dtd9: optional content model" begin
+        # libxml2 test/dtd9: content model ending in '?'
+        src = """<!DOCTYPE doc [
+<!ELEMENT doc ((a | b | c) , d)?>
+<!ELEMENT a (#PCDATA)>
+<!ELEMENT b (#PCDATA)>
+<!ELEMENT c (#PCDATA)>
+<!ELEMENT d (#PCDATA)>
+]>
+<doc><b>This</b><d> is a valid document</d></doc>"""
+        doc = parse(src, Node)
+        top = first(filter(==(Element) ∘ nodetype, children(doc)))
+        kids = filter(==(Element) ∘ nodetype, children(top))
+        @test length(kids) == 2
+    end
+
+    @testset "dtd10: mixed repetition content model" begin
+        # libxml2 test/dtd10: '+' inside a group that is itself repeated with '*'
+        src = """<!DOCTYPE doc [
+<!ELEMENT doc ((a | b)+ , c ,  d)*>
+<!ELEMENT a (#PCDATA)>
+<!ELEMENT b (#PCDATA)>
+<!ELEMENT c (#PCDATA)>
+<!ELEMENT d (#PCDATA)>
+]>
+<doc><b>This</b><c> is a</c><d> valid document</d></doc>"""
+        doc = parse(src, Node)
+        top = first(filter(==(Element) ∘ nodetype, children(doc)))
+        kids = filter(==(Element) ∘ nodetype, children(top))
+        @test length(kids) == 3
+    end
+
+    @testset "dtd11: ATTLIST with CDATA #IMPLIED" begin
+        # libxml2 test/dtd11: an attribute declared in the DTD and supplied in the document
+        src = """<!DOCTYPE doc [
+<!ELEMENT doc (#PCDATA)>
+<!ATTLIST doc val CDATA #IMPLIED>
+]>
+<doc val="v1"/>"""
+        doc = parse(src, Node)
+        top = first(filter(==(Element) ∘ nodetype, children(doc)))
+        @test top["val"] == "v1"
+    end
+
+    @testset "dtd12: nested entity references" begin
+        # libxml2 test/dtd12 - entity referencing another entity
+        src = """<!DOCTYPE doc [
+<!ENTITY YN '"Yes"' >
+<!ENTITY WhatHeSaid "He said &YN;" >
+]>
+<doc>&WhatHeSaid;</doc>"""
+        # Expansion is deliberately not asserted: it may or may not happen in XML.jl
+        doc = parse(src, Node)
+        top = first(filter(==(Element) ∘ nodetype, children(doc)))
+        @test tag(top) == "doc"
+    end
+
+    @testset "dtd13: comments before and after DOCTYPE" begin
+        # libxml2 test/dtd13: node order must be Comment, DTD, Comment, Element
+        src = """<!-- comment before the DTD -->
+<!DOCTYPE doc [
+<!ELEMENT doc ANY>
+]>
+<!-- comment after the DTD -->
+<doc/>"""
+        doc = parse(src, Node)
+        non_text = filter(!=(Text) ∘ nodetype, children(doc))  # drop any Text nodes between them
+        @test nodetype(non_text[1]) == Comment
+        @test nodetype(non_text[2]) == DTD
+        @test nodetype(non_text[3]) == Comment
+        @test nodetype(non_text[4]) == Element
+    end
+
+    @testset "intsubset: internal subset with comment containing quote" begin
+        # libxml2 test/intsubset.xml: a lone '"' inside a comment in the internal subset
+        src = """<?xml version="1.0" standalone="yes"?>
+<!DOCTYPE root [
+<!ELEMENT root  EMPTY>
+<!--  " -->
+]>
+<root/>"""
+        doc = parse(src, Node)
+        top = first(filter(==(Element) ∘ nodetype, children(doc)))
+        @test tag(top) == "root"
+        doctype = first(filter(==(DTD) ∘ nodetype, children(doc)))
+        @test occursin("ELEMENT", value(doctype))
+    end
+end
+
+#==============================================================================#
+#                        ENTITY REFERENCES                                     #
+#   From: test/ent1 through test/ent11, test/ent6hex                           #
+#==============================================================================#
+@testset "Entity References" begin
+    @testset "ent1: internal general entity declaration and use" begin
+        # libxml2 test/ent1: smoke test, expansion of &xml; is not asserted
+        xml = """<?xml version="1.0"?>
+<!DOCTYPE EXAMPLE SYSTEM "example.dtd" [
+<!ENTITY xml "Extensible Markup Language">
+]>
+<EXAMPLE>
+    &xml;
+</EXAMPLE>"""
+        doc = parse(xml, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        @test tag(root) == "EXAMPLE"
+    end
+
+    @testset "ent3: entity refs in attribute values" begin
+        # libxml2 test/ent3: predefined entities inside attribute values come back decoded
+        xml = """<?xml version="1.0"?>
+<!DOCTYPE EXAMPLE SYSTEM "example.dtd" [
+<!ENTITY xml "Extensible Markup Language">
+]>
+<EXAMPLE prop1="a&amp;b" prop2="c&lt;d">
+</EXAMPLE>"""
+        doc = parse(xml, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        @test root["prop1"] == "a&b"
+        @test root["prop2"] == "c<d"
+    end
+
+    @testset "ent5: numeric character references (decimal and hex)" begin
+        # libxml2 test/ent5: one hexadecimal and one decimal reference in text content
+        xml = """<?xml version="1.0"?>
+<EXAMPLE>
+    This is an inverted exclamation sign &#xA1;
+    This is a space &#32;
+</EXAMPLE>"""
+        doc = parse(xml, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        text = join([value(c) for c in children(root) if nodetype(c) == Text])
+        @test contains(text, "\u00A1")  # ¡
+        @test contains(text, "a space  ")  # literal space followed by the decoded &#32;
+    end
+
+    @testset "ent6: predefined entities with double-escaping" begin
+        # libxml2 test/ent6: redeclares the predefined entities; only the root tag is asserted
+        xml = """<!DOCTYPE doc [
+<!ENTITY lt     "&#38;#60;">
+<!ENTITY gt     "&#62;">
+<!ENTITY amp    "&#38;#38;">
+<!ENTITY apos   "&#39;">
+<!ENTITY quot   "&#34;">
+]>
+<doc a="&lt;">&lt;</doc>"""
+        doc = parse(xml, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        @test tag(root) == "doc"
+    end
+
+    @testset "ent8: multiple entities in one document" begin
+        # libxml2 test/ent8: two adjacent entity references; only the root tag is asserted
+        xml = """<!DOCTYPE doc [
+<!ENTITY test1 "test 1">
+<!ENTITY test2 "test 2">
+]>
+<doc>
+&test1;&test2;
+</doc>"""
+        doc = parse(xml, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        @test tag(root) == "doc"
+    end
+
+    @testset "predefined entities in text content" begin  # all five predefined entities, in text
+        xml = "<doc>&amp; &lt; &gt; &apos; &quot;</doc>"
+        doc = parse(xml, Node)
+        @test simple_value(doc[1]) == "& < > ' \""
+    end
+
+    @testset "predefined entities in attributes" begin  # all five predefined entities, in attribute values
+        xml = """<doc a="&amp;" b="&lt;" c="&gt;" d="&apos;" e="&quot;"/>"""
+        doc = parse(xml, Node)
+        @test doc[1]["a"] == "&"
+        @test doc[1]["b"] == "<"
+        @test doc[1]["c"] == ">"
+        @test doc[1]["d"] == "'"
+        @test doc[1]["e"] == "\""
+    end
+
+    @testset "decimal character references" begin  # &#65; &#66; &#67; are A, B, C
+        xml = "<doc>&#65;&#66;&#67;</doc>"
+        doc = parse(xml, Node)
+        @test simple_value(doc[1]) == "ABC"
+    end
+
+    @testset "hexadecimal character references" begin  # &#x41; &#x42; &#x43; are A, B, C
+        xml = "<doc>&#x41;&#x42;&#x43;</doc>"
+        doc = parse(xml, Node)
+        @test simple_value(doc[1]) == "ABC"
+    end
+
+    @testset "mixed hex and decimal char refs" begin  # alternating notations within one text run
+        xml = "<doc>&#x48;&#101;&#x6C;&#108;&#x6F;</doc>"
+        doc = parse(xml, Node)
+        @test simple_value(doc[1]) == "Hello"
+    end
+
+    @testset "char ref for non-ASCII: inverted exclamation" begin  # U+00A1
+        xml = "<doc>&#xA1;</doc>"
+        doc = parse(xml, Node)
+        @test simple_value(doc[1]) == "\u00A1"
+    end
+
+    @testset "char ref for CJK character" begin  # U+4E2D
+        xml = "<doc>&#x4E2D;</doc>"
+        doc = parse(xml, Node)
+        @test simple_value(doc[1]) == "中"
+    end
+
+    @testset "char ref for emoji" begin  # U+1F600, a code point above U+FFFF
+        xml = "<doc>&#x1F600;</doc>"
+        doc = parse(xml, Node)
+        @test simple_value(doc[1]) == "\U0001F600"
+    end
+end
+
+#==============================================================================#
+#                     WHITESPACE / BLANK HANDLING                              #
+#   From: test/tstblanks.xml, test/title.xml                                  #
+#==============================================================================#
+@testset "Whitespace / Blank Handling" begin
+    @testset "title: simple document with encoding" begin
+        # libxml2 test/title.xml: declaration with an encoding, then a single text-only element
+        src = """<?xml version="1.0" encoding="utf-8"?>
+<title>my title</title>"""
+        doc = parse(src, Node)
+        top = first(filter(==(Element) ∘ nodetype, children(doc)))
+        @test tag(top) == "title"
+        @test simple_value(top) == "my title"
+    end
+
+    @testset "whitespace preservation in text content" begin  # leading, inner and trailing runs of spaces
+        src = "<root>  hello  world  </root>"
+        doc = parse(src, Node)
+        @test simple_value(doc[1]) == "  hello  world  "
+    end
+
+    @testset "tab and newline preservation" begin  # tabs and newlines are returned unchanged
+        src = "<root>\t\n\ttabbed\n</root>"
+        doc = parse(src, Node)
+        @test simple_value(doc[1]) == "\t\n\ttabbed\n"
+    end
+
+    @testset "whitespace-only text node" begin  # content made of spaces only is still returned
+        src = "<root>   </root>"
+        doc = parse(src, Node)
+        @test simple_value(doc[1]) == "   "
+    end
+
+    @testset "inter-element whitespace preserved" begin  # at least one Text node sits between the elements
+        src = "<root>\n  <a/>\n  <b/>\n</root>"
+        doc = parse(src, Node)
+        top = doc[1]
+        blanks = filter(==(Text) ∘ nodetype, children(top))
+        @test length(blanks) >= 1
+    end
+end
+
+#==============================================================================#
+#                    WELL-FORMED DOCUMENTS                                     #
+#   From: test/boundaries1.xml, test/bigname.xml, test/bigname2.xml,          #
+#         test/slashdot.xml, test/eve.xml, test/wap.xml, etc.                 #
+#==============================================================================#
+@testset "Well-Formed Documents" begin
+    @testset "boundaries1: boundary conditions with entities and CDATA" begin
+        # libxml2 test/boundaries1.xml (simplified - without DTD entity expansion)
+        xml = """<?xml version="1.0"?>
+<!DOCTYPE d [
+    <!ENTITY a "]>">
+    <!ENTITY b ']>'>
+]>
+<?pi p1?>
+<d a=">" b='>'>
+text
+<![CDATA[cdata]]>
+<?pi p2?>
+</d>
+<?pi p3?>"""
+        doc = parse(xml, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        @test tag(root) == "d"
+        @test root["a"] == ">"
+        @test root["b"] == ">"
+        cdata_nodes = filter(x -> nodetype(x) == CData, children(root))
+        @test length(cdata_nodes) == 1
+        @test value(cdata_nodes[1]) == "cdata"
+    end
+
+    @testset "bigname: very long element name" begin
+        # libxml2 test/bigname.xml - element name with >10000 characters
+        longname = "this_is_a_very_large_name_" * repeat("0123456789", 1000) * "_end"
+        @test length(longname) > 10_000  # guard: the name really is as long as claimed
+        doc = parse("<$(longname)/>", Node)
+        @test tag(doc[1]) == longname
+    end
+
+    @testset "slashdot: real-world XML (ultramode feed)" begin
+        # libxml2 test/slashdot.xml (simplified)
+        xml = """<ultramode>
+ <story>
+    <title>100 Mbit/s on Fibre to the home</title>
+    <url>http://slashdot.org/articles/99/06/06/1440211.shtml</url>
+    <time>1999-06-06 14:39:59</time>
+    <author>CmdrTaco</author>
+    <department>wouldn't-it-be-nice</department>
+    <topic>internet</topic>
+    <comments>20</comments>
+    <section>articles</section>
+    <image>topicinternet.jpg</image>
+  </story>
+ <story>
+    <title>Gimp 1.2 Preview</title>
+    <url>http://slashdot.org/articles/99/06/06/1438246.shtml</url>
+    <time>1999-06-06 14:38:40</time>
+    <author>CmdrTaco</author>
+    <department>stuff-to-read</department>
+    <topic>gimp</topic>
+    <comments>12</comments>
+    <section>articles</section>
+    <image>topicgimp.gif</image>
+  </story>
+</ultramode>"""
+        doc = parse(xml, Node)
+        root = doc[1]
+        @test tag(root) == "ultramode"
+        stories = filter(x -> nodetype(x) == Element && tag(x) == "story", children(root))
+        @test length(stories) == 2
+        title1 = first(filter(x -> nodetype(x) == Element && tag(x) == "title",
+                              children(stories[1])))
+        @test simple_value(title1) == "100 Mbit/s on Fibre to the home"
+    end
+
+    @testset "eve: document with external DTD reference and internal entity" begin
+        # libxml2 test/eve.xml
+        xml = """<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE spec PUBLIC "-//testspec//" "dtds/eve.dtd" [
+<!ENTITY iso6.doc.date '29-May-1999'>
+]>
+<spec>
+</spec>"""
+        doc = parse(xml, Node)
+        dtd = first(filter(x -> nodetype(x) == DTD, children(doc)))
+        @test contains(value(dtd), "PUBLIC")
+        @test contains(value(dtd), "ENTITY")
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        @test tag(root) == "spec"
+    end
+
+    @testset "deeply nested document" begin
+        xml = "<a><b><c><d><e><f><g><h><i><j>deep</j></i></h></g></f></e></d></c></b></a>"
+        doc = parse(xml, Node)
+        @test simple_value(doc[1][1][1][1][1][1][1][1][1][1]) == "deep"
+    end
+
+    @testset "many sibling elements" begin
+        # 200 <item> siblings under a single root; check the count and both ends.
+        body = join("<item n=\"$k\">Item $k</item>" for k in 1:200)
+        root = parse("<root>$body</root>", Node)[1]
+        elements = filter(c -> nodetype(c) == Element, children(root))
+        @test length(elements) == 200
+        @test elements[1]["n"] == "1"
+        @test elements[200]["n"] == "200"
+    end
+
+    @testset "mixed content: text, elements, CDATA, comments, PIs" begin
+        xml = """<doc>
+  text before
+  <child attr="v">child text</child>
+  <!-- a comment -->
+  <![CDATA[cdata content]]>
+  <?pi data?>
+  text after
+</doc>"""
+        doc = parse(xml, Node)
+        root = doc[1]
+        types = Set(nodetype(c) for c in children(root))
+        @test Text in types
+        @test Element in types
+        @test Comment in types
+        @test CData in types
+        @test ProcessingInstruction in types
+    end
+
+    @testset "self-closing elements" begin
+        xml = "<root><br/><hr /><img  /></root>"
+        doc = parse(xml, Node)
+        elements = filter(x -> nodetype(x) == Element, children(doc[1]))
+        @test length(elements) == 3
+        @test tag(elements[1]) == "br"
+        @test tag(elements[2]) == "hr"
+        @test tag(elements[3]) == "img"
+        @test all(x -> length(children(x)) == 0, elements)
+    end
+
+    @testset "empty element: start-tag and end-tag" begin
+        xml = "<root><empty></empty></root>"
+        doc = parse(xml, Node)
+        el = first(filter(x -> nodetype(x) == Element, children(doc[1])))
+        @test tag(el) == "empty"
+    end
+
+    @testset "element names with hyphens, dots, underscores" begin
+        xml = "<my-root><sub.element/><_private/></my-root>"
+        doc = parse(xml, Node)
+        @test tag(doc[1]) == "my-root"
+        elements = filter(x -> nodetype(x) == Element, children(doc[1]))
+        @test tag(elements[1]) == "sub.element"
+        @test tag(elements[2]) == "_private"
+    end
+
+    @testset "element names starting with underscore" begin
+        xml = "<_root><__child/></_root>"
+        doc = parse(xml, Node)
+        @test tag(doc[1]) == "_root"
+    end
+
+    @testset "numeric element names (with letter prefix)" begin
+        xml = "<h1>heading</h1>"
+        doc = parse(xml, Node)
+        @test tag(doc[1]) == "h1"
+        @test simple_value(doc[1]) == "heading"
+    end
+end
+
+#==============================================================================#
+#                    ROUNDTRIP: PARSE → WRITE → PARSE                          #
+#   Tests that libxml2-style documents survive roundtrip processing            #
+#==============================================================================#
+@testset "Roundtrip" begin
+    @testset "roundtrip: namespaced document" begin
+        xml = """<?xml version="1.0"?>
+<dia:diagram xmlns:dia="http://www.lysator.liu.se/~alla/dia/">
+  <dia:diagramdata dia:testattr="test"/>
+</dia:diagram>"""
+        doc = parse(xml, Node)
+        s = XML.write(doc)
+        doc2 = parse(s, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc2)))
+        @test root["xmlns:dia"] == "http://www.lysator.liu.se/~alla/dia/"
+    end
+
+    @testset "roundtrip: DTD with internal subset" begin
+        xml = """<!DOCTYPE doc [
+<!ELEMENT doc (#PCDATA)>
+]>
+<doc>text</doc>"""
+        doc = parse(xml, Node)
+        s = XML.write(doc)
+        doc2 = parse(s, Node)
+        dtd = first(filter(x -> nodetype(x) == DTD, children(doc2)))
+        @test contains(value(dtd), "ELEMENT")
+    end
+
+    @testset "roundtrip: adjacent CDATA sections" begin
+        xml = "<doc><![CDATA[abc]]><![CDATA[def]]></doc>"
+        doc = parse(xml, Node)
+        s = XML.write(doc)
+        doc2 = parse(s, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc2)))
+        cdata_nodes = filter(x -> nodetype(x) == CData, children(root))
+        @test length(cdata_nodes) == 2
+    end
+
+    @testset "roundtrip: processing instructions" begin
+        xml = """<?xml version="1.0"?>
+<?document-start doc?>
+<doc/>
+<?document-end doc?>"""
+        doc = parse(xml, Node)
+        s = XML.write(doc)
+        doc2 = parse(s, Node)
+        pis = filter(x -> nodetype(x) == ProcessingInstruction, children(doc2))
+        @test length(pis) == 2
+    end
+
+    @testset "roundtrip: comments with special characters" begin
+        xml = "<root><!-- special: <>&'\" --></root>"
+        doc = parse(xml, Node)
+        s = XML.write(doc)
+        doc2 = parse(s, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc2)))
+        comments = filter(x -> nodetype(x) == Comment, children(root))
+        @test length(comments) == 1
+    end
+
+    @testset "roundtrip: entities in attributes" begin
+        xml = """<doc a="a&amp;b" b="c&lt;d"/>"""
+        doc = parse(xml, Node)
+        s = XML.write(doc)
+        doc2 = parse(s, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc2)))
+        @test root["a"] == "a&b"
+        @test root["b"] == "c<d"
+    end
+end
+
+#==============================================================================#
+#            ERROR CASES (malformed input: rejected or leniently accepted)     #
+#   From: test/errors/*, test/namespaces/err_*.xml                             #
+#==============================================================================#
+@testset "Error Cases" begin
+    @testset "errors/empty: empty document" begin
+        # libxml2 test/errors/empty.xml
+        # XML.jl is lenient: returns an empty Document for empty input
+        doc = parse("", Node)
+        @test nodetype(doc) == Document
+        @test length(children(doc)) == 0
+    end
+
+    @testset "errors/extra-content: content after root element" begin
+        # libxml2 test/errors/extra-content.xml
+        # XML.jl is lenient: treats trailing text as a Text node in the Document
+        doc = parse("<d/>x", Node)
+        @test nodetype(doc) == Document
+    end
+
+    @testset "errors/invalid-start-tag-1: text-only document" begin
+        # libxml2 test/errors/invalid-start-tag-1.xml
+        # XML.jl is lenient: treats bare text as a Text node
+        doc = parse("x", Node)
+        @test nodetype(doc) == Document
+    end
+
+    @testset "errors/invalid-start-tag-2: lone <" begin
+        # libxml2 test/errors/invalid-start-tag-2.xml
+        @test_throws Exception parse("<", Node)
+    end
+
+    @testset "errors/doctype1: malformed DOCTYPE" begin
+        # libxml2 test/errors/doctype1.xml - "<!DOCTYPE doc>[]>"
+        # XML.jl is lenient: parses the DOCTYPE and treats []> as text
+        doc = parse("<!DOCTYPE doc>[]>\n<doc/>", Node)
+        @test nodetype(doc) == Document
+    end
+
+    @testset "errors/dup-xml-attr: duplicate xml: attribute" begin
+        # libxml2 test/errors/dup-xml-attr.xml
+        @test_throws Exception parse("""<doc xml:lang="en" xml:lang="de"/>""", Node)
+    end
+
+    @testset "errors/attr5: duplicate attribute" begin
+        # libxml2 test/errors/attr5.xml
+        @test_throws Exception parse("""<d xmlns="urn:foo">
+    <a b="" b=""/>
+</d>""", Node)
+    end
+
+    @testset "mismatched tags" begin
+        @test_throws Exception parse("<a></b>", Node)
+    end
+
+    @testset "overlapping elements" begin
+        @test_throws Exception parse("<a><b></a></b>", Node)
+    end
+
+    @testset "unclosed root element" begin
+        @test_throws Exception parse("<root>", Node)
+    end
+
+    @testset "close tag without open" begin
+        @test_throws Exception parse("</a>", Node)
+    end
+
+    @testset "unclosed comment" begin
+        @test_throws Exception parse("<!-- no end", Node)
+    end
+
+    @testset "unclosed CDATA" begin
+        @test_throws Exception parse("<![CDATA[no end", Node)
+    end
+
+    @testset "unclosed PI" begin
+        @test_throws Exception parse("<?pi no end", Node)
+    end
+
+    @testset "unterminated attribute (double quote)" begin
+        @test_throws Exception parse("""<a x="no end""", Node)
+    end
+
+    @testset "unterminated attribute (single quote)" begin
+        @test_throws Exception parse("<a x='no end", Node)
+    end
+
+    @testset "duplicate attribute" begin
+        @test_throws Exception parse("""<a x="1" x="2"/>""", Node)
+    end
+
+    @testset "attribute without value" begin
+        @test_throws Exception parse("<a disabled/>", Node)
+    end
+
+    @testset "attribute with unquoted value" begin
+        @test_throws Exception parse("<a x=hello/>", Node)
+    end
+
+    @testset "tag with space before name" begin
+        @test_throws Exception parse("< root/>", Node)
+    end
+
+    @testset "lone < in text content" begin
+        @test_throws Exception parse("<root>a < b</root>", Node)
+    end
+
+    @testset "close tag after self-closing" begin
+        @test_throws Exception parse("<a/></a>", Node)
+    end
+
+    @testset "deeply mismatched nesting" begin
+        @test_throws Exception parse("<a><b><c></b></c></a>", Node)
+    end
+
+    @testset "multiple unclosed tags" begin
+        @test_throws Exception parse("<a><b><c>", Node)
+    end
+end
+
+#==============================================================================#
+#                    UNICODE SUPPORT                                            #
+#   Tests borrowed from libxml2's UTF-8 handling tests                         #
+#==============================================================================#
+@testset "Unicode" begin
+    @testset "Latin-1 characters" begin
+        xml = "<doc>café résumé naïve</doc>"
+        doc = parse(xml, Node)
+        @test simple_value(doc[1]) == "café résumé naïve"
+    end
+
+    @testset "CJK characters" begin
+        xml = "<doc>中文日本語한국어</doc>"
+        doc = parse(xml, Node)
+        @test simple_value(doc[1]) == "中文日本語한국어"
+    end
+
+    @testset "Cyrillic characters" begin
+        xml = "<doc>Привет мир</doc>"
+        doc = parse(xml, Node)
+        @test simple_value(doc[1]) == "Привет мир"
+    end
+
+    @testset "Arabic characters" begin
+        xml = "<doc>مرحبا</doc>"
+        doc = parse(xml, Node)
+        @test simple_value(doc[1]) == "مرحبا"
+    end
+
+    @testset "Emoji (4-byte UTF-8)" begin
+        xml = "<doc>🍦🎉🚀</doc>"
+        doc = parse(xml, Node)
+        @test simple_value(doc[1]) == "🍦🎉🚀"
+    end
+
+    @testset "Unicode in attribute values" begin
+        xml = """<doc name="über" city="東京"/>"""
+        doc = parse(xml, Node)
+        @test doc[1]["name"] == "über"
+        @test doc[1]["city"] == "東京"
+    end
+
+    @testset "Unicode in comments" begin
+        xml = "<doc><!-- héllo wörld --></doc>"
+        doc = parse(xml, Node)
+        comments = filter(x -> nodetype(x) == Comment, children(doc[1]))
+        @test contains(value(comments[1]), "héllo")
+    end
+
+    @testset "Unicode in CDATA" begin
+        xml = "<doc><![CDATA[日本語テスト]]></doc>"
+        doc = parse(xml, Node)
+        cdata = first(filter(x -> nodetype(x) == CData, children(doc[1])))
+        @test value(cdata) == "日本語テスト"
+    end
+
+    @testset "Unicode in PI content" begin
+        # Non-ASCII text inside a processing instruction must survive parsing.
+        root = parse("<doc><?mypi données à traiter?></doc>", Node)[1]
+        pi_nodes = filter(c -> nodetype(c) == ProcessingInstruction, children(root))
+        @test contains(value(first(pi_nodes)), "données")
+    end
+
+    @testset "UTF-8 BOM handling" begin
+        # libxml2 test/utf8bom.xml pattern
+        xml = "\xef\xbb\xbf<?xml version=\"1.0\"?>\n<doc/>"
+        doc = parse(xml, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        @test tag(root) == "doc"
+    end
+end
+
+#==============================================================================#
+#                REAL-WORLD DOCUMENT PATTERNS                                  #
+#   Patterns commonly tested by libxml2 (DAV, RDF, SOAP, SVG, etc.)           #
+#==============================================================================#
+@testset "Real-World Document Patterns" begin
+    @testset "WebDAV-like document" begin
+        # Inspired by libxml2 test/dav* series
+        xml = """<?xml version="1.0" encoding="utf-8" ?>
+<D:multistatus xmlns:D="DAV:">
+  <D:response>
+    <D:href>/container/</D:href>
+    <D:propstat>
+      <D:prop>
+        <D:displayname>Example collection</D:displayname>
+        <D:resourcetype><D:collection/></D:resourcetype>
+      </D:prop>
+      <D:status>HTTP/1.1 200 OK</D:status>
+    </D:propstat>
+  </D:response>
+</D:multistatus>"""
+        doc = parse(xml, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        @test tag(root) == "D:multistatus"
+        @test root["xmlns:D"] == "DAV:"
+    end
+
+    @testset "RDF-like document" begin
+        # Inspired by libxml2 test/rdf1, test/rdf2
+        xml = """<?xml version="1.0"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns:dc="http://purl.org/dc/elements/1.1/">
+  <rdf:Description rdf:about="http://example.org/resource">
+    <dc:title>Example Resource</dc:title>
+    <dc:creator>John Doe</dc:creator>
+  </rdf:Description>
+</rdf:RDF>"""
+        doc = parse(xml, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        @test tag(root) == "rdf:RDF"
+        desc = first(filter(x -> nodetype(x) == Element, children(root)))
+        @test desc["rdf:about"] == "http://example.org/resource"
+    end
+
+    @testset "SVG-like document" begin
+        # Inspired by libxml2 test/svg1, test/svg2, test/svg3
+        xml = """<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"
+     width="200" height="200" viewBox="0 0 200 200">
+  <defs>
+    <linearGradient id="grad1" x1="0%" y1="0%" x2="100%" y2="0%">
+      <stop offset="0%" style="stop-color:rgb(255,255,0);stop-opacity:1"/>
+      <stop offset="100%" style="stop-color:rgb(255,0,0);stop-opacity:1"/>
+    </linearGradient>
+  </defs>
+  <rect x="10" y="10" width="180" height="180" fill="url(#grad1)"/>
+  <circle cx="100" cy="100" r="50" fill="blue" opacity="0.5"/>
+  <text x="100" y="100" text-anchor="middle">Hello SVG</text>
+</svg>"""
+        doc = parse(xml, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        @test tag(root) == "svg"
+        @test root["xmlns"] == "http://www.w3.org/2000/svg"
+        @test root["width"] == "200"
+    end
+
+    @testset "SOAP-like envelope" begin
+        xml = """<?xml version="1.0" encoding="UTF-8"?>
+<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/"
+               xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+               xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+  <soap:Body>
+    <GetWeather xmlns="http://www.example.com/weather">
+      <City>New York</City>
+      <Country>US</Country>
+    </GetWeather>
+  </soap:Body>
+</soap:Envelope>"""
+        doc = parse(xml, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        @test tag(root) == "soap:Envelope"
+    end
+
+    @testset "Atom feed" begin
+        xml = """<?xml version="1.0" encoding="UTF-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+  <title>Example Feed</title>
+  <link href="http://example.org/"/>
+  <updated>2003-12-13T18:30:02Z</updated>
+  <author>
+    <name>John Doe</name>
+  </author>
+  <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
+  <entry>
+    <title>Atom-Powered Robots Run Amok</title>
+    <link href="http://example.org/2003/12/13/atom03"/>
+    <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+    <updated>2003-12-13T18:30:02Z</updated>
+    <summary>Some text.</summary>
+  </entry>
+</feed>"""
+        doc = parse(xml, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        @test tag(root) == "feed"
+        @test root["xmlns"] == "http://www.w3.org/2005/Atom"
+    end
+
+    @testset "plist-like document" begin
+        xml = """<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+  <dict>
+    <key>Name</key>
+    <string>Example</string>
+    <key>Version</key>
+    <integer>42</integer>
+    <key>Enabled</key>
+    <true/>
+    <key>Tags</key>
+    <array>
+      <string>alpha</string>
+      <string>beta</string>
+    </array>
+  </dict>
+</plist>"""
+        doc = parse(xml, Node)
+        plist = first(filter(x -> nodetype(x) == Element, children(doc)))
+        @test tag(plist) == "plist"
+        @test plist["version"] == "1.0"
+    end
+
+    @testset "XHTML with mixed content" begin
+        xml = """<?xml version="1.0" encoding="UTF-8"?>
+<html xmlns="http://www.w3.org/1999/xhtml">
+  <head><title>Test</title></head>
+  <body>
+    <p>This is <em>emphasized</em> and <strong>strong</strong> text.</p>
+    <p>A link: <a href="http://example.com">click here</a>.</p>
+    <hr/>
+    <pre>  preformatted  text  </pre>
+  </body>
+</html>"""
+        doc = parse(xml, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        @test tag(root) == "html"
+        @test root["xmlns"] == "http://www.w3.org/1999/xhtml"
+    end
+
+    @testset "MathML-like document" begin
+        xml = """<math xmlns="http://www.w3.org/1998/Math/MathML">
+  <mrow>
+    <msup><mi>x</mi><mn>2</mn></msup>
+    <mo>+</mo>
+    <msup><mi>y</mi><mn>2</mn></msup>
+    <mo>=</mo>
+    <msup><mi>z</mi><mn>2</mn></msup>
+  </mrow>
+</math>"""
+        doc = parse(xml, Node)
+        root = doc[1]
+        @test tag(root) == "math"
+        @test root["xmlns"] == "http://www.w3.org/1998/Math/MathML"
+    end
+
+    @testset "WML-like document (mobile)" begin
+        # Inspired by libxml2 test/wml.xml
+        xml = """<?xml version="1.0"?>
+<!DOCTYPE wml PUBLIC "-//WAPFORUM//DTD WML 1.1//EN" "http://www.wapforum.org/DTD/wml_1.1.xml">
+<wml>
+  <card id="main" title="Main Menu">
+    <p>Welcome to WML</p>
+  </card>
+</wml>"""
+        doc = parse(xml, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        @test tag(root) == "wml"
+    end
+end
+
+#==============================================================================#
+#                    EDGE CASES                                                #
+#   Additional edge cases inspired by libxml2 test patterns                    #
+#==============================================================================#
+@testset "Edge Cases" begin
+    @testset "CDATA containing ]] not followed by >" begin
+        xml = "<root><![CDATA[a]]b]]></root>"
+        doc = parse(xml, Node)
+        cdata = first(filter(x -> nodetype(x) == CData, children(doc[1])))
+        @test value(cdata) == "a]]b"
+    end
+
+    @testset "comment containing single hyphens" begin
+        # A literal "--" inside a comment is not well-formed (XML 1.0 section 2.5), so
+        # this case only exercises isolated "-" characters, which are always allowed.
+        xml = "<root><!-- one-dash and hyphen-ated --></root>"
+        doc = parse(xml, Node)
+        comments = filter(x -> nodetype(x) == Comment, children(doc[1]))
+        @test length(comments) == 1
+    end
+
+    @testset "attribute value containing >" begin
+        xml = """<doc attr="a>b"/>"""
+        doc = parse(xml, Node)
+        @test doc[1]["attr"] == "a>b"
+    end
+
+    @testset "attribute value containing single quote in double quotes" begin
+        xml = """<doc attr="it's"/>"""
+        doc = parse(xml, Node)
+        @test doc[1]["attr"] == "it's"
+    end
+
+    @testset "attribute value containing double quote in single quotes" begin
+        xml = "<doc attr='say \"hello\"'/>"
+        doc = parse(xml, Node)
+        @test doc[1]["attr"] == "say \"hello\""
+    end
+
+    @testset "very long attribute value" begin
+        long_val = repeat("x", 10000)
+        xml = """<doc attr="$(long_val)"/>"""
+        doc = parse(xml, Node)
+        @test doc[1]["attr"] == long_val
+    end
+
+    @testset "very long text content" begin
+        long_text = repeat("word ", 5000)
+        xml = "<doc>$(long_text)</doc>"
+        doc = parse(xml, Node)
+        @test simple_value(doc[1]) == long_text
+    end
+
+    @testset "many attributes on one element" begin
+        # Fifty attributes a1="v1" … a50="v50" on one self-closing element.
+        attr_text = join(("a$k=\"v$k\"" for k in 1:50), " ")
+        root = parse("<doc $attr_text/>", Node)[1]
+        @test root["a1"] == "v1"
+        @test root["a50"] == "v50"
+    end
+
+    @testset "whitespace around = in attributes" begin
+        xml = """<doc a = "1" b  =  "2" />"""
+        doc = parse(xml, Node)
+        @test doc[1]["a"] == "1"
+        @test doc[1]["b"] == "2"
+    end
+
+    @testset "tab and newline in tag whitespace" begin
+        xml = "<doc\n\ta=\"1\"\n\tb=\"2\"\n/>"
+        doc = parse(xml, Node)
+        @test doc[1]["a"] == "1"
+        @test doc[1]["b"] == "2"
+    end
+
+    @testset "empty element: self-closing vs open-close" begin
+        xml1 = "<root><x/></root>"
+        xml2 = "<root><x></x></root>"
+        doc1 = parse(xml1, Node)
+        doc2 = parse(xml2, Node)
+        # Both should produce empty elements
+        el1 = first(filter(x -> nodetype(x) == Element, children(doc1[1])))
+        el2 = first(filter(x -> nodetype(x) == Element, children(doc2[1])))
+        @test tag(el1) == tag(el2) == "x"
+    end
+
+    @testset "document with all prolog components" begin
+        xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<!DOCTYPE root [
+  <!ELEMENT root (#PCDATA | child)*>
+  <!ELEMENT child EMPTY>
+  <!ATTLIST child id ID #REQUIRED>
+  <!ENTITY greeting "Hello, World!">
+]>
+<!-- document comment -->
+<?app-instruction data?>
+<root>&greeting;<child id="c1"/></root>"""
+        doc = parse(xml, Node)
+        typed = filter(x -> nodetype(x) != Text, children(doc))
+        type_list = map(nodetype, typed)
+        @test Declaration in type_list
+        @test DTD in type_list
+        @test Comment in type_list
+        @test ProcessingInstruction in type_list
+        @test Element in type_list
+    end
+end
+
+end  # top-level @testset
diff --git a/test/test_remote_files.jl b/test/test_remote_files.jl
new file mode 100644
index 0000000..21af565
--- /dev/null
+++ b/test/test_remote_files.jl
@@ -0,0 +1,77 @@
+using XML
+using XML: Document, Element, Declaration, Comment, CData, DTD, ProcessingInstruction, Text
+using Downloads: download
+using Test
+
+#==============================================================================#
+#                REMOTE XML FILE PARSING TESTS                                 #
+#==============================================================================#
+# These tests download publicly available XML files and verify that XML.jl can
+# parse them without error.  A failed download (network issues, CI without
+# internet, URL gone) is silently skipped — only parsing failures count as test
+# failures.
+#
+# Run standalone:  julia --project test/test_remote_files.jl
+
+# Fetch `url` straight into memory (no temp file); return its body, or `nothing` on failure.
+function _try_download(url::AbstractString)::Union{String, Nothing}
+    try
+        return String(take!(download(url, IOBuffer())))
+    catch
+        return nothing
+    end
+end
+
+const REMOTE_XML_URLS = [  # (label, url) pairs; each label names that file's testset
+    # ---- W3Schools example files ----
+    ("W3Schools note.xml",           "https://www.w3schools.com/xml/note.xml"),
+    ("W3Schools cd_catalog.xml",     "https://www.w3schools.com/xml/cd_catalog.xml"),
+    ("W3Schools plant_catalog.xml",  "https://www.w3schools.com/xml/plant_catalog.xml"),
+    ("W3Schools simple.xml",         "https://www.w3schools.com/xml/simple.xml"),
+    ("W3Schools books.xml",          "https://www.w3schools.com/xml/books.xml"),
+
+    # ---- W3C SVG samples ----
+    ("W3C SVG helloworld.svg",       "https://dev.w3.org/SVG/tools/svgweb/samples/svg-files/helloworld.svg"),
+    ("W3C SVG tiger.svg",            "https://dev.w3.org/SVG/tools/svgweb/samples/svg-files/tiger.svg"),
+    ("W3C SVG w3c.svg",              "https://dev.w3.org/SVG/tools/svgweb/samples/svg-files/w3c.svg"),
+    ("W3C SVG lineargradient2.svg",  "https://dev.w3.org/SVG/tools/svgweb/samples/svg-files/lineargradient2.svg"),
+    ("W3C SVG heart.svg",            "https://dev.w3.org/SVG/tools/svgweb/samples/svg-files/heart.svg"),
+
+    # ---- GitHub-hosted XML files ----
+    ("JUnit XML complete example",   "https://raw.githubusercontent.com/testmoapp/junitxml/main/examples/junit-complete.xml"),
+    ("JUnit XML basic example",      "https://raw.githubusercontent.com/testmoapp/junitxml/main/examples/junit-basic.xml"),
+    ("PEPPOL invoice base example",  "https://raw.githubusercontent.com/OpenPEPPOL/peppol-bis-invoice-3/master/rules/examples/base-example.xml"),
+
+    # ---- Maven Central POM (real-world XML with namespaces) ----
+    ("Maven JUnit 4.13.2 POM",      "https://repo1.maven.org/maven2/junit/junit/4.13.2/junit-4.13.2.pom"),
+    ("Maven Guava 33.0 POM",        "https://repo1.maven.org/maven2/com/google/guava/guava/33.0.0-jre/guava-33.0.0-jre.pom"),
+
+    # ---- NASA RSS feed (live XML) ----
+    ("NASA news RSS feed",           "https://www.nasa.gov/news-release/feed/"),
+]
+
+@testset "Remote XML Parsing" begin
+    # One nested testset per remote file, so each URL is reported separately.
+    @testset "$label" for (label, url) in REMOTE_XML_URLS
+        xml_str = _try_download(url)
+        if xml_str !== nothing
+            doc = parse(xml_str, Node)
+            @test nodetype(doc) == Document
+            @test length(children(doc)) > 0
+
+            # At least one direct child of the Document must be an Element
+            top_level_types = map(nodetype, children(doc))
+            @test Element in top_level_types
+
+            # Serialising must yield non-empty output that parses again
+            xml_out = XML.write(doc)
+            @test length(xml_out) > 0
+            doc2 = parse(xml_out, Node)
+            @test nodetype(doc2) == Document
+        else
+            # A failed download is not a parser bug: log it and record a skip.
+            @info "Skipping $label — download failed" url
+            @test_skip false
+        end
+    end
+end
diff --git a/test/test_tokenizer.jl b/test/test_tokenizer.jl
new file mode 100644
index 0000000..7c901fa
--- /dev/null
+++ b/test/test_tokenizer.jl
@@ -0,0 +1,425 @@
+using Test, XML
+
+using XML.XMLTokenizer
+
+# Convenience: collect token kinds / raw token strings from an XML string
+kinds(xml) = [t.kind for t in tokenize(xml)]
+raws(xml)  = [String(t.raw) for t in tokenize(xml)]
+
+@testset "XMLTokenizer" begin
+
+#-----------------------------------------------------------------------# Basic text
+@testset "plain text" begin
+    toks = collect(tokenize("hello world"))
+    @test length(toks) == 1
+    @test toks[1].kind == TOKEN_TEXT
+    @test toks[1].raw == "hello world"
+end
+
+@testset "empty string" begin
+    @test isempty(collect(tokenize("")))
+end
+
+#-----------------------------------------------------------------------# Open tags
+@testset "open tag without attributes" begin
+    @test kinds("<div>") == [TOKEN_OPEN_TAG, TOKEN_TAG_CLOSE]
+    @test raws("<div>") == ["<div", ">"]
+end
+
+@testset "open tag with attributes" begin
+    xml = """<a href="url" class='main'>"""
+    toks = collect(tokenize(xml))
+    @test [t.kind for t in toks] == [
+        TOKEN_OPEN_TAG,
+        TOKEN_ATTR_NAME, TOKEN_ATTR_VALUE,
+        TOKEN_ATTR_NAME, TOKEN_ATTR_VALUE,
+        TOKEN_TAG_CLOSE,
+    ]
+    @test tag_name(toks[1]) == "a"
+    @test toks[2].raw == "href"
+    @test attr_value(toks[3]) == "url"
+    @test toks[4].raw == "class"
+    @test attr_value(toks[5]) == "main"
+end
+
+@testset "whitespace around =" begin
+    xml = """<x a = "1" >"""
+    toks = collect(tokenize(xml))
+    @test [t.kind for t in toks] == [
+        TOKEN_OPEN_TAG, TOKEN_ATTR_NAME, TOKEN_ATTR_VALUE, TOKEN_TAG_CLOSE,
+    ]
+    @test attr_value(toks[3]) == "1"
+end
+
+#-----------------------------------------------------------------------# Self-closing tags
+@testset "self-closing tag" begin
+    @test kinds("<br/>") == [TOKEN_OPEN_TAG, TOKEN_SELF_CLOSE]
+    @test raws("<br/>") == ["<br", "/>"]
+end
+
+@testset "self-closing tag with attributes" begin
+    xml = """<img src="a.png" />"""
+    toks = collect(tokenize(xml))
+    @test [t.kind for t in toks] == [
+        TOKEN_OPEN_TAG, TOKEN_ATTR_NAME, TOKEN_ATTR_VALUE, TOKEN_SELF_CLOSE,
+    ]
+    @test tag_name(toks[1]) == "img"
+    @test attr_value(toks[3]) == "a.png"
+end
+
+#-----------------------------------------------------------------------# Close tags
+@testset "close tag" begin
+    toks = collect(tokenize("</div>"))
+    @test [t.kind for t in toks] == [TOKEN_CLOSE_TAG, TOKEN_TAG_CLOSE]
+    @test tag_name(toks[1]) == "div"
+    @test toks[2].raw == ">"
+end
+
+@testset "close tag with whitespace" begin
+    toks = collect(tokenize("</div  >"))
+    @test [t.kind for t in toks] == [TOKEN_CLOSE_TAG, TOKEN_TAG_CLOSE]
+    @test tag_name(toks[1]) == "div"
+end
+
+#-----------------------------------------------------------------------# Open + close round-trip
+@testset "element with text" begin
+    xml = "<p>hello</p>"
+    @test kinds(xml) == [
+        TOKEN_OPEN_TAG, TOKEN_TAG_CLOSE,
+        TOKEN_TEXT,
+        TOKEN_CLOSE_TAG, TOKEN_TAG_CLOSE,
+    ]
+    toks = collect(tokenize(xml))
+    @test tag_name(toks[1]) == "p"
+    @test toks[3].raw == "hello"
+    @test tag_name(toks[4]) == "p"
+end
+
+#-----------------------------------------------------------------------# Namespaced tags
+@testset "namespaced tag" begin
+    xml = """<ns:el xmlns:ns="http://example.com">"""
+    toks = collect(tokenize(xml))
+    @test tag_name(toks[1]) == "ns:el"
+    @test toks[2].raw == "xmlns:ns"
+end
+
+#-----------------------------------------------------------------------# Comments
+@testset "comment" begin
+    xml = "<!-- hello -->"
+    toks = collect(tokenize(xml))
+    @test [t.kind for t in toks] == [TOKEN_COMMENT_OPEN, TOKEN_COMMENT_CONTENT, TOKEN_COMMENT_CLOSE]
+    @test toks[1].raw == "<!--"
+    @test toks[2].raw == " hello "
+    @test toks[3].raw == "-->"
+end
+
+@testset "empty comment" begin
+    toks = collect(tokenize("<!---->"))
+    @test [t.kind for t in toks] == [TOKEN_COMMENT_OPEN, TOKEN_COMMENT_CONTENT, TOKEN_COMMENT_CLOSE]
+    @test toks[2].raw == ""
+end
+
+@testset "comment with markup-like content" begin
+    toks = collect(tokenize("<!-- <b>not</b> a tag -->"))
+    @test toks[2].raw == " <b>not</b> a tag "
+end
+
+#-----------------------------------------------------------------------# CDATA
+@testset "CDATA" begin
+    xml = "<![CDATA[raw & <text>]]>"
+    toks = collect(tokenize(xml))
+    @test [t.kind for t in toks] == [TOKEN_CDATA_OPEN, TOKEN_CDATA_CONTENT, TOKEN_CDATA_CLOSE]
+    @test toks[1].raw == "<![CDATA["
+    @test toks[2].raw == "raw & <text>"
+    @test toks[3].raw == "]]>"
+end
+
+@testset "empty CDATA" begin
+    toks = collect(tokenize("<![CDATA[]]>"))
+    @test [t.kind for t in toks] == [TOKEN_CDATA_OPEN, TOKEN_CDATA_CONTENT, TOKEN_CDATA_CLOSE]
+    @test toks[2].raw == ""
+end
+
+#-----------------------------------------------------------------------# Processing instructions
+@testset "processing instruction" begin
+    xml = """<?style type="text/css"?>"""
+    toks = collect(tokenize(xml))
+    @test [t.kind for t in toks] == [TOKEN_PI_OPEN, TOKEN_PI_CONTENT, TOKEN_PI_CLOSE]
+    @test toks[1].raw == "<?style"
+    @test pi_target(toks[1]) == "style"
+    @test toks[2].raw == """ type="text/css\""""
+    @test toks[3].raw == "?>"
+end
+
+@testset "PI with no content" begin
+    toks = collect(tokenize("<?target?>"))
+    @test [t.kind for t in toks] == [TOKEN_PI_OPEN, TOKEN_PI_CONTENT, TOKEN_PI_CLOSE]
+    @test pi_target(toks[1]) == "target"
+    @test toks[2].raw == ""
+end
+
+#-----------------------------------------------------------------------# XML declaration
+@testset "XML declaration" begin
+    xml = """<?xml version="1.0" encoding="UTF-8"?>"""
+    toks = collect(tokenize(xml))
+    @test [t.kind for t in toks] == [
+        TOKEN_XML_DECL_OPEN,
+        TOKEN_ATTR_NAME, TOKEN_ATTR_VALUE,
+        TOKEN_ATTR_NAME, TOKEN_ATTR_VALUE,
+        TOKEN_XML_DECL_CLOSE,
+    ]
+    @test pi_target(toks[1]) == "xml"
+    @test toks[1].raw == "<?xml"
+    @test toks[2].raw == "version"
+    @test attr_value(toks[3]) == "1.0"
+    @test toks[4].raw == "encoding"
+    @test attr_value(toks[5]) == "UTF-8"
+    @test toks[6].raw == "?>"
+end
+
+@testset "XML declaration with single quotes" begin
+    xml = "<?xml version='1.0'?>"
+    toks = collect(tokenize(xml))
+    @test toks[3].raw == "'1.0'"
+    @test attr_value(toks[3]) == "1.0"
+end
+
+#-----------------------------------------------------------------------# DOCTYPE
+@testset "DOCTYPE simple" begin
+    xml = """<!DOCTYPE note SYSTEM "note.dtd">"""
+    toks = collect(tokenize(xml))
+    @test [t.kind for t in toks] == [TOKEN_DOCTYPE_OPEN, TOKEN_DOCTYPE_CONTENT, TOKEN_DOCTYPE_CLOSE]
+    @test toks[1].raw == "<!DOCTYPE"
+    @test toks[2].raw == """ note SYSTEM "note.dtd\""""
+    @test toks[3].raw == ">"
+end
+
+@testset "DOCTYPE with internal subset" begin
+    xml = """<!DOCTYPE note [<!ELEMENT note (#PCDATA)>]>"""
+    toks = collect(tokenize(xml))
+    @test [t.kind for t in toks] == [TOKEN_DOCTYPE_OPEN, TOKEN_DOCTYPE_CONTENT, TOKEN_DOCTYPE_CLOSE]
+    @test toks[2].raw == " note [<!ELEMENT note (#PCDATA)>]"
+end
+
+@testset "DOCTYPE with quoted > in internal subset" begin
+    xml = """<!DOCTYPE note [<!ATTLIST x y CDATA "a>b">]>"""
+    toks = collect(tokenize(xml))
+    @test [t.kind for t in toks] == [TOKEN_DOCTYPE_OPEN, TOKEN_DOCTYPE_CONTENT, TOKEN_DOCTYPE_CLOSE]
+    @test occursin("a>b", toks[2].raw)
+end
+
+#-----------------------------------------------------------------------# Full document
+@testset "full document" begin
+    xml = """<?xml version="1.0"?>
+<!DOCTYPE root SYSTEM "root.dtd">
+<root>
+  <child id="1">text</child>
+  <empty/>
+  <!-- comment -->
+  <![CDATA[data]]>
+  <?pi content?>
+</root>"""
+    toks = collect(tokenize(xml))
+    tok_kinds = [t.kind for t in toks]
+
+    # XML declaration
+    @test tok_kinds[1] == TOKEN_XML_DECL_OPEN
+    # DOCTYPE present
+    @test TOKEN_DOCTYPE_OPEN in tok_kinds
+    # Tag names appear in document order; `<empty/>` is self-closing, so it has no close tag
+    open_names  = [tag_name(t) for t in toks if t.kind == TOKEN_OPEN_TAG]
+    close_names = [tag_name(t) for t in toks if t.kind == TOKEN_CLOSE_TAG]
+    @test open_names == ["root", "child", "empty"]
+    @test close_names == ["child", "root"]
+    # CDATA is present
+    cdata_content = [t.raw for t in toks if t.kind == TOKEN_CDATA_CONTENT]
+    @test cdata_content == ["data"]
+    # Comment is present
+    comment_content = [t.raw for t in toks if t.kind == TOKEN_COMMENT_CONTENT]
+    @test comment_content == [" comment "]
+    # PI is present
+    pi_opens = [t for t in toks if t.kind == TOKEN_PI_OPEN]
+    @test length(pi_opens) == 1
+    @test pi_target(pi_opens[1]) == "pi"
+end
+
+#-----------------------------------------------------------------------# Raw round-trip
+@testset "concatenated raw reproduces input" begin
+    # Round-trip works for inputs where no whitespace/= is consumed between tokens.
+    # Whitespace around `=` in attributes is consumed and not part of any token.
+    for xml in [
+        """<!-- comment --><a/>""",
+        """<![CDATA[hello]]>""",
+        """<?pi data?>""",
+        """<!DOCTYPE x [<!ELEMENT x (#PCDATA)>]><x/>""",
+        """<p>text</p>""",
+    ]
+        reconstructed = join(t.raw for t in tokenize(xml))
+        @test reconstructed == xml
+    end
+end
+
+@testset "attribute whitespace is not preserved" begin
+    # Whitespace around `=` and between attrs is consumed, not emitted as tokens.
+    xml = """<a b = "c"  d='e' />"""
+    toks = collect(tokenize(xml))
+    @test [t.kind for t in toks] == [
+        TOKEN_OPEN_TAG, TOKEN_ATTR_NAME, TOKEN_ATTR_VALUE,
+        TOKEN_ATTR_NAME, TOKEN_ATTR_VALUE, TOKEN_SELF_CLOSE,
+    ]
+end
+
+#-----------------------------------------------------------------------# Iterator protocol
+@testset "iterator protocol" begin
+    t = tokenize("<a/>")
+    @test Base.IteratorSize(typeof(t)) == Base.SizeUnknown()
+    @test Base.eltype(typeof(t)) == Token
+    toks = collect(t)
+    @test length(toks) == 2
+end
+
+#-----------------------------------------------------------------------# Utility error handling
+@testset "tag_name errors on wrong kind" begin
+    tok = first(tokenize("hello"))
+    @test_throws ArgumentError tag_name(tok)
+end
+
+@testset "attr_value errors on wrong kind" begin
+    tok = first(tokenize("<a>"))
+    @test_throws ArgumentError attr_value(tok)
+end
+
+@testset "pi_target errors on wrong kind" begin
+    tok = first(tokenize("<a>"))
+    @test_throws ArgumentError pi_target(tok)
+end
+
+#-----------------------------------------------------------------------# Error cases
+@testset "error: unterminated comment" begin
+    @test_throws ArgumentError collect(tokenize("<!-- no end"))
+end
+
+@testset "error: unterminated CDATA" begin
+    @test_throws ArgumentError collect(tokenize("<![CDATA[no end"))
+end
+
+@testset "error: unterminated PI" begin
+    @test_throws ArgumentError collect(tokenize("<?pi no end"))
+end
+
+@testset "unterminated open tag emits partial token" begin
+    # The tokenizer emits the partial open-tag token and stops at EOF without raising an error
+    toks = collect(tokenize("<div"))
+    @test length(toks) == 1
+    @test toks[1].kind == TOKEN_OPEN_TAG
+    @test tag_name(toks[1]) == "div"
+end
+
+@testset "unterminated close tag emits partial token" begin
+    toks = collect(tokenize("</div"))
+    @test length(toks) == 1
+    @test toks[1].kind == TOKEN_CLOSE_TAG
+    @test tag_name(toks[1]) == "div"
+end
+
+@testset "error: unterminated attribute value" begin
+    @test_throws ArgumentError collect(tokenize("""<a b="no end"""))
+end
+
+@testset "error: unterminated DOCTYPE" begin
+    @test_throws ArgumentError collect(tokenize("<!DOCTYPE x"))
+end
+
+@testset "error: lone <" begin
+    @test_throws ArgumentError collect(tokenize("<"))
+end
+
+#-----------------------------------------------------------------------# Unicode content
+@testset "unicode text content" begin
+    xml = "<p>café ñ 日本語</p>"
+    toks = collect(tokenize(xml))
+    text_tok = toks[3]
+    @test text_tok.kind == TOKEN_TEXT
+    @test text_tok.raw == "café ñ 日本語"
+end
+
+@testset "unicode in attribute value" begin
+    xml = """<x a="über"/>"""
+    toks = collect(tokenize(xml))
+    @test attr_value(toks[3]) == "über"
+end
+
+@testset "unicode in comment" begin
+    toks = collect(tokenize("<!-- héllo -->"))
+    @test toks[2].raw == " héllo "
+end
+
+#-----------------------------------------------------------------------# Edge cases
+@testset "adjacent tags" begin
+    xml = "<a></a><b></b>"
+    toks = collect(tokenize(xml))
+    open_names  = [tag_name(t) for t in toks if t.kind == TOKEN_OPEN_TAG]
+    close_names = [tag_name(t) for t in toks if t.kind == TOKEN_CLOSE_TAG]
+    @test open_names == ["a", "b"]
+    @test close_names == ["a", "b"]
+    # No text tokens between them
+    @test !any(t -> t.kind == TOKEN_TEXT, toks)
+end
+
+@testset "text between adjacent tags" begin
+    xml = "<a>x</a>y<b/>"
+    texts = [t.raw for t in tokenize(xml) if t.kind == TOKEN_TEXT]
+    @test texts == ["x", "y"]
+end
+
+@testset "multiple attributes" begin
+    xml = """<div a="1" b="2" c="3">"""
+    names = [String(t.raw) for t in tokenize(xml) if t.kind == TOKEN_ATTR_NAME]
+    vals  = [String(attr_value(t)) for t in tokenize(xml) if t.kind == TOKEN_ATTR_VALUE]
+    @test names == ["a", "b", "c"]
+    @test vals == ["1", "2", "3"]
+end
+
+@testset "attribute with > in value" begin
+    xml = """<x a="1>2">"""
+    toks = collect(tokenize(xml))
+    @test attr_value(toks[3]) == "1>2"
+    @test toks[end].kind == TOKEN_TAG_CLOSE
+end
+
+@testset "attribute with single quotes" begin
+    xml = "<x a='val'>"
+    toks = collect(tokenize(xml))
+    @test toks[3].raw == "'val'"
+    @test attr_value(toks[3]) == "val"
+end
+
+@testset "mixed quote styles" begin
+    xml = """<x a="1" b='2'>"""
+    vals = [attr_value(t) for t in tokenize(xml) if t.kind == TOKEN_ATTR_VALUE]
+    @test vals == ["1", "2"]
+end
+
+@testset "whitespace-only text" begin
+    xml = "<a>  \n\t </a>"
+    texts = [t for t in tokenize(xml) if t.kind == TOKEN_TEXT]
+    @test length(texts) == 1
+    @test texts[1].raw == "  \n\t "
+end
+
+@testset "entities preserved verbatim" begin
+    xml = "<p>&amp; &lt; &#x41;</p>"
+    texts = [t.raw for t in tokenize(xml) if t.kind == TOKEN_TEXT]
+    @test texts == ["&amp; &lt; &#x41;"]
+end
+
+@testset "show method" begin
+    tok = first(tokenize("hello"))
+    buf = IOBuffer()
+    show(buf, tok)
+    s = String(take!(buf))
+    @test occursin("TOKEN_TEXT", s)
+    @test occursin("hello", s)
+end
+
+end # top-level testset

From 97384c3fb76c76866d0a865a2a7e8049786c132b Mon Sep 17 00:00:00 2001
From: Josh Day <emailjoshday@gmail.com>
Date: Thu, 5 Mar 2026 09:36:19 -0500
Subject: [PATCH 02/18] remove dead code

---
 src/XML.jl | 375 -----------------------------------------------------
 1 file changed, 375 deletions(-)

diff --git a/src/XML.jl b/src/XML.jl
index 8954140..609fd96 100644
--- a/src/XML.jl
+++ b/src/XML.jl
@@ -907,374 +907,6 @@ function parse_dtd(node::Node)
     parse_dtd(node.value)
 end
 
-# #-----------------------------------------------------------------------------# includes
-# include("raw.jl")
-# include("dtd.jl")
-
-# abstract type AbstractXMLNode end
-
-# #-----------------------------------------------------------------------------# LazyNode
-# """
-#     LazyNode(file::AbstractString)
-#     LazyNode(data::XML.Raw)
-
-# A Lazy representation of an XML node.
-# """
-# mutable struct LazyNode <: AbstractXMLNode
-#     raw::Raw
-#     tag::Union{Nothing, String}
-#     attributes::Union{Nothing, OrderedDict{String, String}}
-#     value::Union{Nothing, String}
-# end
-# LazyNode(raw::Raw) = LazyNode(raw, nothing, nothing, nothing)
-
-# function Base.getproperty(o::LazyNode, x::Symbol)
-#     x === :raw && return getfield(o, :raw)
-#     x === :nodetype && return nodetype(o.raw)
-#     x === :tag && return isnothing(getfield(o, x)) ? setfield!(o, x, tag(o.raw)) : getfield(o, x)
-#     x === :attributes && return isnothing(getfield(o, x)) ? setfield!(o, x, attributes(o.raw)) : getfield(o, x)
-#     x === :value && return isnothing(getfield(o, x)) ? setfield!(o, x, value(o.raw)) : getfield(o, x)
-#     x === :depth && return depth(o.raw)
-#     x === :children && return LazyNode.(children(o.raw))
-#     error("type LazyNode has no field $(x)")
-# end
-# Base.propertynames(o::LazyNode) = (:raw, :nodetype, :tag, :attributes, :value, :depth, :children)
-
-# Base.show(io::IO, o::LazyNode) = _show_node(io, o)
-
-# Base.read(io::IO, ::Type{LazyNode}) = LazyNode(read(io, Raw))
-# Base.read(filename::AbstractString, ::Type{LazyNode}) = LazyNode(read(filename, Raw))
-# Base.parse(x::AbstractString, ::Type{LazyNode}) = LazyNode(parse(x, Raw))
-
-# children(o::LazyNode) = LazyNode.(children(o.raw))
-# parent(o::LazyNode) = LazyNode(parent(o.raw))
-# depth(o::LazyNode) = depth(o.raw)
-
-# Base.IteratorSize(::Type{LazyNode}) = Base.SizeUnknown()
-# Base.eltype(::Type{LazyNode}) = LazyNode
-
-# function Base.iterate(o::LazyNode, state=o)
-#     n = next(state)
-#     return isnothing(n) ? nothing : (n, n)
-# end
-
-# function next(o::LazyNode)
-#     n = next(o.raw)
-#     isnothing(n) && return nothing
-#     n.type === RawElementClose ? next(LazyNode(n)) : LazyNode(n)
-# end
-# function prev(o::LazyNode)
-#     n = prev(o.raw)
-#     isnothing(n) && return nothing
-#     n.type === RawElementClose ? prev(LazyNode(n)) : LazyNode(n)
-# end
-
-# #-----------------------------------------------------------------------------# Node
-# """
-#     Node(nodetype, tag, attributes, value, children)
-#     Node(node::Node; kw...)  # copy node with keyword overrides
-#     Node(node::LazyNode)  # un-lazy the LazyNode
-
-# A representation of an XML DOM node.  For simpler construction, use `(::NodeType)(args...)`
-# """
-# struct Node <: AbstractXMLNode
-#     nodetype::NodeType
-#     tag::Union{Nothing, String}
-#     attributes::Union{Nothing, OrderedDict{String, String}}
-#     value::Union{Nothing, String}
-#     children::Union{Nothing, Vector{Node}}
-
-#     function Node(nodetype::NodeType, tag=nothing, attributes=nothing, value=nothing, children=nothing)
-#         new(nodetype,
-#             isnothing(tag) ? nothing : string(tag),
-#             isnothing(attributes) ? nothing : OrderedDict(string(k) => string(v) for (k, v) in pairs(attributes)),
-#             isnothing(value) ? nothing : string(value),
-#             isnothing(children) ? nothing :
-#                 children isa Node ? [children] :
-#                 children isa Vector{Node} ? children :
-#                 children isa Vector ? map(Node, children) :
-#                 children isa Tuple ? map(Node, collect(children)) :
-#                 [Node(children)]
-#         )
-#     end
-# end
-
-# function Node(o::Node, x...; kw...)
-#     attrs = !isnothing(kw) ?
-#         merge(
-#             OrderedDict(string(k) => string(v) for (k, v) in pairs(kw)),
-#             isnothing(o.attributes) ? OrderedDict{String,String}() : o.attributes
-#         ) :
-#         o.attributes
-#     children = isempty(x) ? o.children : vcat(isnothing(o.children) ? [] : o.children, collect(x))
-#     Node(o.nodetype, o.tag, attrs, o.value, children)
-# end
-
-# function Node(node::LazyNode)
-#     nodetype = node.nodetype
-#     tag = node.tag
-#     attributes = node.attributes
-#     value = node.value
-#     c = XML.children(node)
-#     Node(nodetype, tag, attributes, value, isempty(c) ? nothing : map(Node, c))
-# end
-
-# Node(data::Raw) = Node(LazyNode(data))
-
-# # Anything that's not Vector{UInt8} or a (Lazy)Node is converted to a Text Node
-# Node(x) = Node(Text, nothing, nothing, string(x), nothing)
-
-# h(tag::Union{Symbol, String}, children...; kw...) = Node(Element, tag, kw, nothing, children)
-# Base.getproperty(::typeof(h), tag::Symbol) = h(tag)
-# (o::Node)(children...; kw...) = Node(o, Node.(children)...; kw...)
-
-# # NOT in-place for Text Nodes
-# function escape!(o::Node, warn::Bool=true)
-#     if o.nodetype == Text
-#         warn && @warn "escape!() called on a Text Node creates a new node."
-#         return Text(escape(o.value))
-#     end
-#     isnothing(o.children) && return o
-#     map!(x -> escape!(x, false), o.children, o.children)
-#     o
-# end
-# function unescape!(o::Node, warn::Bool=true)
-#     if o.nodetype == Text
-#         warn && @warn "unescape!() called on a Text Node creates a new node."
-#         return Text(unescape(o.value))
-#     end
-#     isnothing(o.children) && return o
-#     map!(x -> unescape!(x, false), o.children, o.children)
-#     o
-# end
-
-
-# Base.read(filename::AbstractString, ::Type{Node}) = Node(read(filename, Raw))
-# Base.read(io::IO, ::Type{Node}) = Node(read(io, Raw))
-# Base.parse(x::AbstractString, ::Type{Node}) = Node(parse(x, Raw))
-
-# Base.setindex!(o::Node, val, i::Integer) = o.children[i] = Node(val)
-# Base.push!(a::Node, b::Node) = push!(a.children, b)
-# Base.pushfirst!(a::Node, b::Node) = pushfirst!(a.children, b)
-
-# Base.setindex!(o::Node, val, key::AbstractString) = (o.attributes[key] = string(val))
-# Base.getindex(o::Node, val::AbstractString) = o.attributes[val]
-# Base.haskey(o::Node, key::AbstractString) = isnothing(o.attributes) ? false : haskey(o.attributes, key)
-# Base.keys(o::Node) = isnothing(o.attributes) ? () : keys(o.attributes)
-
-# Base.show(io::IO, o::Node) = _show_node(io, o)
-
-# #-----------------------------------------------------------------------------# Node Constructors
-# function (T::NodeType)(args...; attr...)
-#     if T === Document
-#         !isempty(attr) && error("Document nodes do not have attributes.")
-#         Node(T, nothing, nothing, nothing, args)
-#     elseif T === DTD
-#         !isempty(attr) && error("DTD nodes only accept a value.")
-#         length(args) > 1 && error("DTD nodes only accept a value.")
-#         Node(T, nothing, nothing, only(args))
-#     elseif T === Declaration
-#         !isempty(args) && error("Declaration nodes only accept attributes")
-#         Node(T, nothing, attr)
-#     elseif T === ProcessingInstruction
-#         length(args) == 1 || error("ProcessingInstruction nodes require a tag and attributes.")
-#         Node(T, only(args), attr)
-#     elseif T === Comment
-#         !isempty(attr) && error("Comment nodes do not have attributes.")
-#         length(args) > 1 && error("Comment nodes only accept a single input.")
-#         Node(T, nothing, nothing, only(args))
-#     elseif T === CData
-#         !isempty(attr) && error("CData nodes do not have attributes.")
-#         length(args) > 1 && error("CData nodes only accept a single input.")
-#         Node(T, nothing, nothing, only(args))
-#     elseif T === Text
-#         !isempty(attr) && error("Text nodes do not have attributes.")
-#         length(args) > 1 && error("Text nodes only accept a single input.")
-#         Node(T, nothing, nothing, only(args))
-#     elseif T === Element
-#         tag = first(args)
-#         Node(T, tag, attr, nothing, args[2:end])
-#     else
-#         error("Unreachable reached while trying to create a Node via (::NodeType)(args...; kw...).")
-#     end
-# end
-
-# #-----------------------------------------------------------------------------# !!! common !!!
-# # Everything below here is common to all data structures
-
-
-# #-----------------------------------------------------------------------------# interface fallbacks
-# nodetype(o) = o.nodetype
-# tag(o) = o.tag
-# attributes(o) = o.attributes
-# value(o) = o.value
-# children(o::T) where {T} = isnothing(o.children) ? () : o.children
-
-# depth(o) = missing
-# parent(o) = missing
-# next(o) = missing
-# prev(o) = missing
-
-# is_simple(o) = nodetype(o) == Element && (isnothing(attributes(o)) || isempty(attributes(o))) &&
-#     length(children(o)) == 1 && nodetype(only(o)) in (Text, CData)
-
-# simple_value(o) = is_simple(o) ? value(only(o)) : error("`XML.simple_value` is only defined for simple nodes.")
-
-# Base.@deprecate_binding simplevalue simple_value
-
-# #-----------------------------------------------------------------------------# nodes_equal
-# function nodes_equal(a, b)
-#     out = XML.tag(a) == XML.tag(b)
-#     out &= XML.nodetype(a) == XML.nodetype(b)
-#     out &= XML.attributes(a) == XML.attributes(b)
-#     out &= XML.value(a) == XML.value(b)
-#     out &= length(XML.children(a)) == length(XML.children(b))
-#     out &= all(nodes_equal(ai, bi) for (ai,bi) in zip(XML.children(a), XML.children(b)))
-#     return out
-# end
-
-# Base.:(==)(a::AbstractXMLNode, b::AbstractXMLNode) = nodes_equal(a, b)
-
-# #-----------------------------------------------------------------------------# parse
-# Base.parse(::Type{T}, str::AbstractString) where {T <: AbstractXMLNode} = parse(str, T)
-
-# #-----------------------------------------------------------------------------# indexing
-# Base.getindex(o::Union{Raw, AbstractXMLNode}) = o
-# Base.getindex(o::Union{Raw, AbstractXMLNode}, i::Integer) = children(o)[i]
-# Base.getindex(o::Union{Raw, AbstractXMLNode}, ::Colon) = children(o)
-# Base.lastindex(o::Union{Raw, AbstractXMLNode}) = lastindex(children(o))
-
-# Base.only(o::Union{Raw, AbstractXMLNode}) = only(children(o))
-
-# Base.length(o::AbstractXMLNode) = length(children(o))
-
-# #-----------------------------------------------------------------------------# printing
-# function _show_node(io::IO, o)
-#     printstyled(io, typeof(o), ' '; color=:light_black)
-#     !ismissing(depth(o)) && printstyled(io, "(depth=", depth(o), ") ", color=:light_black)
-#     printstyled(io, nodetype(o), ; color=:light_green)
-#     if o.nodetype === Text
-#         printstyled(io, ' ', repr(value(o)))
-#     elseif o.nodetype === Element
-#         printstyled(io, " <", tag(o), color=:light_cyan)
-#         _print_attrs(io, o; color=:light_yellow)
-#         printstyled(io, '>', color=:light_cyan)
-#         _print_n_children(io, o)
-#     elseif o.nodetype === DTD
-#         printstyled(io, " <!DOCTYPE "; color=:light_cyan)
-#         printstyled(io, value(o), color=:light_black)
-#         printstyled(io, '>', color=:light_cyan)
-#     elseif o.nodetype === Declaration
-#         printstyled(io, " <?xml", color=:light_cyan)
-#         _print_attrs(io, o; color=:light_yellow)
-#         printstyled(io, "?>", color=:light_cyan)
-#     elseif o.nodetype === ProcessingInstruction
-#         printstyled(io, " <?", tag(o), color=:light_cyan)
-#         _print_attrs(io, o; color=:light_yellow)
-#         printstyled(io, "?>", color=:light_cyan)
-#     elseif o.nodetype === Comment
-#         printstyled(io, " <!--", color=:light_cyan)
-#         printstyled(io, value(o), color=:light_black)
-#         printstyled(io, "-->", color=:light_cyan)
-#     elseif o.nodetype === CData
-#         printstyled(io, " <![CData[", color=:light_cyan)
-#         printstyled(io, value(o), color=:light_black)
-#         printstyled(io, "]]>", color=:light_cyan)
-#     elseif o.nodetype === Document
-#         _print_n_children(io, o)
-#     elseif o.nodetype === UNKNOWN
-#         printstyled(io, "Unknown", color=:light_cyan)
-#         _print_n_children(io, o)
-#     else
-#         error("Unreachable reached")
-#     end
-# end
-
-# function _print_attrs(io::IO, o; color=:normal)
-#     attr = attributes(o)
-#     isnothing(attr) && return nothing
-#     for (k,v) in attr
-#         # printstyled(io, ' ', k, '=', '"', v, '"'; color)
-#         print(io, ' ', k, '=', '"', v, '"')
-#     end
-# end
-# function _print_n_children(io::IO, o::Node)
-#     n = length(children(o))
-#     text = n == 0 ? "" : n == 1 ? " (1 child)" : " ($n children)"
-#     printstyled(io, text, color=:light_black)
-# end
-# _print_n_children(io::IO, o) = nothing
-
-# #-----------------------------------------------------------------------------# write_xml
-# write(x; kw...) = (io = IOBuffer(); write(io, x; kw...); String(take!(io)))
-
-# write(filename::AbstractString, x; kw...) = open(io -> write(io, x; kw...), filename, "w")
-
-# function write(io::IO, x, ctx::Vector{Bool}=[false]; indentsize::Int=2, depth::Int=1)
-#     indent = ' ' ^ indentsize
-#     nodetype = XML.nodetype(x)
-#     tag = XML.tag(x)
-#     value = XML.value(x)
-#     children = XML.children(x)
-
-#     padding = indent ^ max(0, depth - 1)
-#     !ctx[end] && print(io, padding)
-
-#     if nodetype === Text
-#         print(io, value)
-
-#     elseif nodetype === Element
-#         push!(ctx, ctx[end])
-#         update_ctx!(ctx, x)
-#         print(io, '<', tag)
-#         _print_attrs(io, x)
-#         print(io, isempty(children) ? '/' : "", '>')
-#         if !isempty(children)
-#             if length(children) == 1 && XML.nodetype(only(children)) === Text
-#                 write(io, only(children), ctx; indentsize=0)
-#                 print(io, "</", tag, '>')
-#             else
-#                 !ctx[end] && println(io)
-#                 foreach(children) do child
-#                     write(io, child, ctx; indentsize, depth=depth + 1)
-#                     !ctx[end] && println(io)
-#                 end
-#                 print(io, !ctx[end] ? padding : "", "</", tag, '>')
-#             end
-#         end
-#         pop!(ctx)
-
-#     elseif nodetype === DTD
-#         print(io, "<!DOCTYPE ", value, '>')
-
-#     elseif nodetype === Declaration
-#         print(io, "<?xml")
-#         _print_attrs(io, x)
-#         print(io, "?>")
-
-#     elseif nodetype === ProcessingInstruction
-#         print(io, "<?", tag)
-#         _print_attrs(io, x)
-#         print(io, "?>")
-
-#     elseif nodetype === Comment
-#         print(io, "<!--", value, "-->")
-
-#     elseif nodetype === CData
-#         print(io, "<![CData[", value, "]]>")
-
-#     elseif nodetype === Document
-#         foreach(children) do child
-#             write(io, child, ctx; indentsize)
-#             !ctx[end] && println(io)
-#         end
-
-#     else
-#         error("Unreachable case reached during XML.write")
-#     end
-
-# end
-
 #-----------------------------------------------------------------------------# deprecations
 Base.@deprecate_binding simplevalue simple_value false
 Base.@deprecate_binding LazyNode Node false
@@ -1288,13 +920,6 @@ struct Raw
         See `?XML.Node` for the new API.""")
 end
 
-struct AbstractXMLNode
-    AbstractXMLNode(args...; kw...) = error("""
-        `XML.AbstractXMLNode` has been removed in XML.jl v0.4.
-        `Node` is no longer a subtype of an abstract type.
-        Dispatch on `Node` directly instead.""")
-end
-
 # Removed functions — informative errors
 const _REMOVED_LAZYNODE_MSG = """
     This function was part of the LazyNode API, which has been removed in XML.jl v0.4.

From 1844b16637178163d50489b3d37a9b26be1816dc Mon Sep 17 00:00:00 2001
From: Josh Day <emailjoshday@gmail.com>
Date: Thu, 5 Mar 2026 09:46:07 -0500
Subject: [PATCH 03/18] more test files

---
 test/data/complex_dtd.xml | 105 ++++++++++++++++++++++++++++++++++++++
 test/data/preserve.xml    |   5 ++
 2 files changed, 110 insertions(+)
 create mode 100644 test/data/complex_dtd.xml
 create mode 100644 test/data/preserve.xml

diff --git a/test/data/complex_dtd.xml b/test/data/complex_dtd.xml
new file mode 100644
index 0000000..cb69747
--- /dev/null
+++ b/test/data/complex_dtd.xml
@@ -0,0 +1,105 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!DOCTYPE test [
+<!-- ===== Bookstore DTD (complex demo) ===== -->
+
+<!-- Reusable parameter entities -->
+<!ENTITY % text  "(#PCDATA | em | code | xref | br)*">
+<!ENTITY % block "p | ul | ol | figure | table">
+
+<!-- Notations (used by unparsed entities) -->
+<!NOTATION jpeg SYSTEM "image/jpeg">
+<!NOTATION png  SYSTEM "image/png">
+
+<!-- Unparsed external entities (binary media) -->
+<!ENTITY cover1 SYSTEM "covers/b123.jpg" NDATA jpeg>
+<!ENTITY cover2 SYSTEM "covers/b456.png" NDATA png>
+
+<!ELEMENT catalog (metadata?, (book | magazine)+)>
+<!ATTLIST catalog
+          tier CDATA #FIXED "retail"
+          xml:lang CDATA #IMPLIED>
+
+<!ELEMENT metadata (publisher?, contact?)>
+<!ELEMENT publisher %text;>
+<!ELEMENT contact (email, phone?)>
+<!ELEMENT email (#PCDATA)>
+<!ELEMENT phone (#PCDATA)>
+
+<!ELEMENT book (title, subtitle?, authors, pubinfo, description?, section*, reviews?, related?)>
+<!ATTLIST book
+          id ID #REQUIRED
+          isbn CDATA #IMPLIED
+          format (hardcover | paperback | ebook) "paperback"
+          inStock (yes | no) #REQUIRED
+          xml:space (default | preserve) "default">
+
+<!ELEMENT magazine (title, issue, article+)>
+<!ATTLIST magazine id ID #REQUIRED>
+<!ELEMENT issue (#PCDATA)>
+
+<!ELEMENT title %text;>
+<!ELEMENT subtitle %text;>
+
+<!ELEMENT authors (author+)>
+<!ELEMENT author (name, affiliation?)>
+<!ATTLIST author id ID #IMPLIED>
+<!ELEMENT name %text;>
+<!ELEMENT affiliation %text;>
+
+<!ELEMENT pubinfo (publisher, year, price?, pages?)>
+<!ELEMENT year  (#PCDATA)>
+<!ELEMENT price (#PCDATA)>
+<!ELEMENT pages (#PCDATA)>
+
+<!ELEMENT description (%block;)*>
+
+<!ELEMENT section (title, (%block;)*, section*)>
+<!ATTLIST section id ID #IMPLIED>
+
+<!ELEMENT p  %text;>
+<!ELEMENT ul (li+)>
+<!ELEMENT ol (li+)>
+<!ELEMENT li %text;>
+
+<!ELEMENT figure (caption?, media)>
+<!ATTLIST figure
+          entity   ENTITY                 #IMPLIED   <!-- refers to cover1/cover2 -->
+          notation NOTATION (jpeg | png) #IMPLIED>
+<!ELEMENT caption %text;>
+<!ELEMENT media EMPTY>
+<!ATTLIST media
+          src CDATA #REQUIRED
+          alt CDATA #IMPLIED>
+
+<!ELEMENT table (thead?, tbody, tfoot?)>
+<!ELEMENT thead (tr+)>
+<!ELEMENT tbody (tr+)>
+<!ELEMENT tfoot (tr+)>
+<!ELEMENT tr (th | td)+>
+<!ELEMENT th %text;>
+<!ELEMENT td %text;>
+
+<!ELEMENT reviews (review+)>
+<!ELEMENT review (rating, p+)>
+<!ATTLIST review by IDREF #REQUIRED
+                  date CDATA #IMPLIED>
+<!ELEMENT rating EMPTY>
+<!ATTLIST rating value (poor | avg | good | excellent) #REQUIRED>
+
+<!ELEMENT related (seealso*)>
+<!ELEMENT seealso EMPTY>
+<!ATTLIST seealso ref IDREF #REQUIRED>
+
+<!ELEMENT article (title, authorrefs, p+)>
+<!ELEMENT authorrefs (authorref+)>
+<!ELEMENT authorref EMPTY>
+<!ATTLIST authorref ref IDREF #REQUIRED>
+
+<!-- Conditional section controlled by a parameter entity -->
+<!ENTITY % longdocs "INCLUDE">
+<![%longdocs;[
+  <!ELEMENT appendix (title, p+)>
+  <!ATTLIST appendix id ID #REQUIRED>
+]]>
+]>
diff --git a/test/data/preserve.xml b/test/data/preserve.xml
new file mode 100644
index 0000000..e77add1
--- /dev/null
+++ b/test/data/preserve.xml
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<root xml:space="preserve">
+    This node has preserved space
+    with <child xml:space="default">  default  </child> children.
+</root>

From b6f4d47112e630a9653e39a159c54d3caad0cfb0 Mon Sep 17 00:00:00 2001
From: Josh Day <emailjoshday@gmail.com>
Date: Thu, 5 Mar 2026 10:32:05 -0500
Subject: [PATCH 04/18] Add validation tests and remove legacy DTD/raw code

---
 .gitignore            |   5 +-
 README.md             |  51 ++++
 src/dtd.jl            | 141 -----------
 src/raw.jl            | 568 ------------------------------------------
 test/Project.toml     |   1 +
 test/runtests.jl      |  47 +++-
 test/test_libexpat.jl | 389 +++++++++++++++++++++++++++++
 test/test_pugixml.jl  | 308 +++++++++++++++++++++++
 test/test_w3c.jl      | 154 ++++++++++++
 9 files changed, 952 insertions(+), 712 deletions(-)
 delete mode 100644 src/dtd.jl
 delete mode 100644 src/raw.jl
 create mode 100644 test/test_libexpat.jl
 create mode 100644 test/test_pugixml.jl
 create mode 100644 test/test_w3c.jl

diff --git a/.gitignore b/.gitignore
index 432a8b1..929dfc2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 *Manifest.toml
-*generated_xsd.jl
-*.xml
 *.gz
+*.tar
 *.DS_Store
 *.claude
+test/data/w3c/
+benchmarks/data/
diff --git a/README.md b/README.md
index b446f61..be72afc 100644
--- a/README.md
+++ b/README.md
@@ -197,3 +197,54 @@ XML.jl doesn't automatically escape special characters (`<`, `>`, `&`, `"`, `'`)
 
 - `XML.escape(::String)` / `XML.unescape(::String)` -- transform strings.
 - `XML.escape!(::Node)` / `XML.unescape!(::Node)` -- transform an entire node tree in-place.
+
+<br>
+
+# Benchmarks
+
+Details in the benchmark file: [benchmarks.jl](benchmarks/benchmarks.jl)
+
+
+```
+============================================================
+  BENCHMARK RESULTS
+============================================================
+
+                      Parse (small) — median time (ms)
+
+     XML.jl  ■■■■■■ 0.031916
+      EzXML  ■■■■ 0.025125
+   LightXML  ■■■■■ 0.029959
+    XMLDict  ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 0.23125
+
+
+                      Write (small) — median time (ms)
+
+     XML.jl  ■■■■■■■■■■■■ 0.027667
+      EzXML  ■■■■ 0.010375
+   LightXML  ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 0.093584
+
+
+                       Collect tags — median time (ms)
+
+     XML.jl  ■■■■■■ 0.000579122
+      EzXML  ■■■■■■■■■■■■■■■■■■■■■■ 0.0021084
+   LightXML  ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 0.00360413
+```
+
+```julia
+versioninfo()
+# Julia Version 1.12.5
+# Commit 5fe89b8ddc1 (2026-02-09 16:05 UTC)
+# Build Info:
+#   Official https://julialang.org release
+# Platform Info:
+#   OS: macOS (arm64-apple-darwin24.0.0)
+#   CPU: 10 × Apple M1 Pro
+#   WORD_SIZE: 64
+#   LLVM: libLLVM-18.1.7 (ORCJIT, apple-m1)
+#   GC: Built with stock GC
+# Threads: 8 default, 1 interactive, 8 GC (on 8 virtual cores)
+# Environment:
+#   JULIA_NUM_THREADS = auto
+```
diff --git a/src/dtd.jl b/src/dtd.jl
deleted file mode 100644
index 58299f0..0000000
--- a/src/dtd.jl
+++ /dev/null
@@ -1,141 +0,0 @@
-# This is all a work in progress
-
-#-----------------------------------------------------------------------------# position_after
-function position_after(needle::Vector{UInt8}, haystack::Vector{UInt8}, i)
-    x = findnext(needle, haystack, i)
-    isnothing(x) ? nothing : x[end] + 1
-end
-
-position_after(needle::String, haystack::Vector{UInt8}, i) = position_after(Vector{UInt8}(needle), haystack, i)
-
-
-#-----------------------------------------------------------------------------# DeclaredElement
-struct DeclaredElement
-    name::String
-    content::String  # "ANY", "EMPTY", or "(children...)"
-    function DeclaredElement(name, content)
-        content in ("ANY", "EMPTY") || (content[1] == '('  && content[end] == ')') ||
-            error("DeclaredElement `content` must be 'ANY', 'EMPTY', or '(children...)'.  Got $content.")
-        new(name, content)
-    end
-end
-Base.show(io::IO, o::DeclaredElement) = print(io, "<!ELEMENT ", o.name, " ", o.content, ">")
-
-function get_declared_elements(data::Vector{UInt8})
-    i = position_after("<!ELEMENT", data, 1)
-    out = DeclaredElement[]
-    while !isnothing(i)
-        name, i = get_name(data, i + 1)
-        i = findnext(!isspace, data, i)
-        if data[i] == UInt8('(')
-            j = findnext(==(UInt8(')')), data, i + 1)
-            content = String(data[i:j])
-        else
-            content, i = get_name(data, i)
-        end
-        push!(out, DeclaredElement(name, content))
-        i = position_after("<!ELEMENT", data, i)
-    end
-    return out
-end
-
-#-----------------------------------------------------------------------------# DeclaredAttribute
-struct DeclaredAttribute
-    element_name::String
-    attribute_name::String
-    attribute_type::String
-    attribute_value::String
-end
-Base.show(io::IO, o::DeclaredAttribute) = print(io, "<!ATTLIST ", o.element_name, " ", o.attribute_name, " ", o.attribute_type, " ", o.attribute_value, ">")
-
-
-function get_declared_attributes(data)
-    i = position_after("<!ATTLIST", data, 1)
-    out = DeclaredAttribute[]
-    while !isnothing(i)
-        element_name, i = get_name(data, i)
-        attribute_name, i = get_name(data, i)
-        i = findnext(!isspace, data, i)
-        attribute_type = if data[i] == UInt('(')
-            j = findnext(==(UInt8(')')), data, i)
-            String(data[i:j])
-            i = j + 1
-        else
-            nm, i = get_name(data, i)
-            nm
-        end
-        i = findnext(!isspace, data, i)
-        is_hash = data[i] == UInt8('#')
-        val, i = get_name(data, i)
-        attribute_value = is_hash ? '#' * val : val
-        push!(out, DeclaredAttribute(element_name, attribute_name, attribute_type, attribute_value))
-        i = position_after("<!ATTLIST", data, i)
-    end
-    return out
-end
-
-#-----------------------------------------------------------------------------# DeclaredEntity
-struct DeclaredEntity
-    name::String
-    external::Bool
-    value::String
-end
-function Base.show(io::IO, o::DeclaredEntity)
-    print(io, "<!ENTITY ", o.name, " ", o.external ? "SYSTEM" : "", repr(o.value), ">")
-end
-
-function get_declared_entities(data)
-    i = position_after("<!ENTITY", data, 1)
-    out = DeclaredEntity[]
-    while !isnothing(i)
-        name, i = get_name(data, i)
-        value, i = get_name(data, i)
-        external = value == "SYSTEM"
-        if external
-            value, i = get_name(data, i)
-        end
-        push!(out, DeclaredEntity(name, external, value))
-        i = position_after("<!ENTITY", data, i)
-    end
-    return out
-end
-
-#-----------------------------------------------------------------------------# DTDBody
-struct DTDBody
-    elements::Vector{DeclaredElement}
-    attributes::Vector{DeclaredAttribute}
-    entities::Vector{DeclaredEntity}
-end
-
-function Base.show(io::IO, o::DTDBody)
-    printstyled(io, "DTDBody\n", color=:light_cyan)
-    printstyled(io, "   DeclaredElements (", length(o.elements), ")\n", color=:light_green)
-    foreach(x -> println(io, "        ", x), o.elements)
-    printstyled(io, "    DeclaredAttributes (", length(o.attributes), ")\n", color=:light_green)
-    foreach(x -> println(io, "        ", x), o.attributes)
-    printstyled(io, "    DeclaredEntities (", length(o.entities), ")\n", color=:light_green)
-    foreach(x -> println(io, "        ", x), o.entities)
-end
-
-
-function DTDBody(data::Vector{UInt8}, file = false)
-    file && @goto isfile
-    i = position_after("<!DOCTYPE", data, 1)
-    root, i = get_name(data, i)
-
-    i = findnext(==(UInt8('[')), data, i)
-    isnothing(i) && return DTDBody(root, [], [], [])
-
-    @label isfile
-    elements = get_declared_elements(data)
-    attributes = get_declared_attributes(data)
-    entities = get_declared_entities(data)
-    return DTDBody(root, elements, attributes, entities)
-end
-
-
-Base.read(filename::String, ::Type{DTDBody}) = DTDBody(read(filename), true)
-Base.read(io::IO, ::Type{DTDBody}) = DTDBody(read(io), true)
-
-Base.parse(s::AbstractString, ::Type{DTDBody}) = DTDBody(Vector{UInt8}(s))
-Base.parse(::Type{DTDBody}, s::AbstractString) = parse(s, DTDBody)
diff --git a/src/raw.jl b/src/raw.jl
deleted file mode 100644
index 29d0a10..0000000
--- a/src/raw.jl
+++ /dev/null
@@ -1,568 +0,0 @@
-#-----------------------------------------------------------------------------# RawType
-"""
-    RawType:
-    - RawText                   # text
-    - RawComment                # <!-- ... -->
-    - RawCData                  # <![CData[...]]>
-    - RawDeclaration            # <?xml attributes... ?>
-    - RawProcessingInstruction  # <?NAME attributes... ?>
-    - RawDTD                    # <!DOCTYPE ...>
-    - RawElementOpen            # <NAME attributes... >
-    - RawElementClose           # </NAME>
-    - RawElementSelfClosed      # <NAME attributes... />
-    - RawDocument               # Something to initialize with (not really used)
-"""
-@enum(RawType, RawDocument, RawText, RawComment, RawCData, RawProcessingInstruction,
-    RawDeclaration, RawDTD, RawElementOpen, RawElementClose, RawElementSelfClosed)
-
-@inline nodetype(x::RawType) =
-    x === RawElementOpen ? Element :
-    x === RawElementClose ? Element :
-    x === RawElementSelfClosed ? Element :
-    x === RawText ? Text :
-    x === RawComment ? Comment :
-    x === RawCData ? CData :
-    x === RawDeclaration ? Declaration :
-    x === RawDTD ? DTD :
-    x === RawProcessingInstruction ? ProcessingInstruction :
-    x === RawDocument ? Document :
-    nothing
-
-#-----------------------------------------------------------------------------# Raw
-"""
-    Raw(filename::String)
-
-Create an iterator over raw chunks of data in an XML file.  Each chunk of data represents one of:
-
-    - RawDocument                # Only used to initialize the iterator state.
-    - RawText                    # text
-    - RawComment                 # <!-- ... -->
-    - RawCData                   # <![CData[...]]>
-    - RawDeclaration             # <?xml attributes... ?>
-    - RawProcessingInstruction   # <?NAME attributes... ?>
-    - RawDTD                     # <!DOCTYPE ...>
-    - RawElementOpen             # <NAME attributes... >
-    - RawElementClose            # </NAME>
-    - RawElementSelfClosed       # <NAME attributes... />
-
-Useful functions:
-
-    - view(o::Raw) --> view of the Vector{UInt8} chunk.
-    - String(o::Raw) --> String of the chunk.
-    - next(o::Raw) --> Raw of the next chunk (or `nothing`).
-    - prev(o::Raw) --> Raw of the previous chunk (or `nothing`).
-    - tag(o::Raw) --> String of the tag name (or `nothing`).
-    - attributes(o::Raw) --> OrderedDict{String, String} of the attributes (or `nothing`).
-    - value(o::Raw) --> String of the value (or `nothing`).
-    - children(o::Raw) --> Vector{Raw} of the children (or `nothing`).
-    - parent(o::Raw) --> Raw of the parent (or `nothing`)
-    - depth(o::Raw) --> Int of the depth of the node in the XML DOM.
-"""
-struct Raw
-    type::RawType
-    depth::Int
-    pos::Int
-    len::Int
-    data::Vector{UInt8}
-    ctx::Vector{Bool} # Context for xml:space (Vector to support inheritance of context)
-    has_xml_space::Bool # Whether data contains `xml:space` attribute at least once
-end
-function Raw(data::Vector{UInt8})#, ctx::Vector{Bool}=Bool[false])
-    needle = Vector{UInt8}("xml:space")
-    has_xml_space = findfirst(needle, data) !== nothing
-    return Raw(RawDocument, 0, 0, 0, data, [false], has_xml_space)
-end
-function Raw(data::Vector{UInt8}, has_xml_space::Bool, ctx::Vector{Bool}=Bool[false])
-    return Raw(RawDocument, 0, 0, 0, data, ctx, has_xml_space)
-end
-
-const _RAW_INDEX = WeakKeyDict{Vector{UInt8}, Any}()
-
-struct _TokRec
-    type::RawType
-    depth::Int
-    pos::Int
-    len::Int
-    ctx::Vector{Bool}
-end
-
-mutable struct _Index
-    recs::Vector{_TokRec}
-    last_raw::Raw
-    built_end::Int
-end
-
-Base.read(filename::String, ::Type{Raw}) = isfile(filename) ?
-                                           Raw(Mmap.mmap(filename)) :
-                                           error("File \"$filename\" does not exist.")
-
-Base.read(io::IO, ::Type{Raw}) = Raw(read(io))
-
-Base.parse(x::AbstractString, ::Type{Raw}) = Raw(Vector{UInt8}(x))
-
-# Mostly for debugging
-Base.peek(o::Raw, n::Int) = String(view(o.data[o.pos+o.len+1:min(end, o.pos + o.len + n + 1)]))
-
-function Base.show(io::IO, o::Raw)
-    print(io, o.type, ':', o.depth, " (pos=", o.pos, ", len=", o.len, ")")
-    o.len > 0 && printstyled(io, ": ", String(o); color=:light_green)
-end
-function Base.:(==)(a::Raw, b::Raw)
-    a.type == b.type && a.depth == b.depth && a.pos == b.pos && a.len == b.len && a.data === b.data && a.ctx == b.ctx && a.has_xml_space == b.has_xml_space
-end
-
-Base.view(o::Raw) = view(o.data, o.pos:o.pos+o.len)
-Base.String(o::Raw) = String(view(o))
-
-Base.IteratorSize(::Type{Raw}) = Base.SizeUnknown()
-Base.eltype(::Type{Raw}) = Raw
-
-function Base.iterate(o::Raw, state=o)
-    n = next(state)
-    return isnothing(n) ? nothing : (n, n)
-end
-
-is_node(o::Raw) = o.type !== RawElementClose
-xml_nodes(o::Raw) = Iterators.Filter(is_node, o)
-
-#-----------------------------------------------------------------------------# get_name
-is_name_start_char(x::UInt8) = x in UInt8('A'):UInt8('Z') || x in UInt8('a'):UInt8('z') || x == UInt8('_')
-is_name_char(x::UInt8) = is_name_start_char(x) || x in UInt8('0'):UInt8('9') || x == UInt8('-') || x == UInt8('.') || x == UInt8(':')
-
-name_start(data, i) = findnext(is_name_start_char, data, i)
-name_stop(data, i) = findnext(!is_name_char, data, i) - 1
-
-function get_name(data, i)
-    i = name_start(data, i)
-    j = name_stop(data, i)
-    @views String(data[i:j]), j + 1
-end
-
-#-----------------------------------------------------------------------------# get_attributes
-# starting at position i, return attributes up until the next '>' or '?' (DTD)
-function get_attributes(data, i, j)
-    i = name_start(data, i)
-    (isnothing(j) || isnothing(i) || i > j) && return nothing
-    out = OrderedDict{String,String}()
-    while !isnothing(i) && i < j
-        key, i = get_name(data, i)
-        # get quotechar the value is wrapped in (either ' or ")
-        i = findnext(x -> x === UInt8('"') || x === UInt8('''), data, i + 1)
-        quotechar = data[i]
-        i2 = findnext(==(quotechar), data, i + 1)
-        @views value = String(data[i+1:i2-1])
-        out[key] = value
-        i = name_start(data, i2)
-    end
-    return out
-end
-
-# ----------------------------------------------------------------------------# Utilities supporting prev
-function _get_or_init_index(o::Raw)
-    idx = get(_RAW_INDEX, o.data, nothing)
-    if idx === nothing
-        start = Raw(o.data)  # fresh RawDocument
-        _RAW_INDEX[o.data] = _Index(_TokRec[], start, 0)
-        idx = _RAW_INDEX[o.data]
-    end
-    return idx
-end
-function _ensure_index_upto!(o::Raw, target_pos::Int)
-    idx = _get_or_init_index(o)
-    r = idx.last_raw
-    while true
-        n = next(r)
-        if n === nothing
-            idx.built_end = typemax(Int)
-            idx.last_raw = r
-            return idx
-        end
-        push!(idx.recs, _TokRec(n.type, n.depth, n.pos, n.len, copy(n.ctx)))
-        endpos = n.pos + n.len
-        idx.built_end = endpos
-        idx.last_raw = n
-        r = n
-        if endpos >= target_pos
-            return idx
-        end
-    end
-end
-function _find_prev_token(recs::Vector{_TokRec}, p::Int)
-    lo, hi = 1, length(recs)
-    ans = 0
-    while lo <= hi
-        mid = (lo + hi) >>> 1
-        endpos = recs[mid].pos + recs[mid].len
-        if endpos < p + 1
-            ans = mid
-            lo = mid + 1
-        else
-            hi = mid - 1
-        end
-    end
-    return ans == 0 ? nothing : recs[ans]
-end
-
-#-----------------------------------------------------------------------------# update xml:space context
-# check attributes for xml:space and update ctx if necessary
-function get_ctx(o)
-    att = attributes(o)
-    if !isnothing(att) && haskey(att, "xml:space")
-        if att["xml:space"] == "preserve"
-            return true
-        elseif att["xml:space"] == "default"
-            return false
-        else
-            error("Invalid value for xml:space attribute: $(att["xml:space"]).  Must be 'preserve' or 'default'.")
-        end
-    end
-    return nothing
-end
-function update_ctx!(ctx, o)
-    new_ctx = get_ctx(o)
-    if new_ctx !== nothing
-        ctx[end] = new_ctx
-    end
-    return nothing
-end
-
-#-----------------------------------------------------------------------------# interface
-"""
-    nodetype(node) --> XML.NodeType
-
-Return the `XML.NodeType` of the node.
-"""
-nodetype(o::Raw) = nodetype(o.type)
-
-"""
-    tag(node) --> String or Nothing
-
-Return the tag name of `Element` and `PROCESSING_INSTRUCTION` nodes.
-"""
-function tag(o::Raw)
-    o.type ∉ [RawElementOpen, RawElementClose, RawElementSelfClosed, RawProcessingInstruction] && return nothing
-    return get_name(o.data, o.pos + 1)[1]
-end
-
-"""
-    attributes(node) --> OrderedDict{String, String} or Nothing
-
-Return the attributes of `Element`, `Declaration`, or `ProcessingInstruction` nodes.
-"""
-function attributes(o::Raw)
-    if o.type === RawElementOpen || o.type === RawElementSelfClosed || o.type === RawProcessingInstruction
-        i = o.pos
-        i = name_start(o.data, i)
-        i = name_stop(o.data, i)
-        get_attributes(o.data, i + 1, o.pos + o.len)
-    elseif o.type === RawDeclaration
-        get_attributes(o.data, o.pos + 6, o.pos + o.len)
-    else
-        nothing
-    end
-end
-
-"""
-    value(node) --> String or Nothing
-
-Return the value of `Text`, `CData`, `Comment`, or `DTD` nodes.
-"""
-function value(o::Raw)
-    if o.type === RawText
-        String(o)
-    elseif o.type === RawCData
-        String(view(o.data, o.pos+length("<![CData["):o.pos+o.len-3))
-    elseif o.type === RawComment
-        String(view(o.data, o.pos+length("<!--"):o.pos+o.len-3))
-    elseif o.type === RawDTD
-        String(view(o.data, o.pos+length("<!DOCTYPE "):o.pos+o.len-1))
-    else
-        nothing
-    end
-end
-
-"""
-    children(node) --> Vector{typeof(node)}
-
-Return the children the node.  Will only be nonempty for `Element` and `Document` nodes.
-"""
-function children(o::Raw)
-    if o.type === RawElementOpen || o.type === RawDocument
-        depth = o.depth
-        out = Raw[]
-        for item in xml_nodes(o)
-            if item.depth == depth + 1
-                push!(out, item)
-            end
-            item.depth == depth && break
-            o.type === RawDocument && item.depth == 2 && break # break if we've seen the doc root
-        end
-        out
-    else
-        Raw[]
-    end
-end
-
-"""
-    depth(node) --> Int
-
-Return the depth of the node.  Will be `0` for `Document` nodes.  Not defined for `XML.Node`.
-"""
-function depth(o::Raw)
-    o.depth
-end
-
-"""
-    parent(node) --> typeof(node), Nothing
-
-Return the parent of the node.  Will be `nothing` for `Document` nodes.  Not defined for `XML.Node`.
-"""
-function parent(o::Raw)
-    depth = o.depth
-    depth === 0 && return nothing
-    p = prev(o)
-    while p.depth >= depth
-        p = prev(p)
-    end
-    return p
-end
-
-#-----------------------------------------------------------------------------# next Raw
-# isspace(x::UInt8) = Base.isspace(Char(x))
-
-# XML whitespace per XML 1.0/1.1 production S:
-#   S ::= (#x20 | #x9 | #xD | #xA)+
-@inline xml_isspace(b::UInt8)::Bool = (b == 0x20) | (b == 0x09) | (b == 0x0A) | (b == 0x0D)
-
-"""
-    next(node) --> typeof(node) or Nothing
-
-Return the next node in the document during depth-first traversal.  Depth-first is the order you
-would visit nodes by reading top-down through an XML file.  Not defined for `XML.Node`.
-"""
-function next(o::Raw)
-    if o.has_xml_space # using xml:space context at least once in data
-        return next_xml_space(o)
-    else # not using xml:space context at all (same as v0.3.5)
-        return next_no_xml_space(o)
-    end
-end
-
-function next_xml_space(o::Raw)
-    i = o.pos + o.len + 1
-    depth = o.depth
-    data = o.data
-    type = o.type
-    has_xml_space = o.has_xml_space
-    ctx = copy(o.ctx)
-    last_type = type
-    k = findnext(!xml_isspace, data, i)
-    if isnothing(k)
-        return nothing
-    end
-    if last_type === RawElementOpen || last_type === RawDocument
-        depth += 1
-        push!(ctx, ctx[end])  # inherit the xml:space context from parent
-        last_type === RawElementOpen && update_ctx!(ctx, o) # check attributes for xml:space and update if necessary
-    end
-    i = ctx[end] ? i : k
-    b = i > 1 ? Char(o.data[i-1]) : Char('<')
-    c = Char(o.data[i])
-    d = Char(o.data[k+1])
-    if c !== '<' || ctx[end] && c === '<' && b === ' ' && last_type === RawElementOpen && d === '/'
-        type = RawText
-        j = findnext(==(UInt8('<')), data, i) - 1
-        j = ctx[end] ? j : findprev(!xml_isspace, data, j) # preserving whitespace if needed
-        if last_type === RawElementClose || last_type === RawElementSelfClosed|| last_type === RawDocument
-            # Maybe drop pure-whitespace inter-element text nodes?
-            # (e.g. whitespace between a closing and an opening tag which would otherwise make an orphan text node)
-            #if all(xml_isspace, @view data[i:j]) && depth > 1
-            #    return next(Raw(type, depth, j, 0, data, ctx, has_xml_space))
-            #end
-        end
-    else
-        i = k
-        j = k + 1
-        if c === '<'
-            c2 = Char(o.data[i+1])
-            if c2 === '!'
-                c3 = Char(o.data[i+2])
-                if c3 === '-'
-                    type = RawComment
-                    j = findnext(Vector{UInt8}("-->"), data, i)[end]
-                elseif c3 === '['
-                    type = RawCData
-                    j = findnext(Vector{UInt8}("]]>"), data, i)[end]
-                elseif c3 === 'D' || c3 == 'd'
-                    type = RawDTD
-                    j = findnext(==(UInt8('>')), data, i)
-                    while sum(==(UInt8('>')), @view data[k:j]) != sum(==(UInt8('<')), @view data[i:j])
-                        j = findnext(==(UInt8('>')), data, j + 1)
-                    end
-                end
-            elseif c2 === '?'
-                if get_name(data, i + 2)[1] == "xml"
-                    type = RawDeclaration
-                else
-                    type = RawProcessingInstruction
-                end
-                j = findnext(Vector{UInt8}("?>"), data, i)[end]
-            elseif c2 === '/'
-                type = RawElementClose
-                depth -= 1
-                pop!(ctx) # revert to parent xml:space context
-                j = findnext(==(UInt8('>')), data, i)
-            else
-                j = findnext(==(UInt8('>')), data, i)
-                if data[j-1] === UInt8('/')
-                    type = RawElementSelfClosed
-                else
-                    type = RawElementOpen
-                end
-            end
-        end
-    end
-    return Raw(type, depth, i, j - i, data, ctx, has_xml_space)
-end
-
-function next_no_xml_space(o::Raw) # same as v0.3.5
-    i = o.pos + o.len + 1
-    depth = o.depth
-    data = o.data
-    type = o.type
-    has_xml_space = o.has_xml_space
-    ctx = [false]
-    i = findnext(!xml_isspace, data, i)
-    if isnothing(i)
-        return nothing
-    end
-    if type === RawElementOpen || type === RawDocument
-        depth += 1
-    end
-    c = Char(o.data[i])
-    d = Char(o.data[i+1])
-    if c !== '<'
-        type = RawText
-        j = findnext(==(UInt8('<')), data, i) - 1
-        j = findprev(!xml_isspace, data, j)   # "rstrip"
-    elseif c === '<'
-        c2 = Char(o.data[i+1])
-        if c2 === '!'
-            c3 = Char(o.data[i+2])
-            if c3 === '-'
-                type = RawComment
-                j = findnext(Vector{UInt8}("-->"), data, i)[end]
-            elseif c3 === '['
-                type = RawCData
-                j = findnext(Vector{UInt8}("]]>"), data, i)[end]
-            elseif c3 === 'D' || c3 == 'd'
-                type = RawDTD
-                j = findnext(==(UInt8('>')), data, i)
-                while sum(==(UInt8('>')), @view data[i:j]) != sum(==(UInt8('<')), @view data[i:j])
-                    j = findnext(==(UInt8('>')), data, j + 1)
-                end
-            end
-        elseif c2 === '?'
-            if get_name(data, i + 2)[1] == "xml"
-                type = RawDeclaration
-            else
-                type = RawProcessingInstruction
-            end
-            j = findnext(Vector{UInt8}("?>"), data, i)[end]
-        elseif c2 === '/'
-            type = RawElementClose
-            depth -= 1
-            j = findnext(==(UInt8('>')), data, i)
-        else
-            j = findnext(==(UInt8('>')), data, i)
-            if data[j-1] === UInt8('/')
-                type = RawElementSelfClosed
-            else
-                type = RawElementOpen
-            end
-        end
-    end
-    return Raw(type, depth, i, j - i, data, ctx, has_xml_space)
-end
-
-#-----------------------------------------------------------------------------# prev Raw
-"""
-    prev(node) --> typeof(node), Nothing, or Missing (only for XML.Node)
-
-Return the previous node in the document during depth-first traversal.  Not defined for `XML.Node`.
-"""
-function prev(o::Raw)
-    if o.has_xml_space # using xml:space context at least once in data
-        return prev_xml_space(o)
-    else # not using xml:space context at all (same as v0.3.5)
-        return prev_no_xml_space(o)
-    end
-end
-
-function prev_xml_space(o::Raw)
-    o.type === RawDocument && return nothing
-
-    idx = _ensure_index_upto!(o, o.pos - 1)
-    rec = _find_prev_token(idx.recs, o.pos - 1)
-    if rec === nothing
-        return Raw(o.data, o.has_xml_space, copy(o.ctx))
-    end
-    return Raw(rec.type, rec.depth, rec.pos, rec.len, o.data, copy(rec.ctx), o.has_xml_space)
-end
-function prev_no_xml_space(o::Raw) # same as v0.3.5
-    depth = o.depth
-    data = o.data
-    type = o.type
-    has_xml_space = o.has_xml_space
-    ctx = has_xml_space ? copy(o.ctx) : [false]
-    type === RawDocument && return nothing
-    j = o.pos - 1
-    j = findprev(!xml_isspace, data, j)
-    if isnothing(j)
-        return Raw(data, has_xml_space, ctx)  # RawDocument
-    end
-    c = Char(o.data[j])
-    next_type = type
-    if c !== '>' # text
-        type = RawText
-        i = findprev(==(UInt8('>')), data, j) + 1
-        i = findnext(!xml_isspace, data, i)  # "lstrip"
-    elseif c === '>'
-        c2 = Char(o.data[j-1])
-        if c2 === '-'
-            type = RawComment
-            i = findprev(Vector{UInt8}("<--"), data, j)[1]
-        elseif c2 === ']'
-            type = RawCData
-            i = findprev(Vector{UInt8}("<![CData["), data, j)[1]
-        elseif c2 === '?'
-            i = findprev(Vector{UInt8}("<?"), data, j)[1]
-            if get_name(data, i + 2)[1] == "xml"
-                type = RawDeclaration
-            else
-                type = RawProcessingInstruction
-            end
-        else
-            i = findprev(==(UInt8('<')), data, j)
-            char = Char(data[i+1])
-            if char === '/'
-                type = RawElementClose
-            elseif char === '!'
-                type = DTD
-            elseif isletter(char) || char === '_'
-                type = Char(o.data[j-2]) === '/' ? RawElementSelfClosed : RawElementOpen
-            else
-                error("Should be unreachable.  Unexpected data: <$char ... $c3$c2$c1>.")
-            end
-        end
-    else
-        error("Unreachable reached in XML.prev")
-    end
-    if type !== RawElementOpen && next_type === RawElementClose
-        depth += 1
-    elseif type === RawElementOpen && next_type !== RawElementClose
-        depth -= 1
-    end
-    return Raw(type, depth, i, j - i, data, ctx, has_xml_space)
-end
-
diff --git a/test/Project.toml b/test/Project.toml
index d4883bd..c1703f7 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,4 +1,5 @@
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
 Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
+Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/test/runtests.jl b/test/runtests.jl
index 1304245..ffc04c6 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1119,8 +1119,9 @@ end
         joinpath(@__DIR__, "data", "xml.xsd"),
         joinpath(@__DIR__, "data", "kml.xsd"),
         joinpath(@__DIR__, "data", "books.xml"),
-        # example.kml uses invalid <![CData[...]]> (lowercase), skip it
+        # example.kml uses invalid <![CData[...]]> (mixed case instead of CDATA), skip roundtrip
         joinpath(@__DIR__, "data", "simple_dtd.xml"),
+        joinpath(@__DIR__, "data", "preserve.xml"),
     ])
 
     for path in all_files
@@ -1202,6 +1203,67 @@ end
         @test length(dtd_nodes) == 1
         @test contains(value(dtd_nodes[1]), "ENTITY")
     end
+
+    # The fixture-dependent testsets below skip themselves when their data file is
+    # absent. They branch with `if`/`else` instead of `isfile(path) || return`:
+    # a `@testset` body is not a function, so a `return` there would also abandon the
+    # enclosing testset and silently drop every testset that follows it.
+
+    @testset "preserve.xml" begin
+        path = joinpath(@__DIR__, "data", "preserve.xml")
+        if isfile(path)
+            doc = read(path, Node)
+            @test nodetype(doc) == Document
+
+            root = filter(x -> nodetype(x) == Element, children(doc))[1]
+            @test tag(root) == "root"
+            @test root["xml:space"] == "preserve"
+
+            # The root's single element child carries its own xml:space attribute.
+            child_els = filter(x -> nodetype(x) == Element, children(root))
+            @test length(child_els) == 1
+            @test tag(child_els[1]) == "child"
+            @test child_els[1]["xml:space"] == "default"
+        else
+            @test_skip isfile(path)
+        end
+    end
+
+    @testset "example.kml" begin
+        # example.kml spells the CDATA opener as <![CData[...]]> (mixed case), which is
+        # not valid XML, so reading the file is expected to throw an ArgumentError.
+        path = joinpath(@__DIR__, "data", "example.kml")
+        if isfile(path)
+            @test_throws ArgumentError read(path, Node)
+        else
+            @test_skip isfile(path)
+        end
+    end
+
+    @testset "tv.dtd" begin
+        path = joinpath(@__DIR__, "data", "tv.dtd")
+        if isfile(path)
+            # parse_dtd is handed the file's text wrapped as "TVSCHEDULE [ ... ]".
+            dtd_text = read(path, String)
+            pd = parse_dtd("TVSCHEDULE [\n" * dtd_text * "\n]")
+            @test pd.root == "TVSCHEDULE"
+
+            @test length(pd.elements) == 10
+            elem_names = map(e -> e.name, pd.elements)
+            @test "TVSCHEDULE" in elem_names
+            @test "CHANNEL" in elem_names
+            @test "PROGRAMSLOT" in elem_names
+            @test "TITLE" in elem_names
+
+            @test length(pd.attributes) == 5
+            attr_elements = map(a -> a.element, pd.attributes)
+            @test "TVSCHEDULE" in attr_elements
+            @test "CHANNEL" in attr_elements
+            @test "TITLE" in attr_elements
+        else
+            @test_skip isfile(path)
+        end
+    end
 end
 
 #==============================================================================#
diff --git a/test/test_libexpat.jl b/test/test_libexpat.jl
new file mode 100644
index 0000000..9ac8955
--- /dev/null
+++ b/test/test_libexpat.jl
@@ -0,0 +1,389 @@
+# Test cases inspired by libexpat (https://github.com/libexpat/libexpat, MIT license)
+# Translated from expat/tests/basic_tests.c
+
+using XML
+using XML: Node, nodetype, Document, Element, Comment, CData, ProcessingInstruction, Text, Declaration, DTD
+using XML: tag, value, children, attributes, simple_value
+using Test
+
+@testset "libexpat-inspired" begin
+
+    #==========================================================================#
+    #                         Character References                             #
+    #==========================================================================#
+    @testset "Decimal character references" begin
+        doc = parse("<doc>&#233;&#232;</doc>", Node)
+        @test simple_value(children(doc)[1]) == "éè"
+    end
+
+    @testset "Hex character references" begin
+        doc = parse("<doc>&#xE9;&#xE8;</doc>", Node)
+        @test simple_value(children(doc)[1]) == "éè"
+    end
+
+    @testset "Mixed char refs and text" begin
+        doc = parse("<doc>abc&#100;ef</doc>", Node)
+        @test simple_value(children(doc)[1]) == "abcdef"
+    end
+
+    @testset "Large Unicode code points" begin
+        # CJK Unified Ideograph
+        doc = parse("<doc>&#x4E16;&#x754C;</doc>", Node)
+        @test simple_value(children(doc)[1]) == "世界"
+    end
+
+    #==========================================================================#
+    #                          UTF-8 Content                                   #
+    #==========================================================================#
+    @testset "UTF-8 BOM" begin
+        bom = "\xef\xbb\xbf"
+        doc = parse(bom * "<e/>", Node)
+        @test nodetype(doc) == Document
+    end
+
+    @testset "UTF-8 element content" begin
+        doc = parse("<doc>Ünïcödé</doc>", Node)
+        @test simple_value(children(doc)[1]) == "Ünïcödé"
+    end
+
+    @testset "UTF-8 in attribute values" begin
+        doc = parse("<doc attr='café'/>", Node)
+        @test children(doc)[1]["attr"] == "café"
+    end
+
+    @testset "UTF-8 element names" begin
+        # XML.jl tokenizer does not yet support non-ASCII characters in element names,
+        # so parsing <données/> is expected to fail for now.
+        #
+        # `@test_broken` records Broken when its expression throws as well as when it
+        # is false, so no try/catch wrapper is needed. Once parsing succeeds this is
+        # reported as an unexpected pass and should be turned into a plain `@test`.
+        @test_broken parse("<données/>", Node) isa Node
+    end
+
+    @testset "Multi-byte UTF-8 sequences" begin
+        # 2-byte: ñ (U+00F1)
+        doc = parse("<doc>ñ</doc>", Node)
+        @test simple_value(children(doc)[1]) == "ñ"
+
+        # 3-byte: 世 (U+4E16)
+        doc = parse("<doc>世</doc>", Node)
+        @test simple_value(children(doc)[1]) == "世"
+
+        # 4-byte: 𤭢 (U+24B62)
+        doc = parse("<doc>𤭢</doc>", Node)
+        @test simple_value(children(doc)[1]) == "𤭢"
+    end
+
+    #==========================================================================#
+    #                            CDATA                                         #
+    #==========================================================================#
+    @testset "Basic CDATA" begin
+        doc = parse("<a><![CDATA[<greeting>Hello!</greeting>]]></a>", Node)
+        root = children(doc)[1]
+        cdata = filter(x -> nodetype(x) == CData, children(root))
+        @test length(cdata) == 1
+        @test value(cdata[1]) == "<greeting>Hello!</greeting>"
+    end
+
+    @testset "CDATA with special characters" begin
+        doc = parse("<a><![CDATA[&<>\"']]></a>", Node)
+        root = children(doc)[1]
+        cdata = filter(x -> nodetype(x) == CData, children(root))
+        @test value(cdata[1]) == "&<>\"'"
+    end
+
+    @testset "Multiple CDATA sections" begin
+        doc = parse("<a><![CDATA[first]]><![CDATA[second]]></a>", Node)
+        root = children(doc)[1]
+        cdata = filter(x -> nodetype(x) == CData, children(root))
+        @test length(cdata) == 2
+        @test value(cdata[1]) == "first"
+        @test value(cdata[2]) == "second"
+    end
+
+    @testset "CDATA containing ]]" begin
+        # ]] without > is valid inside CDATA
+        doc = parse("<a><![CDATA[data]]with]]brackets]]></a>", Node)
+        root = children(doc)[1]
+        cdata = filter(x -> nodetype(x) == CData, children(root))
+        @test value(cdata[1]) == "data]]with]]brackets"
+    end
+
+    @testset "CDATA errors" begin
+        @test_throws Exception parse("<a><![CDATA[no end", Node)
+        @test_throws Exception parse("<a><![CDATA[", Node)
+    end
+
+    #==========================================================================#
+    #                          XML Declaration                                 #
+    #==========================================================================#
+    @testset "XML declaration" begin
+        doc = parse("<?xml version='1.0'?><doc/>", Node)
+        decls = filter(x -> nodetype(x) == Declaration, children(doc))
+        @test length(decls) == 1
+        @test decls[1]["version"] == "1.0"
+    end
+
+    @testset "XML declaration with encoding" begin
+        doc = parse("<?xml version='1.0' encoding='UTF-8'?><doc/>", Node)
+        decls = filter(x -> nodetype(x) == Declaration, children(doc))
+        @test decls[1]["encoding"] == "UTF-8"
+    end
+
+    @testset "XML declaration with standalone" begin
+        doc = parse("<?xml version='1.0' standalone='yes'?><doc/>", Node)
+        decls = filter(x -> nodetype(x) == Declaration, children(doc))
+        @test decls[1]["standalone"] == "yes"
+    end
+
+    @testset "Full XML declaration" begin
+        doc = parse("<?xml version='1.0' encoding='UTF-8' standalone='no'?><doc/>", Node)
+        decls = filter(x -> nodetype(x) == Declaration, children(doc))
+        @test decls[1]["version"] == "1.0"
+        @test decls[1]["encoding"] == "UTF-8"
+        @test decls[1]["standalone"] == "no"
+    end
+
+    #==========================================================================#
+    #                        Processing Instructions                           #
+    #==========================================================================#
+    @testset "Processing instructions" begin
+        doc = parse("<?mypi data?><doc/>", Node)
+        pis = filter(x -> nodetype(x) == ProcessingInstruction, children(doc))
+        @test length(pis) == 1
+
+        doc = parse("<doc><?inner-pi some data?></doc>", Node)
+        root = children(doc)[1]
+        pis = filter(x -> nodetype(x) == ProcessingInstruction, children(root))
+        @test length(pis) == 1
+    end
+
+    @testset "PI with no data" begin
+        doc = parse("<?mypi?><doc/>", Node)
+        pis = filter(x -> nodetype(x) == ProcessingInstruction, children(doc))
+        @test length(pis) == 1
+    end
+
+    #==========================================================================#
+    #                           Comments                                       #
+    #==========================================================================#
+    @testset "Comments in various positions" begin
+        # In prolog
+        doc = parse("<!-- prolog comment --><doc/>", Node)
+        comments = filter(x -> nodetype(x) == Comment, children(doc))
+        @test length(comments) == 1
+
+        # Inside element
+        doc = parse("<doc><!-- inner --></doc>", Node)
+        root = children(doc)[1]
+        comments = filter(x -> nodetype(x) == Comment, children(root))
+        @test length(comments) == 1
+
+        # After root element
+        doc = parse("<doc/><!-- epilog -->", Node)
+        comments = filter(x -> nodetype(x) == Comment, children(doc))
+        @test length(comments) == 1
+    end
+
+    @testset "Comment with special content" begin
+        doc = parse("<doc><!-- <not-an-element> &not-entity; --></doc>", Node)
+        root = children(doc)[1]
+        comments = filter(x -> nodetype(x) == Comment, children(root))
+        @test contains(value(comments[1]), "<not-an-element>")
+        @test contains(value(comments[1]), "&not-entity;")
+    end
+
+    #==========================================================================#
+    #                          DTD / DOCTYPE                                    #
+    #==========================================================================#
+    @testset "DOCTYPE with internal subset" begin
+        xml = """<!DOCTYPE doc [
+<!ELEMENT doc (#PCDATA)>
+<!ATTLIST doc attr CDATA #IMPLIED>
+]>
+<doc attr="value">text</doc>"""
+        doc = parse(xml, Node)
+        @test nodetype(doc) == Document
+        dtd_nodes = filter(x -> nodetype(x) == DTD, children(doc))
+        @test length(dtd_nodes) == 1
+        root = filter(x -> nodetype(x) == Element, children(doc))[1]
+        @test tag(root) == "doc"
+        @test root["attr"] == "value"
+        text_nodes = filter(x -> nodetype(x) == Text, children(root))
+        @test length(text_nodes) == 1
+        @test value(text_nodes[1]) == "text"
+    end
+
+    @testset "DOCTYPE with SYSTEM" begin
+        doc = parse("<!DOCTYPE doc SYSTEM 'test.dtd'><doc/>", Node)
+        dtd_nodes = filter(x -> nodetype(x) == DTD, children(doc))
+        @test length(dtd_nodes) == 1
+    end
+
+    @testset "DOCTYPE with PUBLIC" begin
+        doc = parse("""<!DOCTYPE doc PUBLIC "-//Test//DTD Test//EN" "test.dtd"><doc/>""", Node)
+        dtd_nodes = filter(x -> nodetype(x) == DTD, children(doc))
+        @test length(dtd_nodes) == 1
+    end
+
+    #==========================================================================#
+    #                         Entity Handling                                  #
+    #==========================================================================#
+    @testset "Predefined entities" begin
+        doc = parse("<doc>&lt;&gt;&amp;&apos;&quot;</doc>", Node)
+        @test simple_value(children(doc)[1]) == "<>&'\""
+    end
+
+    @testset "Entities in attribute values" begin
+        doc = parse("<doc attr='&lt;value&gt;'/>", Node)
+        @test children(doc)[1]["attr"] == "<value>"
+    end
+
+    @testset "Mixed entities and text" begin
+        doc = parse("<doc>Hello &amp; welcome &lt;user&gt;</doc>", Node)
+        @test simple_value(children(doc)[1]) == "Hello & welcome <user>"
+    end
+
+    #==========================================================================#
+    #                        Attribute Edge Cases                              #
+    #==========================================================================#
+    @testset "Empty attribute value" begin
+        doc = parse("<doc attr=''/>", Node)
+        @test children(doc)[1]["attr"] == ""
+
+        doc = parse("""<doc attr=""/>""", Node)
+        @test children(doc)[1]["attr"] == ""
+    end
+
+    @testset "Attribute with entities" begin
+        doc = parse("<doc attr='a&amp;b'/>", Node)
+        @test children(doc)[1]["attr"] == "a&b"
+    end
+
+    @testset "Multiple attributes" begin
+        doc = parse("""<doc a="1" b="2" c="3" d="4" e="5"/>""", Node)
+        el = children(doc)[1]
+        @test el["a"] == "1"
+        @test el["b"] == "2"
+        @test el["c"] == "3"
+        @test el["d"] == "4"
+        @test el["e"] == "5"
+    end
+
+    @testset "Attribute error: duplicate" begin
+        @test_throws Exception parse("""<doc attr="1" attr="2"/>""", Node)
+    end
+
+    #==========================================================================#
+    #                        Nesting & Structure                               #
+    #==========================================================================#
+    @testset "Deeply nested elements" begin
+        xml = "<a><b><c><d><e><f><g><h><i><j>deep</j></i></h></g></f></e></d></c></b></a>"
+        doc = parse(xml, Node)
+        @test nodetype(doc) == Document
+    end
+
+    @testset "Many sibling elements" begin
+        # Build <root> with n numbered <item> children; check the count and both ends.
+        n = 100
+        body = join("<item>$k</item>" for k in 1:n)
+        doc = parse("<root>$body</root>", Node)
+        siblings = filter(c -> nodetype(c) == Element, children(children(doc)[1]))
+        @test length(siblings) == n
+        @test simple_value(siblings[1]) == "1"
+        @test simple_value(siblings[n]) == "$n"
+    end
+
+    @testset "Mismatched tags" begin
+        @test_throws Exception parse("<a></b>", Node)
+        @test_throws Exception parse("<a><b></a></b>", Node)
+        @test_throws Exception parse("<a><b><c></b></c></a>", Node)
+    end
+
+    @testset "Unclosed elements" begin
+        @test_throws Exception parse("<a><b>", Node)
+        @test_throws Exception parse("<a>text", Node)
+    end
+
+    #==========================================================================#
+    #                           Line Endings                                   #
+    #==========================================================================#
+    @testset "Various line endings in content" begin
+        # LF, CR, and CRLF line endings should all parse
+        doc = parse("<doc>line1\nline2</doc>", Node)
+        @test nodetype(doc) == Document
+
+        doc = parse("<doc>line1\rline2</doc>", Node)
+        @test nodetype(doc) == Document
+
+        doc = parse("<doc>line1\r\nline2</doc>", Node)
+        @test nodetype(doc) == Document
+    end
+
+    #==========================================================================#
+    #                          Empty Document Parts                            #
+    #==========================================================================#
+    @testset "Empty root element" begin
+        doc = parse("<doc/>", Node)
+        root = children(doc)[1]
+        @test tag(root) == "doc"
+        @test isempty(filter(x -> nodetype(x) == Element, children(root)))
+    end
+
+    @testset "Element with only whitespace" begin
+        doc = parse("<doc>   \n\t  </doc>", Node)
+        @test nodetype(doc) == Document
+    end
+
+    @testset "Element with only comments" begin
+        doc = parse("<doc><!-- c1 --><!-- c2 --></doc>", Node)
+        root = children(doc)[1]
+        els = filter(x -> nodetype(x) == Element, children(root))
+        @test isempty(els)
+        comments = filter(x -> nodetype(x) == Comment, children(root))
+        @test length(comments) == 2
+    end
+
+    #==========================================================================#
+    #                       Namespace-like Attributes                          #
+    #==========================================================================#
+    @testset "xmlns declarations" begin
+        doc = parse("""<doc xmlns="http://example.com" xmlns:ns="http://example.com/ns"><ns:child/></doc>""", Node)
+        root = children(doc)[1]
+        @test root["xmlns"] == "http://example.com"
+        @test root["xmlns:ns"] == "http://example.com/ns"
+        els = filter(x -> nodetype(x) == Element, children(root))
+        @test tag(els[1]) == "ns:child"
+    end
+
+    @testset "Namespaced attributes" begin
+        doc = parse("""<doc xml:lang="en" xml:space="preserve"/>""", Node)
+        root = children(doc)[1]
+        @test root["xml:lang"] == "en"
+        @test root["xml:space"] == "preserve"
+    end
+
+    #==========================================================================#
+    #                        Large Content                                     #
+    #==========================================================================#
+    @testset "Long attribute value" begin
+        long_val = repeat("x", 10_000)
+        doc = parse("<doc attr='$long_val'/>", Node)
+        @test children(doc)[1]["attr"] == long_val
+    end
+
+    @testset "Long text content" begin
+        long_text = repeat("Hello World! ", 1000)
+        doc = parse("<doc>$long_text</doc>", Node)
+        @test simple_value(children(doc)[1]) == long_text
+    end
+
+    @testset "Long CDATA" begin
+        long_cdata = repeat("data<>& ", 1000)
+        doc = parse("<doc><![CDATA[$long_cdata]]></doc>", Node)
+        root = children(doc)[1]
+        cdata = filter(x -> nodetype(x) == CData, children(root))
+        @test value(cdata[1]) == long_cdata
+    end
+end
diff --git a/test/test_pugixml.jl b/test/test_pugixml.jl
new file mode 100644
index 0000000..6e46d5a
--- /dev/null
+++ b/test/test_pugixml.jl
@@ -0,0 +1,308 @@
+# Test cases inspired by pugixml (https://github.com/zeux/pugixml, MIT license)
+# Translated from tests/test_parse.cpp and tests/test_xpath.cpp
+
+using XML
+using XML: Node, nodetype, Document, Element, Comment, CData, ProcessingInstruction, Text, Declaration
+using XML: tag, value, children, attributes, simple_value, xpath
+using Test
+
+@testset "pugixml-inspired" begin
+
+    #==========================================================================#
+    #                        Processing Instructions                           #
+    #==========================================================================#
+    @testset "PI parsing" begin
+        doc = parse("<?pi?><root/>", Node)
+        pis = filter(x -> nodetype(x) == ProcessingInstruction, children(doc))
+        @test length(pis) == 1
+
+        doc = parse("<?pi value?><root/>", Node)
+        pis = filter(x -> nodetype(x) == ProcessingInstruction, children(doc))
+        @test length(pis) == 1
+
+        doc = parse("<?target  \r\n\t  value ?><root/>", Node)
+        pis = filter(x -> nodetype(x) == ProcessingInstruction, children(doc))
+        @test length(pis) == 1
+    end
+
+    @testset "PI errors" begin
+        # XML.jl is lenient about incomplete PIs without a root element,
+        # but these should fail when embedded in a document
+        @test_throws Exception parse("<root><?</root>", Node)
+        @test_throws Exception parse("<root><?name</root>", Node)
+    end
+
+    #==========================================================================#
+    #                              Comments                                    #
+    #==========================================================================#
+    @testset "Comment parsing" begin
+        doc = parse("<!----><root/>", Node)
+        comments = filter(x -> nodetype(x) == Comment, children(doc))
+        @test length(comments) == 1
+        @test value(comments[1]) == ""
+
+        doc = parse("<!--value--><root/>", Node)
+        comments = filter(x -> nodetype(x) == Comment, children(doc))
+        @test value(comments[1]) == "value"
+
+        doc = parse("<root><!--multi\nline\ncomment--></root>", Node)
+        root = filter(x -> nodetype(x) == Element, children(doc))[1]
+        comments = filter(x -> nodetype(x) == Comment, children(root))
+        @test contains(value(comments[1]), "multi")
+    end
+
+    @testset "Comment errors" begin
+        @test_throws Exception parse("<!-", Node)
+        @test_throws Exception parse("<root><!--</root>", Node)
+        @test_throws Exception parse("<!--->", Node)
+    end
+
+    #==========================================================================#
+    #                              CDATA                                       #
+    #==========================================================================#
+    @testset "CDATA parsing" begin
+        doc = parse("<root><![CDATA[]]></root>", Node)
+        root = filter(x -> nodetype(x) == Element, children(doc))[1]
+        cdata = filter(x -> nodetype(x) == CData, children(root))
+        @test length(cdata) == 1
+        @test value(cdata[1]) == ""
+
+        doc = parse("<root><![CDATA[value]]></root>", Node)
+        root = filter(x -> nodetype(x) == Element, children(doc))[1]
+        cdata = filter(x -> nodetype(x) == CData, children(root))
+        @test value(cdata[1]) == "value"
+
+        # CDATA preserves markup characters
+        doc = parse("<root><![CDATA[<greeting>Hello!</greeting>]]></root>", Node)
+        root = filter(x -> nodetype(x) == Element, children(doc))[1]
+        cdata = filter(x -> nodetype(x) == CData, children(root))
+        @test value(cdata[1]) == "<greeting>Hello!</greeting>"
+    end
+
+    @testset "CDATA errors" begin
+        @test_throws Exception parse("<root><![", Node)
+        @test_throws Exception parse("<root><![CDATA[", Node)
+        @test_throws Exception parse("<root><![CDATA[data", Node)
+    end
+
+    #==========================================================================#
+    #                           Tag Parsing                                    #
+    #==========================================================================#
+    @testset "Self-closing tags" begin
+        doc = parse("<node/>", Node)
+        @test tag(children(doc)[1]) == "node"
+
+        doc = parse("<node />", Node)
+        @test tag(children(doc)[1]) == "node"
+
+        doc = parse("<node\n/>", Node)
+        @test tag(children(doc)[1]) == "node"
+    end
+
+    @testset "Tag hierarchy" begin
+        doc = parse("<node><n1><n2/></n1><n3><n4><n5/></n4></n3></node>", Node)
+        root = children(doc)[1]
+        @test tag(root) == "node"
+        root_els = filter(x -> nodetype(x) == Element, children(root))
+        @test length(root_els) == 2
+        @test tag(root_els[1]) == "n1"
+        @test tag(root_els[2]) == "n3"
+    end
+
+    @testset "Tag errors" begin
+        @test_throws Exception parse("<", Node)
+        @test_throws Exception parse("<node", Node)
+        @test_throws Exception parse("<node></nodes>", Node)
+        @test_throws Exception parse("<node>", Node)
+        @test_throws Exception parse("</node>", Node)
+    end
+
+    #==========================================================================#
+    #                        Attribute Parsing                                 #
+    #==========================================================================#
+    @testset "Attribute quotes" begin
+        doc = parse("<node id1='v1' id2=\"v2\"/>", Node)
+        el = children(doc)[1]
+        @test el["id1"] == "v1"
+        @test el["id2"] == "v2"
+    end
+
+    @testset "Attribute spaces around =" begin
+        doc = parse("<node id1='v1' id2 ='v2' id3= 'v3' id4 = 'v4' />", Node)
+        el = children(doc)[1]
+        @test el["id1"] == "v1"
+        @test el["id2"] == "v2"
+        @test el["id3"] == "v3"
+        @test el["id4"] == "v4"
+    end
+
+    @testset "Attribute errors" begin
+        @test_throws Exception parse("<node id", Node)
+        @test_throws Exception parse("<node id='/>", Node)
+        @test_throws Exception parse("<node id='value", Node)
+    end
+
+    #==========================================================================#
+    #                        Entity/Escape Handling                            #
+    #==========================================================================#
+    @testset "Predefined entities in attributes" begin
+        doc = parse("<node id='&lt;&gt;&amp;&apos;&quot;'/>", Node)
+        @test children(doc)[1]["id"] == "<>&'\""
+    end
+
+    @testset "Predefined entities in text" begin
+        doc = parse("<node>&lt;&gt;&amp;&apos;&quot;</node>", Node)
+        @test simple_value(children(doc)[1]) == "<>&'\""
+    end
+
+    @testset "Numeric character references" begin
+        doc = parse("<node>&#32;&#x20;</node>", Node)
+        @test simple_value(children(doc)[1]) == "  "
+    end
+
+    @testset "Unicode character references" begin
+        # Greek gamma
+        doc = parse("<node>&#x03B3;</node>", Node)
+        @test simple_value(children(doc)[1]) == "γ"
+
+        # Same char, lowercase hex
+        doc = parse("<node>&#x03b3;</node>", Node)
+        @test simple_value(children(doc)[1]) == "γ"
+    end
+
+    #==========================================================================#
+    #                           Whitespace                                     #
+    #==========================================================================#
+    @testset "Whitespace text nodes preserved" begin
+        doc = parse("<root>  <node>  </node>  </root>", Node)
+        root = children(doc)[1]
+        # Should have text nodes with whitespace
+        text_nodes = filter(x -> nodetype(x) == Text, children(root))
+        @test length(text_nodes) >= 1
+    end
+
+    @testset "PCDATA content" begin
+        doc = parse("<root>text content</root>", Node)
+        @test simple_value(children(doc)[1]) == "text content"
+    end
+
+    #==========================================================================#
+    #                        Unicode / CJK Content                             #
+    #==========================================================================#
+    @testset "Unicode element names (CJK)" begin
+        # XML.jl tokenizer does not yet support CJK characters in element/attribute
+        # names, so parsing <汉语> is expected to fail for now.
+        #
+        # `@test_broken` records Broken when its expression throws as well as when it
+        # is false, so no try/catch wrapper is needed. Once parsing succeeds this is
+        # reported as an unexpected pass and should be turned into a plain `@test`.
+        @test_broken parse("<汉语>世界</汉语>", Node) isa Node
+    end
+
+    @testset "Unicode text content" begin
+        doc = parse("<doc>Ünïcödé café naïve</doc>", Node)
+        @test simple_value(children(doc)[1]) == "Ünïcödé café naïve"
+    end
+
+    #==========================================================================#
+    #                        Mixed Content                                     #
+    #==========================================================================#
+    @testset "Mixed text, CDATA, comments" begin
+        xml = "<node>First text<!-- comment -->Second text<![CDATA[cdata]]>Last text</node>"
+        doc = parse(xml, Node)
+        root = children(doc)[1]
+        child_types = map(nodetype, children(root))
+        @test Text in child_types
+        @test Comment in child_types
+        @test CData in child_types
+    end
+
+    #==========================================================================#
+    #                        Complex Document                                  #
+    #==========================================================================#
+    @testset "Complex document with all node types" begin
+        xml = """<?xml version="1.0"?>
+<!DOCTYPE mesh SYSTEM "mesh.dtd">
+<!-- comment in prolog -->
+<?custom-pi data?>
+<mesh name="mesh_root">
+    <!-- inner comment -->
+    some text
+    <![CDATA[cdata content]]>
+    <node attr1="value1" attr2="value2" />
+    <node attr1="value2">
+        <innernode/>
+    </node>
+    <?include somedata?>
+</mesh>"""
+        doc = parse(xml, Node)
+        @test nodetype(doc) == Document
+
+        root_els = filter(x -> nodetype(x) == Element, children(doc))
+        @test length(root_els) == 1
+        mesh = root_els[1]
+        @test tag(mesh) == "mesh"
+        @test mesh["name"] == "mesh_root"
+
+        # Check inner content types
+        inner = children(mesh)
+        @test any(x -> nodetype(x) == Comment, inner)
+        @test any(x -> nodetype(x) == Text, inner)
+        @test any(x -> nodetype(x) == CData, inner)
+        @test any(x -> nodetype(x) == ProcessingInstruction, inner)
+
+        nodes = filter(x -> nodetype(x) == Element && tag(x) == "node", inner)
+        @test length(nodes) == 2
+        @test nodes[1]["attr1"] == "value1"
+        @test nodes[1]["attr2"] == "value2"
+    end
+
+    #==========================================================================#
+    #                             XPath                                        #
+    #==========================================================================#
+    @testset "XPath" begin
+        @testset "descendant with attribute predicate" begin
+            doc = parse("<a><b><c id='a'/></b><c id='b'/></a>", Node)
+            results = xpath(doc, "//c[@id='b']")
+            @test length(results) == 1
+            @test results[1]["id"] == "b"
+        end
+
+        @testset "child with attribute" begin
+            doc = parse("<a><b><c id='a'/></b><c id='b'/></a>", Node)
+            results = xpath(doc, "/a/c[@id]")
+            @test length(results) == 1
+            @test results[1]["id"] == "b"
+        end
+
+        @testset "wildcard with attribute predicate" begin
+            doc = parse("""<node><child1 attr1="v1" attr2="v2"/><child2 attr1="v1">test</child2></node>""", Node)
+            results = xpath(doc, "/node/*[@attr1]")
+            @test length(results) == 2
+        end
+
+        @testset "descendant-or-self with text()" begin
+            doc = parse("<a><b><c><d><e>deep</e></d></c></b></a>", Node)
+            results = xpath(doc, "//e/text()")
+            @test length(results) == 1
+            @test value(results[1]) == "deep"
+        end
+
+        @testset "positional predicate" begin
+            doc = parse("<root><a/><b/><c/></root>", Node)
+            results = xpath(doc, "/root/*[1]")
+            @test length(results) == 1
+            @test tag(results[1]) == "a"
+
+            results = xpath(doc, "/root/*[last()]")
+            @test length(results) == 1
+            @test tag(results[1]) == "c"
+        end
+
+        @testset "attribute predicate on nested elements" begin
+            doc = parse("""<node><child><subchild id="1"/></child><child><subchild id="2"/></child></node>""", Node)
+            results = xpath(doc, "//subchild[@id]")  # single predicate; one hit per <child>
+            @test length(results) == 2
+        end
+    end
+end
diff --git a/test/test_w3c.jl b/test/test_w3c.jl
new file mode 100644
index 0000000..16587ed
--- /dev/null
+++ b/test/test_w3c.jl
@@ -0,0 +1,154 @@
+# W3C XML Conformance Test Suite
+# https://www.w3.org/XML/Test/xmlts20130923.tar
+#
+# Test types:
+#   - "valid": well-formed XML that is also valid (should parse successfully)
+#   - "invalid": well-formed but not valid per DTD (should still parse — we're non-validating)
+#   - "not-wf": not well-formed XML (should fail to parse)
+#   - "error": optional errors (parser may or may not reject) — not run by this file
+#
+# We only run tests with ENTITIES="none" since XML.jl does not expand external entities.
+# We skip XML 1.1 tests (VERSION="1.1" or RECOMMENDATION="XML1.1").
+
+using XML
+using XML: Node, nodetype, Document
+using Test
+using Downloads: download
+using Tar
+
+const W3C_URL = "https://www.w3.org/XML/Test/xmlts20130923.tar"
+const W3C_DIR = joinpath(@__DIR__, "data", "w3c")
+const W3C_TAR = joinpath(@__DIR__, "data", "xmlts20130923.tar")
+
+function ensure_w3c_suite()
+    isdir(joinpath(W3C_DIR, "xmlconf")) && return
+    mkpath(W3C_DIR)
+    if !isfile(W3C_TAR)
+        @info "Downloading W3C XML Conformance Test Suite..."
+        download(W3C_URL, W3C_TAR)
+    end
+    @info "Extracting W3C XML Conformance Test Suite..."
+    open(W3C_TAR) do io
+        Tar.extract(io, W3C_DIR)
+    end
+end
+
+# Parse a test catalog XML and extract TEST entries
+function parse_catalog(catalog_path::String)
+    isfile(catalog_path) || return NamedTuple[]
+    doc = read(catalog_path, Node)
+    tests = NamedTuple[]
+    _collect_tests!(tests, doc, dirname(catalog_path))
+    return tests
+end
+
+function _collect_tests!(tests, node, base_dir)
+    for child in XML.children(node)
+        nodetype(child) !== XML.Element && continue
+        if XML.tag(child) == "TEST"
+            attrs = XML.attributes(child)
+            haskey(attrs, "URI") || continue
+            push!(tests, (
+                type = get(attrs, "TYPE", ""),
+                entities = get(attrs, "ENTITIES", ""),
+                id = get(attrs, "ID", ""),
+                uri = joinpath(base_dir, attrs["URI"]),
+                version = get(attrs, "VERSION", "1.0"),
+                recommendation = get(attrs, "RECOMMENDATION", ""),
+            ))
+        elseif XML.tag(child) == "TESTCASES"
+            # TESTCASES may have xml:base to adjust paths
+            sub_base = get(XML.attributes(child), "xml:base", "")
+            child_base = isempty(sub_base) ? base_dir : joinpath(base_dir, sub_base)
+            _collect_tests!(tests, child, child_base)
+        else
+            _collect_tests!(tests, child, base_dir)
+        end
+    end
+end
+
+function is_xml11(test)
+    test.version == "1.1" ||
+    test.recommendation == "XML1.1" ||
+    contains(test.recommendation, "XML1.1")
+end
+
+ensure_w3c_suite()
+
+# Catalogs for XML 1.0 tests
+const XMLCONF_DIR = joinpath(W3C_DIR, "xmlconf")
+const CATALOGS = filter(isfile, [
+    joinpath(XMLCONF_DIR, "xmltest", "xmltest.xml"),
+    joinpath(XMLCONF_DIR, "sun", "sun-valid.xml"),
+    joinpath(XMLCONF_DIR, "sun", "sun-invalid.xml"),
+    joinpath(XMLCONF_DIR, "sun", "sun-not-wf.xml"),
+    joinpath(XMLCONF_DIR, "sun", "sun-error.xml"),
+    joinpath(XMLCONF_DIR, "oasis", "oasis.xml"),
+    joinpath(XMLCONF_DIR, "ibm", "ibm_oasis_not-wf.xml"),
+    joinpath(XMLCONF_DIR, "ibm", "ibm_oasis_valid.xml"),
+    joinpath(XMLCONF_DIR, "ibm", "ibm_oasis_invalid.xml"),
+    joinpath(XMLCONF_DIR, "eduni", "errata-2e", "errata2e.xml"),
+    joinpath(XMLCONF_DIR, "eduni", "errata-3e", "errata3e.xml"),
+    joinpath(XMLCONF_DIR, "eduni", "errata-4e", "errata4e.xml"),
+    joinpath(XMLCONF_DIR, "eduni", "namespaces", "1.0", "rmt-ns10.xml"),
+    joinpath(XMLCONF_DIR, "eduni", "misc", "ht-bh.xml"),
+    joinpath(XMLCONF_DIR, "japanese", "japanese.xml"),
+])
+
+# Collect all tests
+all_tests = NamedTuple[]
+for catalog in CATALOGS
+    append!(all_tests, parse_catalog(catalog))
+end
+
+# Filter: only ENTITIES="none", skip XML 1.1
+xml10_tests = filter(t -> t.entities == "none" && !is_xml11(t), all_tests)
+
+valid_tests = filter(t -> t.type in ("valid", "invalid"), xml10_tests)
+notwf_tests = filter(t -> t.type == "not-wf", xml10_tests)
+
+@info "W3C tests: $(length(valid_tests)) valid/invalid, $(length(notwf_tests)) not-wf (from $(length(all_tests)) total)"
+
+@testset "W3C Conformance" begin
+    @testset "Well-formed documents should parse" begin
+        n_pass = 0
+        n_fail = 0
+        failures = String[]
+        for test in valid_tests
+            isfile(test.uri) || continue
+            try
+                doc = read(test.uri, Node)
+                @test nodetype(doc) == Document
+                n_pass += 1
+            catch e
+                n_fail += 1
+                push!(failures, "$(test.id): $e")
+            end
+        end
+        if n_fail > 0
+            @warn "W3C well-formed: $n_pass passed, $n_fail failed" failures=first(failures, 20)
+        end
+        @info "W3C well-formed: $n_pass / $(n_pass + n_fail) passed"
+    end
+
+    @testset "Not-well-formed documents should fail to parse" begin
+        n_pass = 0
+        n_fail = 0
+        failures = String[]
+        for test in notwf_tests
+            isfile(test.uri) || continue
+            try
+                read(test.uri, Node)
+                n_fail += 1
+                push!(failures, test.id)
+            catch
+                @test true
+                n_pass += 1
+            end
+        end
+        if n_fail > 0
+            @warn "W3C not-well-formed: $n_pass rejected, $n_fail incorrectly accepted" failures=first(failures, 20)
+        end
+        @info "W3C not-well-formed: $n_pass / $(n_pass + n_fail) correctly rejected"
+    end
+end

From 21f647dde00402d07ae20b247012016b852866ad Mon Sep 17 00:00:00 2001
From: Josh Day <emailjoshday@gmail.com>
Date: Thu, 5 Mar 2026 10:39:58 -0500
Subject: [PATCH 05/18] Update CI actions and add validation tests

---
 .github/workflows/CI.yml | 4 ++--
 test/runtests.jl         | 4 ++++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 5e073ac..03169ec 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -26,7 +26,7 @@ jobs:
           - os: macOS-latest
             arch: x86
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - uses: julia-actions/setup-julia@v2
         with:
           version: ${{ matrix.version }}
@@ -44,6 +44,6 @@ jobs:
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
       - uses: julia-actions/julia-processcoverage@v1
-      - uses: codecov/codecov-action@v1
+      - uses: codecov/codecov-action@v5
         with:
           file: lcov.info
diff --git a/test/runtests.jl b/test/runtests.jl
index ffc04c6..148488b 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2725,3 +2725,7 @@ end
         @test length(results) == 3
     end
 end
+
+include("test_pugixml.jl")
+include("test_libexpat.jl")
+include("test_w3c.jl")

From c67342730550e106ddb4560429d29ed12f664878 Mon Sep 17 00:00:00 2001
From: Josh Day <emailjoshday@gmail.com>
Date: Thu, 5 Mar 2026 10:47:20 -0500
Subject: [PATCH 06/18] update ci

---
 .github/workflows/CI.yml  | 6 +++++-
 test/test_remote_files.jl | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 03169ec..b52ab78 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -41,9 +41,13 @@ jobs:
             ${{ runner.os }}-test-${{ env.cache-name }}-
             ${{ runner.os }}-test-
             ${{ runner.os }}-
+      - uses: actions/cache@v4
+        with:
+          path: test/data/w3c
+          key: w3c-xmlconf-v20130923
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
       - uses: julia-actions/julia-processcoverage@v1
       - uses: codecov/codecov-action@v5
         with:
-          file: lcov.info
+          files: lcov.info
diff --git a/test/test_remote_files.jl b/test/test_remote_files.jl
index 21af565..ed2b3e3 100644
--- a/test/test_remote_files.jl
+++ b/test/test_remote_files.jl
@@ -11,7 +11,7 @@ using Test
 # internet, URL gone) is silently skipped — only parsing failures count as test
 # failures.
 #
-# Run standalone:  julia --project test/test_remote_files.jl
+# Not included in runtests.jl — run standalone:  julia --project test/test_remote_files.jl
 
 function _try_download(url::AbstractString)::Union{String, Nothing}
     try

From 46c5a310c5331844dc34725ccff6fdc73099b016 Mon Sep 17 00:00:00 2001
From: Josh Day <emailjoshday@gmail.com>
Date: Thu, 5 Mar 2026 11:47:26 -0500
Subject: [PATCH 07/18] Add XMark benchmark generator and expand benchmarks

---
 benchmarks/XMarkGenerator.jl | 381 +++++++++++++++++++++++++++++++++++
 benchmarks/benchmarks.jl     |  38 +++-
 2 files changed, 416 insertions(+), 3 deletions(-)
 create mode 100644 benchmarks/XMarkGenerator.jl

diff --git a/benchmarks/XMarkGenerator.jl b/benchmarks/XMarkGenerator.jl
new file mode 100644
index 0000000..f0b497d
--- /dev/null
+++ b/benchmarks/XMarkGenerator.jl
@@ -0,0 +1,381 @@
+"""
+    XMarkGenerator
+
+XMark-inspired XML benchmark data generator.  Produces well-formed XML documents modeling an
+internet auction site, following the XMark benchmark DTD structure.
+
+    include("XMarkGenerator.jl")
+    using .XMarkGenerator
+
+    xml = generate_xmark(1.0)               # return String (~14 MB)
+    generate_xmark("out.xml", 5.0)          # write to file (~68 MB)
+    generate_xmark(stdout, 0.1; seed=123)   # write to IO   (~1.4 MB)
+"""
+module XMarkGenerator
+
+using Random
+
+export generate_xmark
+
+#-----------------------------------------------------------------# Word lists
+const WORDS = [
+    "about", "above", "across", "after", "again", "against", "along", "already", "also",
+    "always", "among", "another", "answer", "around", "asked", "away", "back", "because",
+    "become", "been", "before", "began", "behind", "being", "below", "between", "body",
+    "book", "both", "brought", "build", "built", "business", "came", "cannot", "carry",
+    "cause", "certain", "change", "children", "city", "close", "come", "complete", "could",
+    "country", "course", "cover", "current", "dark", "days", "deep", "development",
+    "different", "direction", "does", "done", "door", "down", "draw", "during", "each",
+    "early", "earth", "east", "education", "effort", "eight", "either", "else", "end",
+    "enough", "even", "every", "example", "experience", "face", "fact", "family", "feel",
+    "field", "find", "first", "five", "follow", "food", "force", "form", "found", "four",
+    "from", "full", "gave", "general", "give", "going", "gone", "good", "government",
+    "great", "green", "ground", "group", "grow", "half", "hand", "happen", "hard", "have",
+    "head", "help", "here", "high", "himself", "hold", "home", "hope", "house", "however",
+    "hundred", "idea", "important", "inch", "include", "increase", "island", "just", "keep",
+    "kind", "knew", "know", "land", "large", "last", "later", "learn", "left", "less",
+    "letter", "life", "light", "like", "line", "list", "little", "live", "long", "look",
+    "lost", "made", "main", "make", "many", "mark", "matter", "mean", "might", "mind",
+    "miss", "money", "morning", "most", "mother", "move", "much", "music", "must", "name",
+    "near", "need", "never", "next", "night", "nothing", "notice", "number", "often",
+    "once", "only", "open", "order", "other", "over", "page", "paper", "part", "past",
+    "pattern", "people", "perhaps", "period", "person", "picture", "place", "plan", "plant",
+    "play", "point", "position", "possible", "power", "present", "problem", "produce",
+    "product", "program", "public", "pull", "purpose", "question", "quite", "reach", "read",
+    "real", "receive", "record", "remember", "rest", "result", "right", "river", "room",
+    "round", "rule", "same", "school", "second", "seem", "sentence", "service", "seven",
+    "several", "shall", "short", "should", "show", "side", "since", "sing", "size", "small",
+    "social", "some", "song", "soon", "south", "space", "stand", "start", "state", "still",
+    "stood", "story", "strong", "study", "such", "sure", "system", "table", "take", "tell",
+    "test", "their", "them", "then", "there", "these", "thing", "think", "those", "thought",
+    "three", "through", "time", "together", "took", "toward", "travel", "tree", "true",
+    "turn", "under", "unit", "until", "upon", "usually", "value", "very", "voice", "walk",
+    "want", "watch", "water", "well", "went", "were", "west", "what", "where", "which",
+    "while", "white", "whole", "will", "with", "without", "woman", "word", "work", "world",
+    "would", "write", "year", "young",
+]
+const FIRST_NAMES = ["James", "John", "Robert", "Michael", "William", "David", "Richard",
+    "Joseph", "Thomas", "Charles", "Mary", "Patricia", "Jennifer", "Linda", "Barbara",
+    "Elizabeth", "Susan", "Jessica", "Sarah", "Karen"]
+const LAST_NAMES = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller",
+    "Davis", "Rodriguez", "Martinez", "Wilson", "Anderson", "Taylor", "Thomas", "Hernandez",
+    "Moore", "Martin", "Jackson", "Thompson", "White"]
+const COUNTRIES = ["United States", "Germany", "France", "Japan", "Australia", "Brazil",
+    "Canada", "India", "China", "Mexico", "Argentina", "Spain", "Italy", "United Kingdom",
+    "Netherlands", "Sweden", "Norway", "Finland", "Denmark", "Belgium"]
+const CITIES = ["New York", "London", "Paris", "Tokyo", "Sydney", "Berlin", "Rome",
+    "Madrid", "Amsterdam", "Toronto", "Moscow", "Beijing", "Seoul", "Mumbai", "Cairo",
+    "Dublin", "Prague", "Vienna", "Warsaw", "Budapest"]
+const STREETS = ["Main", "Oak", "Elm", "Maple", "Pine", "Cedar", "Birch", "Walnut",
+    "Cherry", "Ash", "Spruce", "Willow", "Poplar", "Laurel", "Juniper"]
+const EDUCATIONS = ["High School", "College", "Graduate", "Associate", "Master", "Doctorate"]
+const GENDERS = ["male", "female"]
+const PAYMENTS = ["Creditcard", "Money order", "Personal check", "Cash"]
+const SHIPPING = ["Will ship only within country", "Will ship internationally",
+    "Buyer pays fixed shipping costs", "Free shipping", "See description for shipping"]
+const REGIONS = ["africa", "asia", "australia", "europe", "namerica", "samerica"]
+
+#-----------------------------------------------------------------# Random data helpers
+rand_word(rng) = rand(rng, WORDS)
+rand_date(rng) = string(rand(rng, 1999:2025), "/", lpad(rand(rng, 1:12), 2, '0'), "/", lpad(rand(rng, 1:28), 2, '0'))
+rand_time(rng) = string(lpad(rand(rng, 0:23), 2, '0'), ":", lpad(rand(rng, 0:59), 2, '0'), ":", lpad(rand(rng, 0:59), 2, '0'))
+rand_price(rng) = string(rand(rng, 1:9999), ".", lpad(rand(rng, 0:99), 2, '0'))
+rand_phone(rng) = string("+", rand(rng, 1:99), " (", rand(rng, 100:999), ") ", rand(rng, 1000000:9999999))
+rand_zip(rng) = string(lpad(rand(rng, 0:99999), 5, '0'))
+rand_cc(rng) = join(rand(rng, 1000:9999, 4), " ")
+rand_email(rng) = string(lowercase(rand(rng, FIRST_NAMES)), rand(rng, 1:999), "@", lowercase(rand(rng, LAST_NAMES)), ".com")
+
+function rand_id(prefix, n)
+    string(prefix, n)
+end
+
+#-----------------------------------------------------------------# XML writing helpers
+function xml_escape_char(io::IO, c::Char)
+    if c == '&';     print(io, "&amp;")
+    elseif c == '<'; print(io, "&lt;")
+    elseif c == '>'; print(io, "&gt;")
+    elseif c == '"'; print(io, "&quot;")
+    else;            print(io, c)
+    end
+end
+
+function write_escaped(io::IO, s::AbstractString)
+    for c in s
+        xml_escape_char(io, c)
+    end
+end
+
+function write_text_content(rng, io; min_words=10, max_words=50)
+    n = rand(rng, min_words:max_words)
+    for i in 1:n
+        i > 1 && print(io, ' ')
+        w = rand_word(rng)
+        r = rand(rng)
+        if r < 0.03
+            print(io, "<bold>", w, "</bold>")
+        elseif r < 0.06
+            print(io, "<emph>", w, "</emph>")
+        elseif r < 0.08
+            print(io, "<keyword>", w, "</keyword>")
+        else
+            print(io, w)
+        end
+    end
+end
+
+function write_description(rng, io, indent)
+    println(io, indent, "<description>")
+    if rand(rng) < 0.7
+        print(io, indent, "  <text>")
+        write_text_content(rng, io; min_words=15, max_words=80)
+        println(io, "</text>")
+    else
+        println(io, indent, "  <parlist>")
+        for _ in 1:rand(rng, 2:6)
+            print(io, indent, "    <listitem><text>")
+            write_text_content(rng, io; min_words=8, max_words=40)
+            println(io, "</text></listitem>")
+        end
+        println(io, indent, "  </parlist>")
+    end
+    println(io, indent, "</description>")
+end
+
+function write_annotation(rng, io, indent, n_people)
+    println(io, indent, "<annotation>")
+    println(io, indent, "  <author person=\"", rand_id("person", rand(rng, 1:n_people)), "\"/>")
+    write_description(rng, io, string(indent, "  "))
+    println(io, indent, "  <happiness>", rand(rng, 1:10), "</happiness>")
+    println(io, indent, "</annotation>")
+end
+
+#-----------------------------------------------------------------# Section writers
+function write_item(rng, io, id, n_categories)
+    featured = rand(rng) < 0.1 ? " featured=\"yes\"" : ""
+    println(io, "      <item id=\"", rand_id("item", id), "\"", featured, ">")
+    println(io, "        <location>", rand(rng, CITIES), "</location>")
+    println(io, "        <quantity>", rand(rng, 1:50), "</quantity>")
+    println(io, "        <name>", rand_word(rng), " ", rand_word(rng), " ", rand_word(rng), "</name>")
+    println(io, "        <payment>", rand(rng, PAYMENTS), "</payment>")
+    write_description(rng, io, "        ")
+    println(io, "        <shipping>", rand(rng, SHIPPING), "</shipping>")
+    for _ in 1:rand(rng, 1:3)
+        println(io, "        <incategory category=\"", rand_id("category", rand(rng, 1:n_categories)), "\"/>")
+    end
+    println(io, "        <mailbox>")
+    for _ in 1:rand(rng, 0:5)
+        println(io, "          <mail>")
+        println(io, "            <from>", rand_email(rng), "</from>")
+        println(io, "            <to>", rand_email(rng), "</to>")
+        println(io, "            <date>", rand_date(rng), "</date>")
+        print(io, "            <text>")
+        write_text_content(rng, io; min_words=10, max_words=60)
+        println(io, "</text>")
+        println(io, "          </mail>")
+    end
+    println(io, "        </mailbox>")
+    println(io, "      </item>")
+end
+
+function write_categories(rng, io, n)
+    println(io, "  <categories>")
+    for i in 1:n
+        println(io, "    <category id=\"", rand_id("category", i), "\">")
+        println(io, "      <name>", rand_word(rng), " ", rand_word(rng), "</name>")
+        write_description(rng, io, "      ")
+        println(io, "    </category>")
+    end
+    println(io, "  </categories>")
+end
+
+function write_catgraph(rng, io, n_edges, n_categories)
+    println(io, "  <catgraph>")
+    for _ in 1:n_edges
+        from = rand_id("category", rand(rng, 1:n_categories))
+        to = rand_id("category", rand(rng, 1:n_categories))
+        println(io, "    <edge from=\"", from, "\" to=\"", to, "\"/>")
+    end
+    println(io, "  </catgraph>")
+end
+
+function write_people(rng, io, n, n_categories, n_open)
+    println(io, "  <people>")
+    for i in 1:n
+        println(io, "    <person id=\"", rand_id("person", i), "\">")
+        println(io, "      <name>", rand(rng, FIRST_NAMES), " ", rand(rng, LAST_NAMES), "</name>")
+        println(io, "      <emailaddress>", rand_email(rng), "</emailaddress>")
+        if rand(rng) < 0.8
+            println(io, "      <phone>", rand_phone(rng), "</phone>")
+        end
+        if rand(rng) < 0.7
+            println(io, "      <address>")
+            println(io, "        <street>", rand(rng, 1:9999), " ", rand(rng, STREETS), " St</street>")
+            println(io, "        <city>", rand(rng, CITIES), "</city>")
+            println(io, "        <country>", rand(rng, COUNTRIES), "</country>")
+            if rand(rng) < 0.5
+                println(io, "        <province>", rand_word(rng), "</province>")
+            end
+            println(io, "        <zipcode>", rand_zip(rng), "</zipcode>")
+            println(io, "      </address>")
+        end
+        if rand(rng) < 0.5
+            println(io, "      <homepage>http://www.", lowercase(rand(rng, LAST_NAMES)), ".com/~",
+                lowercase(rand(rng, FIRST_NAMES)), "</homepage>")
+        end
+        if rand(rng) < 0.6
+            println(io, "      <creditcard>", rand_cc(rng), "</creditcard>")
+        end
+        if rand(rng) < 0.7
+            income = rand(rng) < 0.8 ? string(" income=\"", rand(rng, 10000.0:0.01:250000.0), "\"") : ""
+            println(io, "      <profile", income, ">")
+            for _ in 1:rand(rng, 0:4)
+                println(io, "        <interest category=\"", rand_id("category", rand(rng, 1:n_categories)), "\"/>")
+            end
+            if rand(rng) < 0.8
+                println(io, "        <education>", rand(rng, EDUCATIONS), "</education>")
+            end
+            if rand(rng) < 0.7
+                println(io, "        <gender>", rand(rng, GENDERS), "</gender>")
+            end
+            println(io, "        <business>", rand_word(rng), "</business>")
+            if rand(rng) < 0.8
+                println(io, "        <age>", rand(rng, 18:85), "</age>")
+            end
+            println(io, "      </profile>")
+        end
+        if n_open > 0 && rand(rng) < 0.3
+            println(io, "      <watches>")
+            for _ in 1:rand(rng, 1:5)
+                println(io, "        <watch open_auction=\"", rand_id("open_auction", rand(rng, 1:n_open)), "\"/>")
+            end
+            println(io, "      </watches>")
+        end
+        println(io, "    </person>")
+    end
+    println(io, "  </people>")
+end
+
+function write_open_auctions(rng, io, n, n_items, n_people)
+    println(io, "  <open_auctions>")
+    for i in 1:n
+        println(io, "    <open_auction id=\"", rand_id("open_auction", i), "\">")
+        println(io, "      <initial>", rand_price(rng), "</initial>")
+        if rand(rng) < 0.5
+            println(io, "      <reserve>", rand_price(rng), "</reserve>")
+        end
+        for _ in 1:rand(rng, 0:12)
+            println(io, "      <bidder>")
+            println(io, "        <date>", rand_date(rng), "</date>")
+            println(io, "        <time>", rand_time(rng), "</time>")
+            println(io, "        <personref person=\"", rand_id("person", rand(rng, 1:n_people)), "\"/>")
+            println(io, "        <increase>", rand_price(rng), "</increase>")
+            println(io, "      </bidder>")
+        end
+        println(io, "      <current>", rand_price(rng), "</current>")
+        if rand(rng) < 0.3
+            println(io, "      <privacy>", rand(rng, ["Yes", "No"]), "</privacy>")
+        end
+        println(io, "      <itemref item=\"", rand_id("item", rand(rng, 1:n_items)), "\"/>")
+        println(io, "      <seller person=\"", rand_id("person", rand(rng, 1:n_people)), "\"/>")
+        write_annotation(rng, io, "      ", n_people)
+        println(io, "      <quantity>", rand(rng, 1:10), "</quantity>")
+        println(io, "      <type>", rand(rng, ["Regular", "Featured"]), "</type>")
+        println(io, "      <interval>")
+        println(io, "        <start>", rand_date(rng), "</start>")
+        println(io, "        <end>", rand_date(rng), "</end>")
+        println(io, "      </interval>")
+        println(io, "    </open_auction>")
+    end
+    println(io, "  </open_auctions>")
+end
+
+function write_closed_auctions(rng, io, n, n_open, n_items, n_people)
+    println(io, "  <closed_auctions>")
+    for i in 1:n
+        println(io, "    <closed_auction>")
+        println(io, "      <seller person=\"", rand_id("person", rand(rng, 1:n_people)), "\"/>")
+        println(io, "      <buyer person=\"", rand_id("person", rand(rng, 1:n_people)), "\"/>")
+        # Assign items n_open+1, n_open+2, … in order; past n_items, pick a random item
+        item_id = n_open + i
+        item_id = item_id <= n_items ? item_id : rand(rng, 1:n_items)
+        println(io, "      <itemref item=\"", rand_id("item", item_id), "\"/>")
+        println(io, "      <price>", rand_price(rng), "</price>")
+        println(io, "      <date>", rand_date(rng), "</date>")
+        println(io, "      <quantity>", rand(rng, 1:10), "</quantity>")
+        println(io, "      <type>", rand(rng, ["Regular", "Featured"]), "</type>")
+        if rand(rng) < 0.7
+            write_annotation(rng, io, "      ", n_people)
+        end
+        println(io, "    </closed_auction>")
+    end
+    println(io, "  </closed_auctions>")
+end
+
+#-----------------------------------------------------------------# Main entry points
+"""
+    generate_xmark([io_or_filename], factor; seed=42)
+
+Generate an XMark-style auction XML document.  `factor` scales all entity counts linearly.
+
+Approximate output sizes (may vary slightly):
+- `factor=0.1`  → ~1.4 MB
+- `factor=1.0`  → ~14 MB
+- `factor=2.0`  → ~27 MB
+- `factor=5.0`  → ~68 MB
+"""
+function generate_xmark(io::IO, factor::Real; seed::Int=42)
+    factor > 0 || throw(ArgumentError("factor must be positive, got $factor"))
+    rng = Xoshiro(seed)
+
+    n_per_region = max(1, round(Int, 500  * factor))
+    n_people     = max(1, round(Int, 5000 * factor))
+    n_categories = max(1, round(Int, 200  * factor))
+    n_open       = max(1, round(Int, 2000 * factor))
+    n_closed     = max(1, round(Int, 1500 * factor))
+    n_edges      = max(1, round(Int, 1000 * factor))
+    n_items      = n_per_region * 6
+
+    # Clamp auctions to available items
+    n_open   = min(n_open, n_items)
+    n_closed = min(n_closed, max(1, n_items - n_open))
+
+    println(io, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
+    println(io, "<site>")
+
+    # Regions with items
+    println(io, "  <regions>")
+    item_id = 0
+    for region in REGIONS
+        println(io, "    <", region, ">")
+        for _ in 1:n_per_region
+            item_id += 1
+            write_item(rng, io, item_id, n_categories)
+        end
+        println(io, "    </", region, ">")
+    end
+    println(io, "  </regions>")
+
+    write_categories(rng, io, n_categories)
+    write_catgraph(rng, io, n_edges, n_categories)
+    write_people(rng, io, n_people, n_categories, n_open)
+    write_open_auctions(rng, io, n_open, n_items, n_people)
+    write_closed_auctions(rng, io, n_closed, n_open, n_items, n_people)
+
+    println(io, "</site>")
+    nothing
+end
+
+function generate_xmark(filename::AbstractString, factor::Real; seed::Int=42)
+    open(filename, "w") do io
+        generate_xmark(io, factor; seed)
+    end
+    filename
+end
+
+function generate_xmark(factor::Real; seed::Int=42)
+    io = IOBuffer()
+    generate_xmark(io, factor; seed)
+    String(take!(io))
+end
+
+end # module
diff --git a/benchmarks/benchmarks.jl b/benchmarks/benchmarks.jl
index aa558b9..7aecc76 100644
--- a/benchmarks/benchmarks.jl
+++ b/benchmarks/benchmarks.jl
@@ -7,6 +7,9 @@ using BenchmarkTools
 using DataFrames
 using UnicodePlots
 
+include("XMarkGenerator.jl")
+using .XMarkGenerator
+
 BenchmarkTools.DEFAULT_PARAMETERS.seconds = 10
 BenchmarkTools.DEFAULT_PARAMETERS.samples = 20000
 
@@ -15,6 +18,15 @@ BenchmarkTools.DEFAULT_PARAMETERS.samples = 20000
 small_file = joinpath(@__DIR__, "..", "test", "data", "books.xml")
 small_xml = read(small_file, String)
 
+# Medium file (generated XMark auction XML, ~14 MB)
+medium_file = joinpath(@__DIR__, "data", "xmark.xml")
+if !isfile(medium_file)
+    mkpath(dirname(medium_file))
+    @info "Generating XMark benchmark XML..."
+    generate_xmark(medium_file, 1.0)
+end
+medium_xml = read(medium_file, String)
+
 df = DataFrame(kind=String[], name=String[], bench=BenchmarkTools.Trial[])
 
 macro add_benchmark(kind, name, expr...)
@@ -31,11 +43,27 @@ end
 @add_benchmark "Parse (small)" "LightXML" LightXML.parse_string($small_xml)
 @add_benchmark "Parse (small)" "XMLDict" XMLDict.xml_dict($small_xml)
 
+#-----------------------------------------------------------------------------# Parse (medium)
+@add_benchmark "Parse (medium)" "XML.jl" parse($medium_xml, Node)
+@add_benchmark "Parse (medium)" "EzXML" EzXML.parsexml($medium_xml)
+@add_benchmark "Parse (medium)" "LightXML" LightXML.parse_string($medium_xml)
+@add_benchmark "Parse (medium)" "XMLDict" XMLDict.xml_dict($medium_xml)
+
 #-----------------------------------------------------------------------------# Write (small)
 @add_benchmark "Write (small)" "XML.jl" XML.write(o) setup=(o = parse(small_xml, Node))
 @add_benchmark "Write (small)" "EzXML" sprint(print, o) setup=(o = EzXML.parsexml(small_xml))
 @add_benchmark "Write (small)" "LightXML" LightXML.save_file(o, f) setup=(o = LightXML.parse_string(small_xml); f = tempname()) teardown=(LightXML.free(o); rm(f, force=true))
 
+#-----------------------------------------------------------------------------# Write (medium)
+@add_benchmark "Write (medium)" "XML.jl" XML.write(o) setup=(o = parse(medium_xml, Node))
+@add_benchmark "Write (medium)" "EzXML" sprint(print, o) setup=(o = EzXML.parsexml(medium_xml))
+@add_benchmark "Write (medium)" "LightXML" LightXML.save_file(o, f) setup=(o = LightXML.parse_string(medium_xml); f = tempname()) teardown=(LightXML.free(o); rm(f, force=true))
+
+#-----------------------------------------------------------------------------# Read from file
+@add_benchmark "Read file" "XML.jl" read($medium_file, Node)
+@add_benchmark "Read file" "EzXML" EzXML.readxml($medium_file)
+@add_benchmark "Read file" "LightXML" LightXML.parse_file($medium_file)
+
 #-----------------------------------------------------------------------------# Collect element tags
 function xml_collect_tags(node)
     out = String[]
@@ -75,9 +103,13 @@ function _lightxml_collect_tags!(out, el::LightXML.XMLElement)
     end
 end
 
-@add_benchmark "Collect tags" "XML.jl" xml_collect_tags(o) setup=(o = parse(small_xml, Node))
-@add_benchmark "Collect tags" "EzXML" ezxml_collect_tags(o.root) setup=(o = EzXML.parsexml(small_xml))
-@add_benchmark "Collect tags" "LightXML" lightxml_collect_tags(LightXML.root(o)) setup=(o = LightXML.parse_string(small_xml)) teardown=(LightXML.free(o))
+@add_benchmark "Collect tags (small)" "XML.jl" xml_collect_tags(o) setup=(o = parse(small_xml, Node))
+@add_benchmark "Collect tags (small)" "EzXML" ezxml_collect_tags(o.root) setup=(o = EzXML.parsexml(small_xml))
+@add_benchmark "Collect tags (small)" "LightXML" lightxml_collect_tags(LightXML.root(o)) setup=(o = LightXML.parse_string(small_xml)) teardown=(LightXML.free(o))
+
+@add_benchmark "Collect tags (medium)" "XML.jl" xml_collect_tags(o) setup=(o = parse(medium_xml, Node))
+@add_benchmark "Collect tags (medium)" "EzXML" ezxml_collect_tags(o.root) setup=(o = EzXML.parsexml(medium_xml))
+@add_benchmark "Collect tags (medium)" "LightXML" lightxml_collect_tags(LightXML.root(o)) setup=(o = LightXML.parse_string(medium_xml)) teardown=(LightXML.free(o))
 
 #-----------------------------------------------------------------------------# Results
 function plot_group(df, kind)

From 33bcf35a40cc490ac4f5ea4f932e9292d2bde0f8 Mon Sep 17 00:00:00 2001
From: Josh Day <emailjoshday@gmail.com>
Date: Thu, 5 Mar 2026 20:56:49 -0500
Subject: [PATCH 08/18] Add LazyNode type and StringViews extension

---
 Project.toml             |  12 +-
 ext/XMLStringViewsExt.jl |  26 ++++
 src/XML.jl               |  12 +-
 src/lazynode.jl          | 258 +++++++++++++++++++++++++++++++++++++++
 src/tokenizer.jl         |  19 +--
 5 files changed, 315 insertions(+), 12 deletions(-)
 create mode 100644 ext/XMLStringViewsExt.jl
 create mode 100644 src/lazynode.jl

diff --git a/Project.toml b/Project.toml
index 3a894e8..70e0bfb 100644
--- a/Project.toml
+++ b/Project.toml
@@ -3,5 +3,15 @@ uuid = "72c71f33-b9b6-44de-8c94-c961784809e2"
 version = "0.3.8"
 authors = ["Josh Day <emailjoshday@gmail.com> and contributors"]
 
+[deps]
+Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
+
+[weakdeps]
+StringViews = "354b36f9-a18e-4713-926e-db85100087ba"
+
+[extensions]
+XMLStringViewsExt = "StringViews"
+
 [compat]
-julia = "1.6"
+StringViews = "1"
+julia = "1.9"
diff --git a/ext/XMLStringViewsExt.jl b/ext/XMLStringViewsExt.jl
new file mode 100644
index 0000000..69d745e
--- /dev/null
+++ b/ext/XMLStringViewsExt.jl
@@ -0,0 +1,26 @@
+module XMLStringViewsExt
+
+using XML
+using StringViews: StringView
+using Mmap: Mmap
+
+"""
+    XML.mmap(filename, LazyNode) -> LazyNode
+
+Memory-map `filename` and return a `LazyNode` backed by a `StringView` over the mapped bytes.
+The file contents are not copied into Julia heap memory, making this suitable for very large
+XML files.
+
+Requires `using StringViews` to activate this method.
+"""
+function XML.mmap(filename::AbstractString, ::Type{XML.LazyNode})
+    bytes = open(filename) do io
+        Mmap.mmap(io)
+    end
+    sv = StringView(bytes)
+    XML.LazyNode(sv, 1, XML.Document)
+end
+
+Base.parse(xml::StringView, ::Type{XML.LazyNode}) = XML.LazyNode(xml, 1, XML.Document)
+
+end # module
diff --git a/src/XML.jl b/src/XML.jl
index 609fd96..16c0831 100644
--- a/src/XML.jl
+++ b/src/XML.jl
@@ -4,7 +4,7 @@ include("tokenizer.jl")
 using .XMLTokenizer
 
 export
-    Node, NodeType,
+    Node, LazyNode, NodeType,
     CData, Comment, Declaration, Document, DTD, Element, ProcessingInstruction, Text,
     nodetype, tag, attributes, value, children,
     is_simple, simple_value,
@@ -215,6 +215,15 @@ function siblings(child::Node, root::Node)
 end
 
 include("xpath.jl")
+include("lazynode.jl")
+
+"""
+    XML.mmap(filename, LazyNode) -> LazyNode
+
+Memory-map `filename` and return a `LazyNode` backed by a `StringView`.
+Requires `using StringViews` to activate.
+"""
+function mmap end
 
 #-----------------------------------------------------------------------------# _to_node
 _to_node(n::Node{String}) = n
@@ -909,7 +918,6 @@ end
 
 #-----------------------------------------------------------------------------# deprecations
 Base.@deprecate_binding simplevalue simple_value false
-Base.@deprecate_binding LazyNode Node false
 
 # Removed types — informative errors
 struct Raw
diff --git a/src/lazynode.jl b/src/lazynode.jl
new file mode 100644
index 0000000..d68a176
--- /dev/null
+++ b/src/lazynode.jl
@@ -0,0 +1,258 @@
+#-----------------------------------------------------------------------------# LazyNode
+"""
+    LazyNode
+
+A lightweight, read-only view into an XML document that navigates the token stream on demand
+instead of building a full tree in memory.
+
+    doc = parse(xml_string, LazyNode)
+    doc = read("file.xml", LazyNode)
+
+Supports the same read-only interface as `Node`: [`nodetype`](@ref), [`tag`](@ref),
+[`attributes`](@ref), [`value`](@ref), [`children`](@ref), plus integer and string indexing.
+"""
+struct LazyNode{S <: AbstractString}
+    data::S
+    pos::Int
+    nodetype::NodeType
+end
+
+nodetype(n::LazyNode) = n.nodetype
+
+_lazy_tokenizer(n::LazyNode) = tokenize(n.data, n.pos)
+
+#-----------------------------------------------------------------------------# tag / value
+# Name of an Element or target of a ProcessingInstruction; `nothing` for other node types.
+function tag(n::LazyNode)
+    nt = n.nodetype
+    nt === Element || nt === ProcessingInstruction || return nothing
+    # Only the first token at the node's position is needed: it carries the name/target.
+    head = first(_lazy_tokenizer(n))
+    extract = nt === Element ? tag_name : pi_target
+    return String(extract(head))
+end
+
+# Content of a LazyNode: unescaped text for Text, the raw body for Comment/CData, the
+# left-stripped body for DTD, the stripped body (or `nothing` if blank) for
+# ProcessingInstruction, and `nothing` for every other node type.
+function value(n::LazyNode)
+    nt = n.nodetype
+    nt === Text && return unescape(first(_lazy_tokenizer(n)).raw)
+    # Token kind that carries the body of this node type (none => node has no value).
+    kind = nt === Comment ? TOKEN_COMMENT_CONTENT :
+           nt === CData ? TOKEN_CDATA_CONTENT :
+           nt === DTD ? TOKEN_DOCTYPE_CONTENT :
+           nt === ProcessingInstruction ? TOKEN_PI_CONTENT : nothing
+    kind === nothing && return nothing
+    iter = _lazy_tokenizer(n)
+    iterate(iter)  # skip COMMENT_OPEN / CDATA_OPEN / DOCTYPE_OPEN / PI_OPEN
+    next = iterate(iter)
+    # Only trust the second token if it really is the content token: for an empty body or
+    # truncated input it could be the closing delimiter or `nothing` (the PI branch already
+    # guarded against this; the other node types did not).
+    raw = (next !== nothing && next[1].kind === kind) ? next[1].raw : nothing
+    if nt === ProcessingInstruction
+        raw === nothing && return nothing
+        content = strip(raw)
+        return isempty(content) ? nothing : String(content)
+    end
+    raw === nothing && return ""
+    return nt === DTD ? String(lstrip(raw)) : String(raw)
+end
+
+#-----------------------------------------------------------------------------# attributes
+function attributes(n::LazyNode)
+    n.nodetype in (Element, Declaration) || return nothing
+    iter = _lazy_tokenizer(n)
+    iterate(iter)  # skip OPEN_TAG or XML_DECL_OPEN
+    attrs = Pair{String,String}[]
+    for tok in iter
+        tok.kind === TOKEN_ATTR_NAME || break
+        name = String(tok.raw)
+        result = iterate(iter)
+        result === nothing && break
+        push!(attrs, name => unescape(attr_value(result[1])))
+    end
+    isempty(attrs) ? nothing : Dict(attrs)
+end
+
+function Base.get(n::LazyNode, key::AbstractString, default)
+    n.nodetype in (Element, Declaration) || return default
+    iter = _lazy_tokenizer(n)
+    iterate(iter)  # skip OPEN_TAG or XML_DECL_OPEN
+    for tok in iter
+        tok.kind === TOKEN_ATTR_NAME || return default
+        if tok.raw == key
+            result = iterate(iter)
+            result === nothing && return default
+            return unescape(attr_value(result[1]))
+        else
+            iterate(iter)  # skip value
+        end
+    end
+    default
+end
+
+function Base.getindex(n::LazyNode, key::AbstractString)
+    val = get(n, key, _MISSING_ATTR)
+    val === _MISSING_ATTR && throw(KeyError(key))
+    val
+end
+
+function Base.haskey(n::LazyNode, key::AbstractString)
+    get(n, key, _MISSING_ATTR) !== _MISSING_ATTR
+end
+
+function Base.keys(n::LazyNode)
+    n.nodetype in (Element, Declaration) || return ()
+    iter = _lazy_tokenizer(n)
+    iterate(iter)
+    result = String[]
+    for tok in iter
+        tok.kind === TOKEN_ATTR_NAME || break
+        push!(result, String(tok.raw))
+        iterate(iter)  # skip value
+    end
+    result
+end
+
+#-----------------------------------------------------------------------------# children
+function children(n::LazyNode{S}) where {S}
+    nt = n.nodetype
+    if nt === Document
+        return _lazy_collect_children(n.data, _lazy_tokenizer(n))
+    elseif nt !== Element
+        return ()
+    end
+    iter = _lazy_tokenizer(n)
+    for tok in iter
+        tok.kind === TOKEN_SELF_CLOSE && return LazyNode{S}[]
+        tok.kind === TOKEN_TAG_CLOSE && break
+    end
+    _lazy_collect_children(n.data, iter)
+end
+
+function _lazy_collect_children(data::S, iter) where {S <: AbstractString}
+    result = LazyNode{S}[]
+    for tok in iter
+        k = tok.kind
+        pos = tok.raw.offset + 1
+        if k === TOKEN_TEXT
+            push!(result, LazyNode(data, pos, Text))
+        elseif k === TOKEN_OPEN_TAG
+            push!(result, LazyNode(data, pos, Element))
+            _lazy_skip_element!(iter)
+        elseif k === TOKEN_COMMENT_OPEN
+            push!(result, LazyNode(data, pos, Comment))
+            _lazy_skip_until!(iter, TOKEN_COMMENT_CLOSE)
+        elseif k === TOKEN_CDATA_OPEN
+            push!(result, LazyNode(data, pos, CData))
+            _lazy_skip_until!(iter, TOKEN_CDATA_CLOSE)
+        elseif k === TOKEN_PI_OPEN
+            push!(result, LazyNode(data, pos, ProcessingInstruction))
+            _lazy_skip_until!(iter, TOKEN_PI_CLOSE)
+        elseif k === TOKEN_XML_DECL_OPEN
+            push!(result, LazyNode(data, pos, Declaration))
+            _lazy_skip_until!(iter, TOKEN_XML_DECL_CLOSE)
+        elseif k === TOKEN_DOCTYPE_OPEN
+            push!(result, LazyNode(data, pos, DTD))
+            _lazy_skip_until!(iter, TOKEN_DOCTYPE_CLOSE)
+        elseif k === TOKEN_CLOSE_TAG
+            break
+        end
+    end
+    result
+end
+
+# Skip the rest of an element whose TOKEN_OPEN_TAG has just been consumed from `iter`,
+# nested elements included. `unclosed` counts elements opened but not yet closed.
+function _lazy_skip_element!(iter)
+    unclosed = 1
+    for tok in iter
+        kind = tok.kind
+        if kind === TOKEN_OPEN_TAG
+            unclosed += 1
+            continue
+        end
+        (kind === TOKEN_SELF_CLOSE || kind === TOKEN_CLOSE_TAG) || continue
+        unclosed -= 1
+        unclosed == 0 || continue
+        # A `</name` token is followed by its own `>` token; consume that one too.
+        kind === TOKEN_CLOSE_TAG && iterate(iter)
+        return
+    end
+end
+
+# Consume `iter` up to and including the first token of kind `target` (or to exhaustion).
+function _lazy_skip_until!(iter, target::TokenKind)
+    any(tok -> tok.kind === target, iter)  # short-circuits right after the match
+    return nothing
+end
+
+#-----------------------------------------------------------------------------# is_simple / simple_value
+function is_simple(n::LazyNode)
+    n.nodetype === Element || return false
+    attrs = attributes(n)
+    (!isnothing(attrs) && !isempty(attrs)) && return false
+    ch = children(n)
+    length(ch) == 1 && ch[1].nodetype in (Text, CData)
+end
+
+simple_value(n::LazyNode) = is_simple(n) ? value(children(n)[1]) :
+    error("`simple_value` is only defined for simple nodes.")
+
+#-----------------------------------------------------------------------------# indexing
+Base.getindex(n::LazyNode, i::Integer) = children(n)[i]
+Base.getindex(n::LazyNode, ::Colon) = children(n)
+Base.lastindex(n::LazyNode) = lastindex(children(n))
+Base.only(n::LazyNode) = only(children(n))
+Base.length(n::LazyNode) = length(children(n))
+
+#-----------------------------------------------------------------------------# parse / read
+Base.parse(::Type{LazyNode}, xml::AbstractString) = parse(xml, LazyNode)
+Base.parse(xml::AbstractString, ::Type{LazyNode}) = LazyNode(String(xml), 1, Document)
+
+Base.read(filename::AbstractString, ::Type{LazyNode}) = parse(read(filename, String), LazyNode)
+Base.read(io::IO, ::Type{LazyNode}) = parse(read(io, String), LazyNode)
+
+#-----------------------------------------------------------------------------# show
+function Base.show(io::IO, n::LazyNode)
+    nt = n.nodetype
+    printstyled(io, "Lazy "; color=:light_black)
+    printstyled(io, nt; color=:light_green)
+    if nt === Text
+        printstyled(io, ' ', repr(value(n)))
+    elseif nt === Element
+        printstyled(io, " <", tag(n); color=:light_cyan)
+        for k in keys(n)
+            print(io, ' ', k, '=', '"', get(n, k, ""), '"')
+        end
+        printstyled(io, '>'; color=:light_cyan)
+    elseif nt === DTD
+        printstyled(io, " <!DOCTYPE "; color=:light_cyan)
+        printstyled(io, value(n); color=:light_black)
+        printstyled(io, '>'; color=:light_cyan)
+    elseif nt === Declaration
+        printstyled(io, " <?xml"; color=:light_cyan)
+        for k in keys(n)
+            print(io, ' ', k, '=', '"', get(n, k, ""), '"')
+        end
+        printstyled(io, "?>"; color=:light_cyan)
+    elseif nt === ProcessingInstruction
+        printstyled(io, " <?", tag(n); color=:light_cyan)
+        v = value(n)
+        !isnothing(v) && print(io, ' ', v)
+        printstyled(io, "?>"; color=:light_cyan)
+    elseif nt === Comment
+        printstyled(io, " <!--"; color=:light_cyan)
+        printstyled(io, value(n); color=:light_black)
+        printstyled(io, "-->"; color=:light_cyan)
+    elseif nt === CData
+        printstyled(io, " <![CDATA["; color=:light_cyan)
+        printstyled(io, value(n); color=:light_black)
+        printstyled(io, "]]>"; color=:light_cyan)
+    elseif nt === Document
+        n_ch = length(children(n))
+        n_ch > 0 && printstyled(io, n_ch == 1 ? " (1 child)" : " ($n_ch children)"; color=:light_black)
+    end
+end
diff --git a/src/tokenizer.jl b/src/tokenizer.jl
index 355036d..be8598f 100644
--- a/src/tokenizer.jl
+++ b/src/tokenizer.jl
@@ -65,9 +65,9 @@ export tokenize, tag_name, attr_value, pi_target, TokenKind, Token,
 end
 
 #-----------------------------------------------------------------------# Token
-struct Token
+struct Token{S <: AbstractString}
     kind::TokenKind
-    raw::SubString{String}
+    raw::SubString{S}
 end
 
 function Base.show(io::IO, t::Token)
@@ -89,22 +89,23 @@ end
 end
 
 #-----------------------------------------------------------------------# Tokenizer
-mutable struct Tokenizer
-    const data::String
+mutable struct Tokenizer{S <: AbstractString}
+    const data::S
     pos::Int
     state::_State
-    pending::Union{Token,Nothing}
+    pending::Union{Token{S},Nothing}
 end
 
 """
-    tokenize(xml::String) -> Tokenizer
+    tokenize(xml::AbstractString) -> Tokenizer
 
 Return a lazy iterator of `Token`s over the XML string `xml`.
 """
-tokenize(xml::String) = Tokenizer(xml, 1, _S_DEFAULT, nothing)
+tokenize(xml::AbstractString) = Tokenizer(xml, 1, _S_DEFAULT, nothing)
+tokenize(xml::AbstractString, pos::Int) = Tokenizer(xml, pos, _S_DEFAULT, nothing)
 
-Base.IteratorSize(::Type{Tokenizer}) = Base.SizeUnknown()
-Base.eltype(::Type{Tokenizer}) = Token
+Base.IteratorSize(::Type{<:Tokenizer}) = Base.SizeUnknown()
+Base.eltype(::Type{Tokenizer{S}}) where {S} = Token{S}
 
 function Base.iterate(t::Tokenizer, _=nothing)
     tok = _next_token!(t)

From d011424b5c23494f9d038accc77396ff59de4ab0 Mon Sep 17 00:00:00 2001
From: Josh Day <emailjoshday@gmail.com>
Date: Thu, 5 Mar 2026 21:58:06 -0500
Subject: [PATCH 09/18] Refactor simple_value checks and use direct attrs
 iteration

---
 benchmarks/XMarkGenerator.jl | 36 ++++++++++++++++--------------------
 src/lazynode.jl              | 24 ++++++++++++++++++------
 2 files changed, 34 insertions(+), 26 deletions(-)

diff --git a/benchmarks/XMarkGenerator.jl b/benchmarks/XMarkGenerator.jl
index f0b497d..7f780a0 100644
--- a/benchmarks/XMarkGenerator.jl
+++ b/benchmarks/XMarkGenerator.jl
@@ -85,10 +85,6 @@ rand_zip(rng) = string(lpad(rand(rng, 0:99999), 5, '0'))
 rand_cc(rng) = join(rand(rng, 1000:9999, 4), " ")
 rand_email(rng) = string(lowercase(rand(rng, FIRST_NAMES)), rand(rng, 1:999), "@", lowercase(rand(rng, LAST_NAMES)), ".com")
 
-function rand_id(prefix, n)
-    string(prefix, n)
-end
-
 #-----------------------------------------------------------------# XML writing helpers
 function xml_escape_char(io::IO, c::Char)
     if c == '&';     print(io, "&amp;")
@@ -143,7 +139,7 @@ end
 
 function write_annotation(rng, io, indent, n_people)
     println(io, indent, "<annotation>")
-    println(io, indent, "  <author person=\"", rand_id("person", rand(rng, 1:n_people)), "\"/>")
+    println(io, indent, "  <author person=\"", string("person",rand(rng, 1:n_people)), "\"/>")
     write_description(rng, io, string(indent, "  "))
     println(io, indent, "  <happiness>", rand(rng, 1:10), "</happiness>")
     println(io, indent, "</annotation>")
@@ -152,7 +148,7 @@ end
 #-----------------------------------------------------------------# Section writers
 function write_item(rng, io, id, n_categories)
     featured = rand(rng) < 0.1 ? " featured=\"yes\"" : ""
-    println(io, "      <item id=\"", rand_id("item", id), "\"", featured, ">")
+    println(io, "      <item id=\"", string("item",id), "\"", featured, ">")
     println(io, "        <location>", rand(rng, CITIES), "</location>")
     println(io, "        <quantity>", rand(rng, 1:50), "</quantity>")
     println(io, "        <name>", rand_word(rng), " ", rand_word(rng), " ", rand_word(rng), "</name>")
@@ -160,7 +156,7 @@ function write_item(rng, io, id, n_categories)
     write_description(rng, io, "        ")
     println(io, "        <shipping>", rand(rng, SHIPPING), "</shipping>")
     for _ in 1:rand(rng, 1:3)
-        println(io, "        <incategory category=\"", rand_id("category", rand(rng, 1:n_categories)), "\"/>")
+        println(io, "        <incategory category=\"", string("category",rand(rng, 1:n_categories)), "\"/>")
     end
     println(io, "        <mailbox>")
     for _ in 1:rand(rng, 0:5)
@@ -180,7 +176,7 @@ end
 function write_categories(rng, io, n)
     println(io, "  <categories>")
     for i in 1:n
-        println(io, "    <category id=\"", rand_id("category", i), "\">")
+        println(io, "    <category id=\"", string("category",i), "\">")
         println(io, "      <name>", rand_word(rng), " ", rand_word(rng), "</name>")
         write_description(rng, io, "      ")
         println(io, "    </category>")
@@ -191,8 +187,8 @@ end
 function write_catgraph(rng, io, n_edges, n_categories)
     println(io, "  <catgraph>")
     for _ in 1:n_edges
-        from = rand_id("category", rand(rng, 1:n_categories))
-        to = rand_id("category", rand(rng, 1:n_categories))
+        from = string("category",rand(rng, 1:n_categories))
+        to = string("category",rand(rng, 1:n_categories))
         println(io, "    <edge from=\"", from, "\" to=\"", to, "\"/>")
     end
     println(io, "  </catgraph>")
@@ -201,7 +197,7 @@ end
 function write_people(rng, io, n, n_categories, n_open)
     println(io, "  <people>")
     for i in 1:n
-        println(io, "    <person id=\"", rand_id("person", i), "\">")
+        println(io, "    <person id=\"", string("person",i), "\">")
         println(io, "      <name>", rand(rng, FIRST_NAMES), " ", rand(rng, LAST_NAMES), "</name>")
         println(io, "      <emailaddress>", rand_email(rng), "</emailaddress>")
         if rand(rng) < 0.8
@@ -229,7 +225,7 @@ function write_people(rng, io, n, n_categories, n_open)
             income = rand(rng) < 0.8 ? string(" income=\"", rand(rng, 10000.0:0.01:250000.0), "\"") : ""
             println(io, "      <profile", income, ">")
             for _ in 1:rand(rng, 0:4)
-                println(io, "        <interest category=\"", rand_id("category", rand(rng, 1:n_categories)), "\"/>")
+                println(io, "        <interest category=\"", string("category",rand(rng, 1:n_categories)), "\"/>")
             end
             if rand(rng) < 0.8
                 println(io, "        <education>", rand(rng, EDUCATIONS), "</education>")
@@ -246,7 +242,7 @@ function write_people(rng, io, n, n_categories, n_open)
         if n_open > 0 && rand(rng) < 0.3
             println(io, "      <watches>")
             for _ in 1:rand(rng, 1:5)
-                println(io, "        <watch open_auction=\"", rand_id("open_auction", rand(rng, 1:n_open)), "\"/>")
+                println(io, "        <watch open_auction=\"", string("open_auction",rand(rng, 1:n_open)), "\"/>")
             end
             println(io, "      </watches>")
         end
@@ -258,7 +254,7 @@ end
 function write_open_auctions(rng, io, n, n_items, n_people)
     println(io, "  <open_auctions>")
     for i in 1:n
-        println(io, "    <open_auction id=\"", rand_id("open_auction", i), "\">")
+        println(io, "    <open_auction id=\"", string("open_auction",i), "\">")
         println(io, "      <initial>", rand_price(rng), "</initial>")
         if rand(rng) < 0.5
             println(io, "      <reserve>", rand_price(rng), "</reserve>")
@@ -267,7 +263,7 @@ function write_open_auctions(rng, io, n, n_items, n_people)
             println(io, "      <bidder>")
             println(io, "        <date>", rand_date(rng), "</date>")
             println(io, "        <time>", rand_time(rng), "</time>")
-            println(io, "        <personref person=\"", rand_id("person", rand(rng, 1:n_people)), "\"/>")
+            println(io, "        <personref person=\"", string("person",rand(rng, 1:n_people)), "\"/>")
             println(io, "        <increase>", rand_price(rng), "</increase>")
             println(io, "      </bidder>")
         end
@@ -275,8 +271,8 @@ function write_open_auctions(rng, io, n, n_items, n_people)
         if rand(rng) < 0.3
             println(io, "      <privacy>", rand(rng, ["Yes", "No"]), "</privacy>")
         end
-        println(io, "      <itemref item=\"", rand_id("item", rand(rng, 1:n_items)), "\"/>")
-        println(io, "      <seller person=\"", rand_id("person", rand(rng, 1:n_people)), "\"/>")
+        println(io, "      <itemref item=\"", string("item",rand(rng, 1:n_items)), "\"/>")
+        println(io, "      <seller person=\"", string("person",rand(rng, 1:n_people)), "\"/>")
         write_annotation(rng, io, "      ", n_people)
         println(io, "      <quantity>", rand(rng, 1:10), "</quantity>")
         println(io, "      <type>", rand(rng, ["Regular", "Featured"]), "</type>")
@@ -293,12 +289,12 @@ function write_closed_auctions(rng, io, n, n_open, n_items, n_people)
     println(io, "  <closed_auctions>")
     for i in 1:n
         println(io, "    <closed_auction>")
-        println(io, "      <seller person=\"", rand_id("person", rand(rng, 1:n_people)), "\"/>")
-        println(io, "      <buyer person=\"", rand_id("person", rand(rng, 1:n_people)), "\"/>")
+        println(io, "      <seller person=\"", string("person",rand(rng, 1:n_people)), "\"/>")
+        println(io, "      <buyer person=\"", string("person",rand(rng, 1:n_people)), "\"/>")
         # Use item IDs that don't overlap with open auctions
         item_id = n_open + i
         item_id = item_id <= n_items ? item_id : rand(rng, 1:n_items)
-        println(io, "      <itemref item=\"", rand_id("item", item_id), "\"/>")
+        println(io, "      <itemref item=\"", string("item",item_id), "\"/>")
         println(io, "      <price>", rand_price(rng), "</price>")
         println(io, "      <date>", rand_date(rng), "</date>")
         println(io, "      <quantity>", rand(rng, 1:10), "</quantity>")
diff --git a/src/lazynode.jl b/src/lazynode.jl
index d68a176..1eeb1a9 100644
--- a/src/lazynode.jl
+++ b/src/lazynode.jl
@@ -198,8 +198,14 @@ function is_simple(n::LazyNode)
     length(ch) == 1 && ch[1].nodetype in (Text, CData)
 end
 
-simple_value(n::LazyNode) = is_simple(n) ? value(children(n)[1]) :
-    error("`simple_value` is only defined for simple nodes.")
+function simple_value(n::LazyNode)
+    n.nodetype === Element || error("`simple_value` is only defined for simple nodes.")
+    attrs = attributes(n)
+    (!isnothing(attrs) && !isempty(attrs)) && error("`simple_value` is only defined for simple nodes.")
+    ch = children(n)
+    length(ch) == 1 && ch[1].nodetype in (Text, CData) || error("`simple_value` is only defined for simple nodes.")
+    value(ch[1])
+end
 
 #-----------------------------------------------------------------------------# indexing
 Base.getindex(n::LazyNode, i::Integer) = children(n)[i]
@@ -224,8 +230,11 @@ function Base.show(io::IO, n::LazyNode)
         printstyled(io, ' ', repr(value(n)))
     elseif nt === Element
         printstyled(io, " <", tag(n); color=:light_cyan)
-        for k in keys(n)
-            print(io, ' ', k, '=', '"', get(n, k, ""), '"')
+        attrs = attributes(n)
+        if !isnothing(attrs)
+            for (k, v) in attrs
+                print(io, ' ', k, '=', '"', v, '"')
+            end
         end
         printstyled(io, '>'; color=:light_cyan)
     elseif nt === DTD
@@ -234,8 +243,11 @@ function Base.show(io::IO, n::LazyNode)
         printstyled(io, '>'; color=:light_cyan)
     elseif nt === Declaration
         printstyled(io, " <?xml"; color=:light_cyan)
-        for k in keys(n)
-            print(io, ' ', k, '=', '"', get(n, k, ""), '"')
+        attrs = attributes(n)
+        if !isnothing(attrs)
+            for (k, v) in attrs
+                print(io, ' ', k, '=', '"', v, '"')
+            end
         end
         printstyled(io, "?>"; color=:light_cyan)
     elseif nt === ProcessingInstruction

From 754f8fab7ca43187d7e61d8850ebb6f213704faa Mon Sep 17 00:00:00 2001
From: Josh Day <emailjoshday@gmail.com>
Date: Fri, 6 Mar 2026 12:11:15 -0500
Subject: [PATCH 10/18] Refactor tokenizer into XMLTokenizer and add LazyNode

---
 README.md                |  98 +++++---
 benchmarks/benchmarks.jl |   4 +
 ext/XMLStringViewsExt.jl |   4 +-
 src/XML.jl               |  48 +++-
 src/XMLTokenizer.jl      | 496 +++++++++++++++++++++++++++++++++++++++
 src/lazynode.jl          |  31 +--
 src/tokenizer.jl         | 481 -------------------------------------
 7 files changed, 632 insertions(+), 530 deletions(-)
 create mode 100644 src/XMLTokenizer.jl
 delete mode 100644 src/tokenizer.jl

diff --git a/README.md b/README.md
index be72afc..f59e00d 100644
--- a/README.md
+++ b/README.md
@@ -176,60 +176,104 @@ For large files or when you need fine-grained control, `XML.XMLTokenizer` provid
 using XML.XMLTokenizer
 
 for token in tokenize("<root><child attr=\"val\">text</child></root>")
-    println(token.kind, " => ", repr(token.raw))
+    println(token.kind, " => ", repr(String(token.raw)))
 end
-# TOKEN_OPEN_TAG_START => "<root"
-# TOKEN_CLOSE_TAG => ">"
-# TOKEN_OPEN_TAG_START => "<child"
+# TOKEN_OPEN_TAG => "<root"
+# TOKEN_TAG_CLOSE => ">"
+# TOKEN_OPEN_TAG => "<child"
 # TOKEN_ATTR_NAME => "attr"
 # TOKEN_ATTR_VALUE => "\"val\""
-# TOKEN_CLOSE_TAG => ">"
+# TOKEN_TAG_CLOSE => ">"
 # TOKEN_TEXT => "text"
-# TOKEN_END_TAG => "</child>"
-# TOKEN_END_TAG => "</root>"
+# TOKEN_CLOSE_TAG => "</child"
+# TOKEN_TAG_CLOSE => ">"
+# TOKEN_CLOSE_TAG => "</root"
+# TOKEN_TAG_CLOSE => ">"
 ```
 
 <br>
 
-# Escaping
+# `LazyNode`
 
-XML.jl doesn't automatically escape special characters (`<`, `>`, `&`, `"`, `'`) for you. Use the provided utility functions:
+For read-only access without building a full DOM tree, use `LazyNode`. It stores only a reference to the source string and re-tokenizes on demand, using significantly less memory:
 
-- `XML.escape(::String)` / `XML.unescape(::String)` -- transform strings.
-- `XML.escape!(::Node)` / `XML.unescape!(::Node)` -- transform an entire node tree in-place.
+```julia
+doc = parse(xml_string, LazyNode)
+doc = read("file.xml", LazyNode)
+```
+
+`LazyNode` supports the same read-only interface as `Node`: `nodetype`, `tag`, `attributes`, `value`, `children`, `is_simple`, `simple_value`, plus integer and string indexing.
+
+### Memory-mapped files
+
+For very large files, combine `LazyNode` with memory mapping via the `StringViews` extension:
+
+```julia
+using XML, StringViews
+
+doc = XML.mmap("very_large.xml", LazyNode)
+```
 
 <br>
 
 # Benchmarks
 
-Details in the benchmark file: [benchmarks.jl](benchmarks/benchmarks.jl)
+Benchmark source: [benchmarks.jl](benchmarks/benchmarks.jl).  Test data: `books.xml` (small, ~4 KB) and a generated XMark auction XML (medium, ~14 MB).
+
 
 
 ```
-============================================================
-  BENCHMARK RESULTS
-============================================================
+                         Parse (small) — median time (ms)
+
+        XML.jl  ■■■■■■■ 0.041
+   XML.jl (SS)  ■■■■■■ 0.034
+         EzXML  ■■■■■ 0.030
+      LightXML  ■■■■■■ 0.033
+       XMLDict  ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 0.232
 
-                      Parse (small) — median time (ms)
 
-     XML.jl  ■■■■■■ 0.031916
-      EzXML  ■■■■ 0.025125
-   LightXML  ■■■■■ 0.029959
-    XMLDict  ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 0.23125
+                         Parse (medium) — median time (ms)
+
+        XML.jl  ■■■■■■■■■■■■ 194.2
+   XML.jl (SS)  ■■■■■■■■■■ 172.8
+         EzXML  ■■■■■■ 105.8
+      LightXML  ■■■■■■ 105.0
+       XMLDict  ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 687.7
 
 
                       Write (small) — median time (ms)
 
-     XML.jl  ■■■■■■■■■■■■ 0.027667
-      EzXML  ■■■■ 0.010375
-   LightXML  ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 0.093584
+     XML.jl  ■■■■■■■■ 0.021
+      EzXML  ■■■■ 0.012
+   LightXML  ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 0.110
+
+
+                      Write (medium) — median time (ms)
+
+     XML.jl  ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 93.2
+      EzXML  ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 84.6
+   LightXML  ■■■■■■■■■■■■■■■■■■■■■■■■■■■ 60.4
+
+
+                        Read file — median time (ms)
+
+     XML.jl  ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 214.1
+      EzXML  ■■■■■■■■■■■■■■■■■■■■■■■■■■■ 143.1
+   LightXML  ■■■■■■■■■■■■■■■■■■■■■■■ 121.9
+
+
+                   Collect tags (small) — median time (ms)
+
+     XML.jl  ■■■■■■ 0.000698
+      EzXML  ■■■■■■■■■■■■■■■■■■■■■■■ 0.00255
+   LightXML  ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 0.00430
 
 
-                       Collect tags — median time (ms)
+                  Collect tags (medium) — median time (ms)
 
-     XML.jl  ■■■■■■ 0.000579122
-      EzXML  ■■■■■■■■■■■■■■■■■■■■■■ 0.0021084
-   LightXML  ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 0.00360413
+     XML.jl  ■■■■■■■■■■■■■■■■■■■ 12.6
+      EzXML  ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 20.5
+   LightXML  ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 27.6
 ```
 
 ```julia
diff --git a/benchmarks/benchmarks.jl b/benchmarks/benchmarks.jl
index 7aecc76..539fce0 100644
--- a/benchmarks/benchmarks.jl
+++ b/benchmarks/benchmarks.jl
@@ -37,14 +37,18 @@ macro add_benchmark(kind, name, expr...)
     end))
 end
 
+const SSNode = Node{SubString{String}}
+
 #-----------------------------------------------------------------------------# Parse (small)
 @add_benchmark "Parse (small)" "XML.jl" parse($small_xml, Node)
+@add_benchmark "Parse (small)" "XML.jl (SS)" parse($small_xml, SSNode)
 @add_benchmark "Parse (small)" "EzXML" EzXML.parsexml($small_xml)
 @add_benchmark "Parse (small)" "LightXML" LightXML.parse_string($small_xml)
 @add_benchmark "Parse (small)" "XMLDict" XMLDict.xml_dict($small_xml)
 
 #-----------------------------------------------------------------------------# Parse (medium)
 @add_benchmark "Parse (medium)" "XML.jl" parse($medium_xml, Node)
+@add_benchmark "Parse (medium)" "XML.jl (SS)" parse($medium_xml, SSNode)
 @add_benchmark "Parse (medium)" "EzXML" EzXML.parsexml($medium_xml)
 @add_benchmark "Parse (medium)" "LightXML" LightXML.parse_string($medium_xml)
 @add_benchmark "Parse (medium)" "XMLDict" XMLDict.xml_dict($medium_xml)
diff --git a/ext/XMLStringViewsExt.jl b/ext/XMLStringViewsExt.jl
index 69d745e..9af8b4d 100644
--- a/ext/XMLStringViewsExt.jl
+++ b/ext/XMLStringViewsExt.jl
@@ -18,9 +18,9 @@ function XML.mmap(filename::AbstractString, ::Type{XML.LazyNode})
         Mmap.mmap(io)
     end
     sv = StringView(bytes)
-    XML.LazyNode(sv, 1, XML.Document)
+    XML.LazyNode(sv, XML.Document)
 end
 
-Base.parse(xml::StringView, ::Type{XML.LazyNode}) = XML.LazyNode(xml, 1, XML.Document)
+Base.parse(xml::StringView, ::Type{XML.LazyNode}) = XML.LazyNode(xml, XML.Document)
 
 end # module
diff --git a/src/XML.jl b/src/XML.jl
index 16c0831..cf450d1 100644
--- a/src/XML.jl
+++ b/src/XML.jl
@@ -1,8 +1,5 @@
 module XML
 
-include("tokenizer.jl")
-using .XMLTokenizer
-
 export
     Node, LazyNode, NodeType,
     CData, Comment, Declaration, Document, DTD, Element, ProcessingInstruction, Text,
@@ -12,6 +9,9 @@ export
     xpath,
     h
 
+include("XMLTokenizer.jl")
+using .XMLTokenizer
+
 #-----------------------------------------------------------------------------# escape/unescape
 const escape_chars = ('&' => "&amp;", '<' => "&lt;", '>' => "&gt;", '\'' => "&apos;", '"' => "&quot;")
 
@@ -395,18 +395,54 @@ function Base.show(io::IO, o::Node)
 end
 
 #-----------------------------------------------------------------------------# show (text/xml)
+
+# Write `s` to `io` with the five XML special characters (& < > " ') replaced by their
+# entity references. Single pass over the code units; unescaped runs are flushed in bulk.
+# `SubString{String}` is accepted too (e.g. for values held by `Node{SubString{String}}`).
+function _write_escaped(io::IO, s::Union{String,SubString{String}})
+    start = i = 1  # `start`: first code unit not yet written; `i`: scan position
+    n = ncodeunits(s)
+    @inbounds while i <= n
+        b = codeunit(s, i)
+        esc = if b == UInt8('&'); "&amp;"
+        elseif b == UInt8('<'); "&lt;"
+        elseif b == UInt8('>'); "&gt;"
+        elseif b == UInt8('"'); "&quot;"
+        elseif b == UInt8('\''); "&apos;"
+        else
+            i += 1
+            continue
+        end
+        i > start && GC.@preserve s Base.unsafe_write(io, pointer(s, start), (i - start) % UInt)
+        print(io, esc)
+        i += 1; start = i  # resume the next unescaped run right after the special byte
+    end
+    start <= n && GC.@preserve s Base.unsafe_write(io, pointer(s, start), (n - start + 1) % UInt)
+    nothing
+end
+
+# Indentation strings for widths 0:_MAX_CACHED_INDENT are built once at load time so the
+# writer does not allocate one per line; wider indents are built on demand.
+const _MAX_CACHED_INDENT = 64
+const _INDENT_STRINGS = [" " ^ width for width in 0:_MAX_CACHED_INDENT]
+@inline function _indent_str(n::Int)
+    return 0 <= n <= _MAX_CACHED_INDENT ? @inbounds(_INDENT_STRINGS[n + 1]) : " " ^ n
+end
+
 function _print_attrs(io::IO, attributes)
     isnothing(attributes) && return
     for (k, v) in attributes
-        print(io, ' ', k, '=', '"', escape(v), '"')
+        print(io, ' ', k, "=\"")
+        _write_escaped(io, v)
+        print(io, '"')
     end
 end
 
 function _write_xml(io::IO, node::Node, depth::Int=0, indent::Int=2, preserve::Bool=false)
-    pad = preserve ? "" : ' ' ^ (indent * depth)
+    pad = preserve ? "" : _indent_str(indent * depth)
     nt = node.nodetype
     if nt === Text
-        print(io, escape(node.value))
+        _write_escaped(io, node.value)
     elseif nt === Element
         # Check xml:space on this element
         child_preserve = preserve
diff --git a/src/XMLTokenizer.jl b/src/XMLTokenizer.jl
new file mode 100644
index 0000000..0f6c9d4
--- /dev/null
+++ b/src/XMLTokenizer.jl
@@ -0,0 +1,496 @@
+"""
+    XMLTokenizer
+
+A self-contained module for tokenizing XML documents into a fine-grained stream of tokens.
+
+# Usage
+
+```julia
+using .XMLTokenizer: tokenize, tag_name, attr_value, pi_target
+
+for token in tokenize(\"\"\"<?xml version="1.0"?><root attr="val">text<!-- comment --></root>\"\"\")
+    println(token)
+end
+```
+"""
+module XMLTokenizer
+
+export tokenize, tag_name, attr_value, pi_target, TokenKind, Token,
+    TOKEN_TEXT,
+    TOKEN_OPEN_TAG, TOKEN_CLOSE_TAG, TOKEN_TAG_CLOSE, TOKEN_SELF_CLOSE,
+    TOKEN_ATTR_NAME, TOKEN_ATTR_VALUE,
+    TOKEN_CDATA_OPEN, TOKEN_CDATA_CONTENT, TOKEN_CDATA_CLOSE,
+    TOKEN_COMMENT_OPEN, TOKEN_COMMENT_CONTENT, TOKEN_COMMENT_CLOSE,
+    TOKEN_PI_OPEN, TOKEN_PI_CONTENT, TOKEN_PI_CLOSE,
+    TOKEN_XML_DECL_OPEN, TOKEN_XML_DECL_CLOSE,
+    TOKEN_DOCTYPE_OPEN, TOKEN_DOCTYPE_CONTENT, TOKEN_DOCTYPE_CLOSE
+
+#-----------------------------------------------------------------------# TokenKind
+@enum TokenKind::UInt8 begin
+    # Character data
+    TOKEN_TEXT               # text content between markup
+
+    # Element tags
+    TOKEN_OPEN_TAG           # <name
+    TOKEN_CLOSE_TAG          # </name
+    TOKEN_TAG_CLOSE          # >
+    TOKEN_SELF_CLOSE         # />
+    TOKEN_ATTR_NAME          # attribute name
+    TOKEN_ATTR_VALUE         # "value" or 'value' (with quotes in raw)
+
+    # CDATA sections
+    TOKEN_CDATA_OPEN         # <![CDATA[
+    TOKEN_CDATA_CONTENT      # raw text content
+    TOKEN_CDATA_CLOSE        # ]]>
+
+    # Comments
+    TOKEN_COMMENT_OPEN       # <!--
+    TOKEN_COMMENT_CONTENT    # comment text
+    TOKEN_COMMENT_CLOSE      # -->
+
+    # Processing instructions
+    TOKEN_PI_OPEN            # <?target (includes target name)
+    TOKEN_PI_CONTENT         # PI body text
+    TOKEN_PI_CLOSE           # ?>
+
+    # XML declaration (<?xml ...?>)
+    TOKEN_XML_DECL_OPEN      # <?xml
+    TOKEN_XML_DECL_CLOSE     # ?>
+    # (reuses TOKEN_ATTR_NAME / TOKEN_ATTR_VALUE for pseudo-attributes)
+
+    # DOCTYPE
+    TOKEN_DOCTYPE_OPEN       # <!DOCTYPE (or other <! declarations)
+    TOKEN_DOCTYPE_CONTENT    # declaration body
+    TOKEN_DOCTYPE_CLOSE      # >
+end
+
+#-----------------------------------------------------------------------# Token
+struct Token{S <: AbstractString}
+    kind::TokenKind
+    raw::SubString{S}
+end
+
+function Base.show(io::IO, t::Token)
+    print(io, t.kind, ": ", repr(String(t.raw)))
+end
+
+#-----------------------------------------------------------------------# Tokenizer state
+@enum _State::UInt8 begin
+    _S_DEFAULT            # normal content mode
+    _S_TAG                # inside open tag, reading attributes
+    _S_TAG_VALUE          # expecting quoted attribute value
+    _S_CLOSE_TAG          # inside close tag, expecting >
+    _S_XML_DECL           # inside <?xml, reading pseudo-attributes
+    _S_XML_DECL_VALUE     # expecting quoted attr value in xml decl
+    _S_COMMENT            # after <!--, reading content
+    _S_CDATA              # after <![CDATA[, reading content
+    _S_PI                 # after <?target, reading content
+    _S_DOCTYPE            # after <!DOCTYPE, reading content
+end
+
+#-----------------------------------------------------------------------# TokenizerState (immutable, SROA-friendly)
+struct TokenizerState{S <: AbstractString}
+    pos::Int
+    state::_State
+    pending::Token{S}
+end
+
+@inline _no_token(s::AbstractString) = Token(TOKEN_TEXT, @inbounds SubString(s, 1, 0))
+@inline _has_pending(st::TokenizerState) = !isempty(st.pending.raw)
+
+@inline function _init_state(data::AbstractString, pos::Int=1)
+    TokenizerState(pos, _S_DEFAULT, _no_token(data))
+end
+
+#-----------------------------------------------------------------------# Tokenizer (immutable iterator)
+struct Tokenizer{S <: AbstractString}
+    data::S     # XML text being tokenized
+    start::Int  # position handed to `_init_state` when iteration begins
+end
+
+"""
+    tokenize(xml::AbstractString[, pos::Int])
+
+Return a lazy iterator of `Token`s over `xml`; with `pos`, an `Iterators.Stateful` one starting at `pos`.
+"""
+tokenize(xml::AbstractString) = Tokenizer(xml, 1)
+tokenize(xml::AbstractString, pos::Int) = Iterators.Stateful(Tokenizer(xml, pos))
+
+function Base.show(io::IO, t::Tokenizer)
+    n = ncodeunits(t.data)
+    print(io, "Tokenizer(")
+    t.start > 1 && print(io, t.start, "/")
+    print(io, n, " bytes)")
+end
+
+Base.IteratorSize(::Type{<:Tokenizer}) = Base.SizeUnknown()
+Base.eltype(::Type{Tokenizer{S}}) where {S} = Token{S}
+
+function Base.iterate(t::Tokenizer, st::TokenizerState=_init_state(t.data, t.start))
+    # `_next_token` yields `(token, next_state)` or `nothing`, which is what `iterate` must return.
+    return _next_token(t.data, st)
+end
+
+#-----------------------------------------------------------------------# Internal helpers
+@inline _iseof(data, pos) = pos > ncodeunits(data)
+@inline _peek(data, pos) = @inbounds codeunit(data, pos)
+@inline _canpeek(data, pos, offset) = pos + offset <= ncodeunits(data)
+
+@inline function _is_name_byte(b::UInt8)
+    (UInt8('a') <= b <= UInt8('z')) || (UInt8('A') <= b <= UInt8('Z')) ||
+    (UInt8('0') <= b <= UInt8('9')) || b == UInt8('_') || b == UInt8('-') ||
+    b == UInt8('.') || b == UInt8(':')
+end
+
+@inline function _is_whitespace(b::UInt8)
+    b == UInt8(' ') || b == UInt8('\t') || b == UInt8('\n') || b == UInt8('\r')
+end
+
+@inline function _skip_whitespace(data, pos)
+    while !_iseof(data, pos) && _is_whitespace(_peek(data, pos))
+        pos += 1
+    end
+    pos
+end
+
+function _skip_quoted(data, pos)
+    q = _peek(data, pos)  # opening quote byte; the next identical byte closes the string
+    i = pos + 1
+    while !_iseof(data, i)
+        _peek(data, i) == q && return i + 1  # position just past the closing quote
+        i += 1
+    end
+    _err("unterminated quoted string", pos)  # ArgumentError at the opening quote, like other tokenizer errors
+end
+
+@noinline _err(msg, pos) = throw(ArgumentError("XML tokenizer error at position $pos: $msg"))
+
+#-----------------------------------------------------------------------# Main dispatch
+function _next_token(data, st::TokenizerState)
+    # A close delimiter queued by a body reader is emitted before any further scanning.
+    _has_pending(st) && return (st.pending, TokenizerState(st.pos, st.state, _no_token(data)))
+    pos, s = st.pos, st.state
+    # EOF is a clean stop only between constructs; in any other state fall through so the
+    # state's reader raises its "unterminated ..." / "expected ..." error instead of ending silently.
+    (s == _S_DEFAULT && _iseof(data, pos)) && return nothing
+
+    if s == _S_DEFAULT
+        _peek(data, pos) == UInt8('<') ? _read_markup(data, pos) : _read_text(data, pos)
+    elseif s == _S_TAG || s == _S_XML_DECL
+        _read_in_tag(data, pos, s)
+    elseif s == _S_TAG_VALUE || s == _S_XML_DECL_VALUE
+        _read_attr_value(data, pos, s)
+    elseif s == _S_CLOSE_TAG
+        _read_close_tag_end(data, pos)
+    elseif s == _S_COMMENT
+        _read_comment_body(data, pos)
+    elseif s == _S_CDATA
+        _read_cdata_body(data, pos)
+    elseif s == _S_PI
+        _read_pi_body(data, pos)
+    else  # _S_DOCTYPE
+        _read_doctype_body(data, pos)
+    end
+end
+
+#-----------------------------------------------------------------------# S_DEFAULT tokens
+function _read_text(data, pos)
+    start = pos
+    while !_iseof(data, pos) && _peek(data, pos) != UInt8('<')
+        pos += 1
+    end
+    tok = Token(TOKEN_TEXT, @inbounds SubString(data, start, prevind(data, pos)))
+    (tok, TokenizerState(pos, _S_DEFAULT, _no_token(data)))
+end
+
+function _read_markup(data, pos)
+    start = pos
+    pos += 1  # skip '<'
+    _iseof(data, pos) && _err("unexpected end of input after '<'", start)
+
+    b = _peek(data, pos)
+    if b == UInt8('!')
+        _read_bang(data, pos + 1, start)
+    elseif b == UInt8('?')
+        _read_pi_start(data, pos + 1, start)
+    elseif b == UInt8('/')
+        _read_close_tag_start(data, pos + 1, start)
+    else
+        _read_open_tag_start(data, pos, start)
+    end
+end
+
+#-----------------------------------------------------------------------# <! dispatch
+function _read_bang(data, pos, start)
+    # Comment: <!--
+    if !_iseof(data, pos) && _peek(data, pos) == UInt8('-')
+        pos += 1
+        (!_iseof(data, pos) && _peek(data, pos) == UInt8('-')) || _err("expected '<!--'", start)
+        pos += 1
+        tok = Token(TOKEN_COMMENT_OPEN, @inbounds SubString(data, start, pos - 1))
+        return (tok, TokenizerState(pos, _S_COMMENT, _no_token(data)))
+    end
+
+    # CDATA: <![CDATA[
+    if !_iseof(data, pos) && _peek(data, pos) == UInt8('[')
+        pos += 1
+        for expected in (UInt8('C'), UInt8('D'), UInt8('A'), UInt8('T'), UInt8('A'), UInt8('['))
+            _iseof(data, pos) && _err("unterminated CDATA", start)
+            _peek(data, pos) == expected || _err("invalid CDATA section", start)
+            pos += 1
+        end
+        tok = Token(TOKEN_CDATA_OPEN, @inbounds SubString(data, start, pos - 1))
+        return (tok, TokenizerState(pos, _S_CDATA, _no_token(data)))
+    end
+
+    # <!DOCTYPE ...> or other <! declaration
+    while !_iseof(data, pos) && _is_name_byte(_peek(data, pos))
+        pos += 1
+    end
+    tok = Token(TOKEN_DOCTYPE_OPEN, @inbounds SubString(data, start, pos - 1))
+    (tok, TokenizerState(pos, _S_DOCTYPE, _no_token(data)))
+end
+
+#-----------------------------------------------------------------------# <? (PI / XML declaration)
+function _read_pi_start(data, pos, start)
+    name_start = pos
+    while !_iseof(data, pos) && _is_name_byte(_peek(data, pos))
+        pos += 1
+    end
+
+    is_xml = (pos - name_start == 3) &&
+        codeunit(data, name_start)     == UInt8('x') &&
+        codeunit(data, name_start + 1) == UInt8('m') &&
+        codeunit(data, name_start + 2) == UInt8('l')
+
+    if is_xml
+        tok = Token(TOKEN_XML_DECL_OPEN, @inbounds SubString(data, start, pos - 1))
+        (tok, TokenizerState(pos, _S_XML_DECL, _no_token(data)))
+    else
+        tok = Token(TOKEN_PI_OPEN, @inbounds SubString(data, start, pos - 1))
+        (tok, TokenizerState(pos, _S_PI, _no_token(data)))
+    end
+end
+
+#-----------------------------------------------------------------------# Tags
+function _read_open_tag_start(data, pos, start)
+    while !_iseof(data, pos) && _is_name_byte(_peek(data, pos))
+        pos += 1
+    end
+    tok = Token(TOKEN_OPEN_TAG, @inbounds SubString(data, start, pos - 1))
+    (tok, TokenizerState(pos, _S_TAG, _no_token(data)))
+end
+
+function _read_close_tag_start(data, pos, start)
+    while !_iseof(data, pos) && _is_name_byte(_peek(data, pos))
+        pos += 1
+    end
+    tok = Token(TOKEN_CLOSE_TAG, @inbounds SubString(data, start, pos - 1))
+    (tok, TokenizerState(pos, _S_CLOSE_TAG, _no_token(data)))
+end
+
+function _read_close_tag_end(data, pos)
+    pos = _skip_whitespace(data, pos)
+    _iseof(data, pos) && _err("unterminated close tag", pos)
+    _peek(data, pos) == UInt8('>') || _err("expected '>'", pos)
+    tok = Token(TOKEN_TAG_CLOSE, @inbounds SubString(data, pos, pos))
+    (tok, TokenizerState(pos + 1, _S_DEFAULT, _no_token(data)))
+end
+
+#-----------------------------------------------------------------------# Attributes (shared by S_TAG and S_XML_DECL)
+function _read_in_tag(data, pos, state)
+    pos = _skip_whitespace(data, pos)
+    _iseof(data, pos) && _err("unterminated tag", pos)
+
+    b = _peek(data, pos)
+    is_decl = (state == _S_XML_DECL)
+
+    # Check for end delimiters
+    if is_decl
+        if b == UInt8('?') && _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8('>')
+            tok = Token(TOKEN_XML_DECL_CLOSE, @inbounds SubString(data, pos, pos + 1))
+            return (tok, TokenizerState(pos + 2, _S_DEFAULT, _no_token(data)))
+        end
+    else
+        if b == UInt8('>')
+            tok = Token(TOKEN_TAG_CLOSE, @inbounds SubString(data, pos, pos))
+            return (tok, TokenizerState(pos + 1, _S_DEFAULT, _no_token(data)))
+        end
+        if b == UInt8('/') && _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8('>')
+            tok = Token(TOKEN_SELF_CLOSE, @inbounds SubString(data, pos, pos + 1))
+            return (tok, TokenizerState(pos + 2, _S_DEFAULT, _no_token(data)))
+        end
+    end
+
+    # Attribute name
+    name_start = pos
+    while !_iseof(data, pos) && _is_name_byte(_peek(data, pos))
+        pos += 1
+    end
+    name_end = pos - 1
+    name_start > name_end && _err("expected attribute name or tag close", pos)
+
+    # Consume '=' and surrounding whitespace (not part of any token)
+    pos = _skip_whitespace(data, pos)
+    (!_iseof(data, pos) && _peek(data, pos) == UInt8('=')) || _err("expected '=' after attribute name", pos)
+    pos += 1
+    pos = _skip_whitespace(data, pos)
+
+    next_state = is_decl ? _S_XML_DECL_VALUE : _S_TAG_VALUE
+    tok = Token(TOKEN_ATTR_NAME, @inbounds SubString(data, name_start, name_end))
+    (tok, TokenizerState(pos, next_state, _no_token(data)))
+end
+
+function _read_attr_value(data, pos, state)
+    _iseof(data, pos) && _err("expected attribute value", pos)
+
+    q = _peek(data, pos)
+    (q == UInt8('"') || q == UInt8('\'')) || _err("expected quoted attribute value", pos)
+
+    start = pos
+    pos += 1  # skip opening quote
+    while !_iseof(data, pos) && _peek(data, pos) != q
+        pos += 1
+    end
+    _iseof(data, pos) && _err("unterminated attribute value", start)
+    pos += 1  # skip closing quote
+
+    next_state = (state == _S_XML_DECL_VALUE) ? _S_XML_DECL : _S_TAG
+    tok = Token(TOKEN_ATTR_VALUE, @inbounds SubString(data, start, pos - 1))
+    (tok, TokenizerState(pos, next_state, _no_token(data)))
+end
+
+#-----------------------------------------------------------------------# Content bodies (comment, CDATA, PI, DOCTYPE)
+function _read_comment_body(data, pos)
+    start = pos
+    while !_iseof(data, pos)
+        if _peek(data, pos) == UInt8('-') &&
+           _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8('-') &&
+           _canpeek(data, pos, 2) && _peek(data, pos + 2) == UInt8('>')
+            content_end = prevind(data, pos)
+            close_start = pos
+            pos += 3
+            pending = Token(TOKEN_COMMENT_CLOSE, @inbounds SubString(data, close_start, pos - 1))
+            tok = Token(TOKEN_COMMENT_CONTENT, @inbounds SubString(data, start, content_end))
+            return (tok, TokenizerState(pos, _S_DEFAULT, pending))
+        end
+        pos += 1
+    end
+    _err("unterminated comment", start)
+end
+
+function _read_cdata_body(data, pos)
+    start = pos
+    while !_iseof(data, pos)
+        if _peek(data, pos) == UInt8(']') &&
+           _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8(']') &&
+           _canpeek(data, pos, 2) && _peek(data, pos + 2) == UInt8('>')
+            content_end = prevind(data, pos)
+            close_start = pos
+            pos += 3
+            pending = Token(TOKEN_CDATA_CLOSE, @inbounds SubString(data, close_start, pos - 1))
+            tok = Token(TOKEN_CDATA_CONTENT, @inbounds SubString(data, start, content_end))
+            return (tok, TokenizerState(pos, _S_DEFAULT, pending))
+        end
+        pos += 1
+    end
+    _err("unterminated CDATA section", start)
+end
+
+function _read_pi_body(data, pos)
+    start = pos
+    while !_iseof(data, pos)
+        if _peek(data, pos) == UInt8('?') && _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8('>')
+            content_end = prevind(data, pos)
+            close_start = pos
+            pos += 2
+            pending = Token(TOKEN_PI_CLOSE, @inbounds SubString(data, close_start, pos - 1))
+            tok = Token(TOKEN_PI_CONTENT, @inbounds SubString(data, start, content_end))
+            return (tok, TokenizerState(pos, _S_DEFAULT, pending))
+        end
+        pos += 1
+    end
+    _err("unterminated processing instruction", start)
+end
+
+function _read_doctype_body(data, pos)
+    start = pos
+    depth = 0
+    while !_iseof(data, pos)
+        b = _peek(data, pos)
+        if b == UInt8('-') && _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8('-') &&
+                pos >= 2 &&
+                codeunit(data, pos - 1) == UInt8('!') &&
+                codeunit(data, pos - 2) == UInt8('<')
+            # Inside a <!-- comment: skip until -->
+            pos += 2  # skip "--"
+            while !_iseof(data, pos)
+                if _peek(data, pos) == UInt8('-') && _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8('-') &&
+                        _canpeek(data, pos, 2) && _peek(data, pos + 2) == UInt8('>')
+                    pos += 3  # skip "-->"
+                    break
+                end
+                pos += 1
+            end
+        elseif b == UInt8('"') || b == UInt8('\'')
+            pos = _skip_quoted(data, pos)
+        elseif b == UInt8('[')
+            depth += 1
+            pos += 1
+        elseif b == UInt8(']')
+            depth -= 1
+            pos += 1
+        elseif b == UInt8('>') && depth == 0
+            content_end = prevind(data, pos)
+            close_start = pos
+            pos += 1
+            pending = Token(TOKEN_DOCTYPE_CLOSE, @inbounds SubString(data, close_start, pos - 1))
+            tok = Token(TOKEN_DOCTYPE_CONTENT, @inbounds SubString(data, start, content_end))
+            return (tok, TokenizerState(pos, _S_DEFAULT, pending))
+        else
+            pos += 1
+        end
+    end
+    _err("unterminated DOCTYPE", start)
+end
+
+#-----------------------------------------------------------------------# Utility functions
+
+"""
+    tag_name(token::Token) -> SubString{String}
+
+Extract the element name from an `OPEN_TAG` or `CLOSE_TAG` token.
+"""
+function tag_name(token::Token)
+    if token.kind == TOKEN_OPEN_TAG
+        @inbounds SubString(token.raw, 2, ncodeunits(token.raw))  # skip '<'
+    elseif token.kind == TOKEN_CLOSE_TAG
+        @inbounds SubString(token.raw, 3, ncodeunits(token.raw))  # skip '</'
+    else
+        throw(ArgumentError("tag_name requires OPEN_TAG or CLOSE_TAG, got $(token.kind)"))
+    end
+end
+
+"""
+    attr_value(token::Token) -> SubString{String}
+
+Strip the surrounding quotes from an `ATTR_VALUE` token.
+"""
+function attr_value(token::Token)
+    token.kind == TOKEN_ATTR_VALUE ||
+        throw(ArgumentError("attr_value requires ATTR_VALUE, got $(token.kind)"))
+    @inbounds SubString(token.raw, 2, prevind(token.raw, lastindex(token.raw)))
+end
+
+"""
+    pi_target(token::Token) -> SubString{String}
+
+Extract the target name from a `PI_OPEN` or `XML_DECL_OPEN` token.
+"""
+function pi_target(token::Token)
+    (token.kind == TOKEN_PI_OPEN || token.kind == TOKEN_XML_DECL_OPEN) ||
+        throw(ArgumentError("pi_target requires PI_OPEN or XML_DECL_OPEN, got $(token.kind)"))
+    @inbounds SubString(token.raw, 3, ncodeunits(token.raw))  # skip '<?'
+end
+
+end # module XMLTokenizer
diff --git a/src/lazynode.jl b/src/lazynode.jl
index 1eeb1a9..2705a93 100644
--- a/src/lazynode.jl
+++ b/src/lazynode.jl
@@ -13,21 +13,25 @@ Supports the same read-only interface as `Node`: [`nodetype`](@ref), [`tag`](@re
 """
 struct LazyNode{S <: AbstractString}
     data::S
-    pos::Int
+    token::Token{S}
     nodetype::NodeType
 end
 
+LazyNode(data::S, nodetype::NodeType) where {S <: AbstractString} =  # empty token => offset 0 => tokenizing starts at 1
+    LazyNode{S}(data, Token(TOKEN_TEXT, SubString(data, 1, 0)), nodetype)
+
 nodetype(n::LazyNode) = n.nodetype
 
-_lazy_tokenizer(n::LazyNode) = tokenize(n.data, n.pos)
+_lazy_pos(n::LazyNode) = n.token.raw.offset + 1
+_lazy_tokenizer(n::LazyNode) = tokenize(n.data, _lazy_pos(n))
 
 #-----------------------------------------------------------------------------# tag / value
 function tag(n::LazyNode)
     nt = n.nodetype
     if nt === Element
-        return String(tag_name(first(_lazy_tokenizer(n))))
+        return String(tag_name(n.token))
     elseif nt === ProcessingInstruction
-        return String(pi_target(first(_lazy_tokenizer(n))))
+        return String(pi_target(n.token))
     end
     nothing
 end
@@ -35,7 +39,7 @@ end
 function value(n::LazyNode)
     nt = n.nodetype
     if nt === Text
-        return unescape(first(_lazy_tokenizer(n)).raw)
+        return unescape(n.token.raw)
     elseif nt === Comment
         iter = _lazy_tokenizer(n)
         iterate(iter)  # COMMENT_OPEN
@@ -136,26 +140,25 @@ function _lazy_collect_children(data::S, iter) where {S <: AbstractString}
     result = LazyNode{S}[]
     for tok in iter
         k = tok.kind
-        pos = tok.raw.offset + 1
         if k === TOKEN_TEXT
-            push!(result, LazyNode(data, pos, Text))
+            push!(result, LazyNode(data, tok, Text))
         elseif k === TOKEN_OPEN_TAG
-            push!(result, LazyNode(data, pos, Element))
+            push!(result, LazyNode(data, tok, Element))
             _lazy_skip_element!(iter)
         elseif k === TOKEN_COMMENT_OPEN
-            push!(result, LazyNode(data, pos, Comment))
+            push!(result, LazyNode(data, tok, Comment))
             _lazy_skip_until!(iter, TOKEN_COMMENT_CLOSE)
         elseif k === TOKEN_CDATA_OPEN
-            push!(result, LazyNode(data, pos, CData))
+            push!(result, LazyNode(data, tok, CData))
             _lazy_skip_until!(iter, TOKEN_CDATA_CLOSE)
         elseif k === TOKEN_PI_OPEN
-            push!(result, LazyNode(data, pos, ProcessingInstruction))
+            push!(result, LazyNode(data, tok, ProcessingInstruction))
             _lazy_skip_until!(iter, TOKEN_PI_CLOSE)
         elseif k === TOKEN_XML_DECL_OPEN
-            push!(result, LazyNode(data, pos, Declaration))
+            push!(result, LazyNode(data, tok, Declaration))
             _lazy_skip_until!(iter, TOKEN_XML_DECL_CLOSE)
         elseif k === TOKEN_DOCTYPE_OPEN
-            push!(result, LazyNode(data, pos, DTD))
+            push!(result, LazyNode(data, tok, DTD))
             _lazy_skip_until!(iter, TOKEN_DOCTYPE_CLOSE)
         elseif k === TOKEN_CLOSE_TAG
             break
@@ -216,7 +219,7 @@ Base.length(n::LazyNode) = length(children(n))
 
 #-----------------------------------------------------------------------------# parse / read
 Base.parse(::Type{LazyNode}, xml::AbstractString) = parse(xml, LazyNode)
-Base.parse(xml::AbstractString, ::Type{LazyNode}) = LazyNode(String(xml), 1, Document)
+Base.parse(xml::AbstractString, ::Type{LazyNode}) = LazyNode(String(xml), Document)
 
 Base.read(filename::AbstractString, ::Type{LazyNode}) = parse(read(filename, String), LazyNode)
 Base.read(io::IO, ::Type{LazyNode}) = parse(read(io, String), LazyNode)
diff --git a/src/tokenizer.jl b/src/tokenizer.jl
deleted file mode 100644
index be8598f..0000000
--- a/src/tokenizer.jl
+++ /dev/null
@@ -1,481 +0,0 @@
-"""
-    XMLTokenizer
-
-A self-contained module for tokenizing XML documents into a fine-grained stream of tokens.
-
-# Usage
-
-```julia
-using .XMLTokenizer: tokenize, tag_name, attr_value, pi_target
-
-for token in tokenize(\"\"\"<?xml version="1.0"?><root attr="val">text<!-- comment --></root>\"\"\")
-    println(token)
-end
-```
-"""
-module XMLTokenizer
-
-export tokenize, tag_name, attr_value, pi_target, TokenKind, Token,
-    TOKEN_TEXT,
-    TOKEN_OPEN_TAG, TOKEN_CLOSE_TAG, TOKEN_TAG_CLOSE, TOKEN_SELF_CLOSE,
-    TOKEN_ATTR_NAME, TOKEN_ATTR_VALUE,
-    TOKEN_CDATA_OPEN, TOKEN_CDATA_CONTENT, TOKEN_CDATA_CLOSE,
-    TOKEN_COMMENT_OPEN, TOKEN_COMMENT_CONTENT, TOKEN_COMMENT_CLOSE,
-    TOKEN_PI_OPEN, TOKEN_PI_CONTENT, TOKEN_PI_CLOSE,
-    TOKEN_XML_DECL_OPEN, TOKEN_XML_DECL_CLOSE,
-    TOKEN_DOCTYPE_OPEN, TOKEN_DOCTYPE_CONTENT, TOKEN_DOCTYPE_CLOSE
-
-#-----------------------------------------------------------------------# TokenKind
-@enum TokenKind::UInt8 begin
-    # Character data
-    TOKEN_TEXT               # text content between markup
-
-    # Element tags
-    TOKEN_OPEN_TAG           # <name
-    TOKEN_CLOSE_TAG          # </name
-    TOKEN_TAG_CLOSE          # >
-    TOKEN_SELF_CLOSE         # />
-    TOKEN_ATTR_NAME          # attribute name
-    TOKEN_ATTR_VALUE         # "value" or 'value' (with quotes in raw)
-
-    # CDATA sections
-    TOKEN_CDATA_OPEN         # <![CDATA[
-    TOKEN_CDATA_CONTENT      # raw text content
-    TOKEN_CDATA_CLOSE        # ]]>
-
-    # Comments
-    TOKEN_COMMENT_OPEN       # <!--
-    TOKEN_COMMENT_CONTENT    # comment text
-    TOKEN_COMMENT_CLOSE      # -->
-
-    # Processing instructions
-    TOKEN_PI_OPEN            # <?target (includes target name)
-    TOKEN_PI_CONTENT         # PI body text
-    TOKEN_PI_CLOSE           # ?>
-
-    # XML declaration (<?xml ...?>)
-    TOKEN_XML_DECL_OPEN      # <?xml
-    TOKEN_XML_DECL_CLOSE     # ?>
-    # (reuses TOKEN_ATTR_NAME / TOKEN_ATTR_VALUE for pseudo-attributes)
-
-    # DOCTYPE
-    TOKEN_DOCTYPE_OPEN       # <!DOCTYPE (or other <! declarations)
-    TOKEN_DOCTYPE_CONTENT    # declaration body
-    TOKEN_DOCTYPE_CLOSE      # >
-end
-
-#-----------------------------------------------------------------------# Token
-struct Token{S <: AbstractString}
-    kind::TokenKind
-    raw::SubString{S}
-end
-
-function Base.show(io::IO, t::Token)
-    print(io, t.kind, ": ", repr(String(t.raw)))
-end
-
-#-----------------------------------------------------------------------# Tokenizer state
-@enum _State::UInt8 begin
-    _S_DEFAULT            # normal content mode
-    _S_TAG                # inside open tag, reading attributes
-    _S_TAG_VALUE          # expecting quoted attribute value
-    _S_CLOSE_TAG          # inside close tag, expecting >
-    _S_XML_DECL           # inside <?xml, reading pseudo-attributes
-    _S_XML_DECL_VALUE     # expecting quoted attr value in xml decl
-    _S_COMMENT            # after <!--, reading content
-    _S_CDATA              # after <![CDATA[, reading content
-    _S_PI                 # after <?target, reading content
-    _S_DOCTYPE            # after <!DOCTYPE, reading content
-end
-
-#-----------------------------------------------------------------------# Tokenizer
-mutable struct Tokenizer{S <: AbstractString}
-    const data::S
-    pos::Int
-    state::_State
-    pending::Union{Token{S},Nothing}
-end
-
-"""
-    tokenize(xml::AbstractString) -> Tokenizer
-
-Return a lazy iterator of `Token`s over the XML string `xml`.
-"""
-tokenize(xml::AbstractString) = Tokenizer(xml, 1, _S_DEFAULT, nothing)
-tokenize(xml::AbstractString, pos::Int) = Tokenizer(xml, pos, _S_DEFAULT, nothing)
-
-Base.IteratorSize(::Type{<:Tokenizer}) = Base.SizeUnknown()
-Base.eltype(::Type{Tokenizer{S}}) where {S} = Token{S}
-
-function Base.iterate(t::Tokenizer, _=nothing)
-    tok = _next_token!(t)
-    tok === nothing ? nothing : (tok, nothing)
-end
-
-#-----------------------------------------------------------------------# Internal helpers
-@inline _iseof(t::Tokenizer) = t.pos > ncodeunits(t.data)
-@inline _peek(t::Tokenizer) = @inbounds codeunit(t.data, t.pos)
-@inline _peek(t::Tokenizer, offset::Int) = @inbounds codeunit(t.data, t.pos + offset)
-@inline _canpeek(t::Tokenizer, offset::Int) = t.pos + offset <= ncodeunits(t.data)
-
-@inline function _is_name_byte(b::UInt8)
-    (UInt8('a') <= b <= UInt8('z')) || (UInt8('A') <= b <= UInt8('Z')) ||
-    (UInt8('0') <= b <= UInt8('9')) || b == UInt8('_') || b == UInt8('-') ||
-    b == UInt8('.') || b == UInt8(':')
-end
-
-@inline function _is_whitespace(b::UInt8)
-    b == UInt8(' ') || b == UInt8('\t') || b == UInt8('\n') || b == UInt8('\r')
-end
-
-function _skip_whitespace!(t::Tokenizer)
-    while !_iseof(t) && _is_whitespace(_peek(t))
-        t.pos += 1
-    end
-end
-
-function _skip_quoted!(t::Tokenizer)
-    q = _peek(t)
-    t.pos += 1
-    while !_iseof(t)
-        _peek(t) == q && (t.pos += 1; return)
-        t.pos += 1
-    end
-    error("Unterminated quoted string")
-end
-
-@noinline _err(msg, pos) = throw(ArgumentError("XML tokenizer error at position $pos: $msg"))
-
-#-----------------------------------------------------------------------# Main dispatch
-function _next_token!(t::Tokenizer)
-    if t.pending !== nothing
-        tok = t.pending::Token
-        t.pending = nothing
-        return tok
-    end
-    _iseof(t) && return nothing
-
-    s = t.state
-    if s == _S_DEFAULT
-        _peek(t) == UInt8('<') ? _read_markup!(t) : _read_text!(t)
-    elseif s == _S_TAG || s == _S_XML_DECL
-        _read_in_tag!(t)
-    elseif s == _S_TAG_VALUE || s == _S_XML_DECL_VALUE
-        _read_attr_value!(t)
-    elseif s == _S_CLOSE_TAG
-        _read_close_tag_end!(t)
-    elseif s == _S_COMMENT
-        _read_comment_body!(t)
-    elseif s == _S_CDATA
-        _read_cdata_body!(t)
-    elseif s == _S_PI
-        _read_pi_body!(t)
-    else  # _S_DOCTYPE
-        _read_doctype_body!(t)
-    end
-end
-
-#-----------------------------------------------------------------------# S_DEFAULT tokens
-function _read_text!(t::Tokenizer)
-    start = t.pos
-    while !_iseof(t) && _peek(t) != UInt8('<')
-        t.pos += 1
-    end
-    Token(TOKEN_TEXT, SubString(t.data, start, prevind(t.data, t.pos)))
-end
-
-function _read_markup!(t::Tokenizer)
-    start = t.pos
-    t.pos += 1  # skip '<'
-    _iseof(t) && _err("unexpected end of input after '<'", start)
-
-    b = _peek(t)
-    if b == UInt8('!')
-        _read_bang!(t, start)
-    elseif b == UInt8('?')
-        _read_pi_start!(t, start)
-    elseif b == UInt8('/')
-        _read_close_tag_start!(t, start)
-    else
-        _read_open_tag_start!(t, start)
-    end
-end
-
-#-----------------------------------------------------------------------# <! dispatch
-function _read_bang!(t::Tokenizer, start::Int)
-    t.pos += 1  # skip '!'
-
-    # Comment: <!--
-    if !_iseof(t) && _peek(t) == UInt8('-')
-        t.pos += 1
-        (!_iseof(t) && _peek(t) == UInt8('-')) || _err("expected '<!--'", start)
-        t.pos += 1
-        t.state = _S_COMMENT
-        return Token(TOKEN_COMMENT_OPEN, SubString(t.data, start, t.pos - 1))
-    end
-
-    # CDATA: <![CDATA[
-    if !_iseof(t) && _peek(t) == UInt8('[')
-        t.pos += 1
-        for expected in (UInt8('C'), UInt8('D'), UInt8('A'), UInt8('T'), UInt8('A'), UInt8('['))
-            _iseof(t) && _err("unterminated CDATA", start)
-            _peek(t) == expected || _err("invalid CDATA section", start)
-            t.pos += 1
-        end
-        t.state = _S_CDATA
-        return Token(TOKEN_CDATA_OPEN, SubString(t.data, start, t.pos - 1))
-    end
-
-    # <!DOCTYPE ...> or other <! declaration
-    while !_iseof(t) && _is_name_byte(_peek(t))
-        t.pos += 1
-    end
-    t.state = _S_DOCTYPE
-    Token(TOKEN_DOCTYPE_OPEN, SubString(t.data, start, t.pos - 1))
-end
-
-#-----------------------------------------------------------------------# <? (PI / XML declaration)
-function _read_pi_start!(t::Tokenizer, start::Int)
-    t.pos += 1  # skip '?'
-    name_start = t.pos
-    while !_iseof(t) && _is_name_byte(_peek(t))
-        t.pos += 1
-    end
-
-    is_xml = (t.pos - name_start == 3) &&
-        codeunit(t.data, name_start)     == UInt8('x') &&
-        codeunit(t.data, name_start + 1) == UInt8('m') &&
-        codeunit(t.data, name_start + 2) == UInt8('l')
-
-    if is_xml
-        t.state = _S_XML_DECL
-        Token(TOKEN_XML_DECL_OPEN, SubString(t.data, start, t.pos - 1))
-    else
-        t.state = _S_PI
-        Token(TOKEN_PI_OPEN, SubString(t.data, start, t.pos - 1))
-    end
-end
-
-#-----------------------------------------------------------------------# Tags
-function _read_open_tag_start!(t::Tokenizer, start::Int)
-    while !_iseof(t) && _is_name_byte(_peek(t))
-        t.pos += 1
-    end
-    t.state = _S_TAG
-    Token(TOKEN_OPEN_TAG, SubString(t.data, start, t.pos - 1))
-end
-
-function _read_close_tag_start!(t::Tokenizer, start::Int)
-    t.pos += 1  # skip '/'
-    while !_iseof(t) && _is_name_byte(_peek(t))
-        t.pos += 1
-    end
-    t.state = _S_CLOSE_TAG
-    Token(TOKEN_CLOSE_TAG, SubString(t.data, start, t.pos - 1))
-end
-
-function _read_close_tag_end!(t::Tokenizer)
-    _skip_whitespace!(t)
-    _iseof(t) && _err("unterminated close tag", t.pos)
-    _peek(t) == UInt8('>') || _err("expected '>'", t.pos)
-    start = t.pos
-    t.pos += 1
-    t.state = _S_DEFAULT
-    Token(TOKEN_TAG_CLOSE, SubString(t.data, start, start))
-end
-
-#-----------------------------------------------------------------------# Attributes (shared by S_TAG and S_XML_DECL)
-function _read_in_tag!(t::Tokenizer)
-    _skip_whitespace!(t)
-    _iseof(t) && _err("unterminated tag", t.pos)
-
-    b = _peek(t)
-    is_decl = (t.state == _S_XML_DECL)
-
-    # Check for end delimiters
-    if is_decl
-        if b == UInt8('?') && _canpeek(t, 1) && _peek(t, 1) == UInt8('>')
-            start = t.pos; t.pos += 2; t.state = _S_DEFAULT
-            return Token(TOKEN_XML_DECL_CLOSE, SubString(t.data, start, t.pos - 1))
-        end
-    else
-        if b == UInt8('>')
-            start = t.pos; t.pos += 1; t.state = _S_DEFAULT
-            return Token(TOKEN_TAG_CLOSE, SubString(t.data, start, start))
-        end
-        if b == UInt8('/') && _canpeek(t, 1) && _peek(t, 1) == UInt8('>')
-            start = t.pos; t.pos += 2; t.state = _S_DEFAULT
-            return Token(TOKEN_SELF_CLOSE, SubString(t.data, start, t.pos - 1))
-        end
-    end
-
-    # Attribute name
-    name_start = t.pos
-    while !_iseof(t) && _is_name_byte(_peek(t))
-        t.pos += 1
-    end
-    name_end = t.pos - 1
-    name_start > name_end && _err("expected attribute name or tag close", t.pos)
-
-    # Consume '=' and surrounding whitespace (not part of any token)
-    _skip_whitespace!(t)
-    (!_iseof(t) && _peek(t) == UInt8('=')) || _err("expected '=' after attribute name", t.pos)
-    t.pos += 1
-    _skip_whitespace!(t)
-
-    t.state = is_decl ? _S_XML_DECL_VALUE : _S_TAG_VALUE
-    Token(TOKEN_ATTR_NAME, SubString(t.data, name_start, name_end))
-end
-
-function _read_attr_value!(t::Tokenizer)
-    _iseof(t) && _err("expected attribute value", t.pos)
-
-    q = _peek(t)
-    (q == UInt8('"') || q == UInt8('\'')) || _err("expected quoted attribute value", t.pos)
-
-    start = t.pos
-    t.pos += 1  # skip opening quote
-    while !_iseof(t) && _peek(t) != q
-        t.pos += 1
-    end
-    _iseof(t) && _err("unterminated attribute value", start)
-    t.pos += 1  # skip closing quote
-
-    t.state = (t.state == _S_XML_DECL_VALUE) ? _S_XML_DECL : _S_TAG
-    Token(TOKEN_ATTR_VALUE, SubString(t.data, start, prevind(t.data, t.pos)))
-end
-
-#-----------------------------------------------------------------------# Content bodies (comment, CDATA, PI, DOCTYPE)
-function _read_comment_body!(t::Tokenizer)
-    start = t.pos
-    while !_iseof(t)
-        if _peek(t) == UInt8('-') &&
-           _canpeek(t, 1) && _peek(t, 1) == UInt8('-') &&
-           _canpeek(t, 2) && _peek(t, 2) == UInt8('>')
-            content_end = prevind(t.data, t.pos)
-            close_start = t.pos
-            t.pos += 3
-            t.state = _S_DEFAULT
-            t.pending = Token(TOKEN_COMMENT_CLOSE, SubString(t.data, close_start, t.pos - 1))
-            return Token(TOKEN_COMMENT_CONTENT, SubString(t.data, start, content_end))
-        end
-        t.pos += 1
-    end
-    _err("unterminated comment", start)
-end
-
-function _read_cdata_body!(t::Tokenizer)
-    start = t.pos
-    while !_iseof(t)
-        if _peek(t) == UInt8(']') &&
-           _canpeek(t, 1) && _peek(t, 1) == UInt8(']') &&
-           _canpeek(t, 2) && _peek(t, 2) == UInt8('>')
-            content_end = prevind(t.data, t.pos)
-            close_start = t.pos
-            t.pos += 3
-            t.state = _S_DEFAULT
-            t.pending = Token(TOKEN_CDATA_CLOSE, SubString(t.data, close_start, t.pos - 1))
-            return Token(TOKEN_CDATA_CONTENT, SubString(t.data, start, content_end))
-        end
-        t.pos += 1
-    end
-    _err("unterminated CDATA section", start)
-end
-
-function _read_pi_body!(t::Tokenizer)
-    start = t.pos
-    while !_iseof(t)
-        if _peek(t) == UInt8('?') && _canpeek(t, 1) && _peek(t, 1) == UInt8('>')
-            content_end = prevind(t.data, t.pos)
-            close_start = t.pos
-            t.pos += 2
-            t.state = _S_DEFAULT
-            t.pending = Token(TOKEN_PI_CLOSE, SubString(t.data, close_start, t.pos - 1))
-            return Token(TOKEN_PI_CONTENT, SubString(t.data, start, content_end))
-        end
-        t.pos += 1
-    end
-    _err("unterminated processing instruction", start)
-end
-
-function _read_doctype_body!(t::Tokenizer)
-    start = t.pos
-    depth = 0
-    while !_iseof(t)
-        b = _peek(t)
-        if b == UInt8('-') && _canpeek(t, 1) && _peek(t, 1) == UInt8('-') &&
-                t.pos >= 2 &&
-                codeunit(t.data, t.pos - 1) == UInt8('!') &&
-                codeunit(t.data, t.pos - 2) == UInt8('<')
-            # Inside a <!-- comment: skip until -->
-            t.pos += 2  # skip "--"
-            while !_iseof(t)
-                if _peek(t) == UInt8('-') && _canpeek(t, 1) && _peek(t, 1) == UInt8('-') &&
-                        _canpeek(t, 2) && _peek(t, 2) == UInt8('>')
-                    t.pos += 3  # skip "-->"
-                    break
-                end
-                t.pos += 1
-            end
-        elseif b == UInt8('"') || b == UInt8('\'')
-            _skip_quoted!(t)
-        elseif b == UInt8('[')
-            depth += 1
-            t.pos += 1
-        elseif b == UInt8(']')
-            depth -= 1
-            t.pos += 1
-        elseif b == UInt8('>') && depth == 0
-            content_end = prevind(t.data, t.pos)
-            close_start = t.pos
-            t.pos += 1
-            t.state = _S_DEFAULT
-            t.pending = Token(TOKEN_DOCTYPE_CLOSE, SubString(t.data, close_start, t.pos - 1))
-            return Token(TOKEN_DOCTYPE_CONTENT, SubString(t.data, start, content_end))
-        else
-            t.pos += 1
-        end
-    end
-    _err("unterminated DOCTYPE", start)
-end
-
-#-----------------------------------------------------------------------# Utility functions
-
-"""
-    tag_name(token::Token) -> SubString{String}
-
-Extract the element name from an `OPEN_TAG` or `CLOSE_TAG` token.
-"""
-function tag_name(token::Token)
-    if token.kind == TOKEN_OPEN_TAG
-        SubString(token.raw, 2, ncodeunits(token.raw))  # skip '<'
-    elseif token.kind == TOKEN_CLOSE_TAG
-        SubString(token.raw, 3, ncodeunits(token.raw))  # skip '</'
-    else
-        throw(ArgumentError("tag_name requires OPEN_TAG or CLOSE_TAG, got $(token.kind)"))
-    end
-end
-
-"""
-    attr_value(token::Token) -> SubString{String}
-
-Strip the surrounding quotes from an `ATTR_VALUE` token.
-"""
-function attr_value(token::Token)
-    token.kind == TOKEN_ATTR_VALUE ||
-        throw(ArgumentError("attr_value requires ATTR_VALUE, got $(token.kind)"))
-    SubString(token.raw, 2, prevind(token.raw, lastindex(token.raw)))
-end
-
-"""
-    pi_target(token::Token) -> SubString{String}
-
-Extract the target name from a `PI_OPEN` or `XML_DECL_OPEN` token.
-"""
-function pi_target(token::Token)
-    (token.kind == TOKEN_PI_OPEN || token.kind == TOKEN_XML_DECL_OPEN) ||
-        throw(ArgumentError("pi_target requires PI_OPEN or XML_DECL_OPEN, got $(token.kind)"))
-    SubString(token.raw, 3, ncodeunits(token.raw))  # skip '<?'
-end
-
-end # module XMLTokenizer

From 8483fed578e74c8a6117a3149286cdf22344f155 Mon Sep 17 00:00:00 2001
From: Josh Day <emailjoshday@gmail.com>
Date: Fri, 6 Mar 2026 20:31:25 -0500
Subject: [PATCH 11/18] Add benchmarks, StringViews tests, simplify XML module

---
 Project.toml             |   1 -
 benchmarks/compare.jl    | 209 +++++++++++++++++++++++++++++++++++++++
 src/XML.jl               |  56 ++++-------
 src/lazynode.jl          |   5 +-
 test/Project.toml        |   1 +
 test/runtests.jl         |   1 +
 test/test_stringviews.jl |  84 ++++++++++++++++
 7 files changed, 316 insertions(+), 41 deletions(-)
 create mode 100644 benchmarks/compare.jl
 create mode 100644 test/test_stringviews.jl

diff --git a/Project.toml b/Project.toml
index 70e0bfb..5e88acc 100644
--- a/Project.toml
+++ b/Project.toml
@@ -13,5 +13,4 @@ StringViews = "354b36f9-a18e-4713-926e-db85100087ba"
 XMLStringViewsExt = "StringViews"
 
 [compat]
-StringViews = "1"
 julia = "1.9"
diff --git a/benchmarks/compare.jl b/benchmarks/compare.jl
new file mode 100644
index 0000000..979b092
--- /dev/null
+++ b/benchmarks/compare.jl
@@ -0,0 +1,209 @@
+#= Compare current dev XML.jl against the last released version.
+
+Usage:
+    julia benchmarks/compare.jl [tag]
+
+`tag` defaults to the latest git tag (e.g. v0.3.8).
+
+This script:
+1. Runs benchmarks using the current (dev) code
+2. Checks out the release tag into a temp worktree
+3. Runs the same benchmarks against that version
+4. Prints a side-by-side comparison
+=#
+
+using BenchmarkTools, Serialization
+
+BenchmarkTools.DEFAULT_PARAMETERS.seconds = 5
+BenchmarkTools.DEFAULT_PARAMETERS.samples = 10000
+
+const ROOT = dirname(@__DIR__)
+
+const RELEASE_TAG = if length(ARGS) >= 1
+    ARGS[1]
+else
+    tags = readlines(`git -C $ROOT tag --sort=version:refname`)
+    filter!(t -> startswith(t, "v"), tags)
+    last(tags)
+end
+
+const SMALL_FILE = joinpath(ROOT, "test", "data", "books.xml")
+const SMALL_XML = read(SMALL_FILE, String)
+
+# Generate medium file if needed
+include(joinpath(ROOT, "benchmarks", "XMarkGenerator.jl"))
+using .XMarkGenerator
+const MEDIUM_FILE = joinpath(ROOT, "benchmarks", "data", "xmark.xml")
+if !isfile(MEDIUM_FILE)
+    mkpath(dirname(MEDIUM_FILE))
+    @info "Generating XMark benchmark XML..."
+    generate_xmark(MEDIUM_FILE, 1.0)
+end
+const MEDIUM_XML = read(MEDIUM_FILE, String)
+
+#-----------------------------------------------------------------------------# Helpers
+function _collect_tags!(out, node)
+    for c in XML.children(node)
+        if XML.nodetype(c) === XML.Element
+            push!(out, XML.tag(c))
+            _collect_tags!(out, c)
+        end
+    end
+end
+
+function bench_collect_tags(node)
+    out = String[]
+    _collect_tags!(out, node)
+    out
+end
+
+#-----------------------------------------------------------------------------# Run dev benchmarks
+println("="^60)
+println("  XML.jl Benchmark Comparison")
+println("  Current (dev) vs $RELEASE_TAG")
+println("="^60)
+println()
+
+print("Running dev benchmarks...")
+flush(stdout)
+
+using XML
+
+dev_results = Dict{String, BenchmarkTools.Trial}()
+
+dev_small = parse(SMALL_XML, Node)
+dev_medium = parse(MEDIUM_XML, Node)
+
+dev_results["Parse (small)"] = @benchmark parse($SMALL_XML, Node)
+dev_results["Parse (small, SS)"] = @benchmark parse($SMALL_XML, Node{SubString{String}})
+dev_results["Parse (medium)"] = @benchmark parse($MEDIUM_XML, Node)
+dev_results["Parse (medium, SS)"] = @benchmark parse($MEDIUM_XML, Node{SubString{String}})
+dev_results["Write (small)"] = @benchmark XML.write($dev_small)
+dev_results["Write (medium)"] = @benchmark XML.write($dev_medium)
+dev_results["Read file (medium)"] = @benchmark read($MEDIUM_FILE, Node)
+dev_results["Collect tags (small)"] = @benchmark bench_collect_tags($dev_small)
+dev_results["Collect tags (medium)"] = @benchmark bench_collect_tags($dev_medium)
+
+println(" done")
+
+#-----------------------------------------------------------------------------# Run release benchmarks via temp worktree + separate process
+print("Setting up $RELEASE_TAG worktree...")
+flush(stdout)
+
+worktree_dir = mktempdir()
+run(pipeline(`git -C $ROOT worktree add $worktree_dir $RELEASE_TAG`, stdout=devnull, stderr=devnull))
+println(" done")
+
+release_results_file = joinpath(worktree_dir, "_results.jls")
+
+release_script = joinpath(worktree_dir, "_bench.jl")
+write(release_script, """
+using Pkg
+Pkg.activate(; temp=true)
+Pkg.develop(path=$(repr(worktree_dir)))
+Pkg.add("BenchmarkTools")
+Pkg.add("Serialization")
+
+using BenchmarkTools, Serialization, XML
+
+BenchmarkTools.DEFAULT_PARAMETERS.seconds = 5
+BenchmarkTools.DEFAULT_PARAMETERS.samples = 10000
+
+small_xml = read($(repr(SMALL_FILE)), String)
+medium_xml = read($(repr(MEDIUM_FILE)), String)
+results = Dict{String, BenchmarkTools.Trial}()
+
+results["Parse (small)"] = @benchmark parse(\$small_xml, Node)
+
+try
+    SSNode = Node{SubString{String}}
+    results["Parse (small, SS)"] = @benchmark parse(\$small_xml, SSNode)
+    results["Parse (medium, SS)"] = @benchmark parse(\$medium_xml, SSNode)
+catch
+end
+
+results["Parse (medium)"] = @benchmark parse(\$medium_xml, Node)
+
+small_node = parse(small_xml, Node)
+medium_node = parse(medium_xml, Node)
+results["Write (small)"] = @benchmark XML.write(\$small_node)
+results["Write (medium)"] = @benchmark XML.write(\$medium_node)
+results["Read file (medium)"] = @benchmark read($(repr(MEDIUM_FILE)), Node)
+
+function _collect_tags!(out, node)
+    for c in XML.children(node)
+        if XML.nodetype(c) === XML.Element
+            push!(out, XML.tag(c))
+            _collect_tags!(out, c)
+        end
+    end
+end
+function bench_collect_tags(node)
+    out = String[]
+    _collect_tags!(out, node)
+    out
+end
+results["Collect tags (small)"] = @benchmark bench_collect_tags(\$small_node)
+results["Collect tags (medium)"] = @benchmark bench_collect_tags(\$medium_node)
+
+serialize($(repr(release_results_file)), results)
+""")
+
+print("Running $RELEASE_TAG benchmarks...")
+flush(stdout)
+release_results = try
+    run(pipeline(`julia $release_script`, stdout=devnull, stderr=devnull))
+    deserialize(release_results_file)
+finally  # always unregister the temporary worktree, even if the release run fails
+    run(pipeline(`git -C $ROOT worktree remove --force $worktree_dir`, stdout=devnull, stderr=devnull))
+end
+println(" done")
+#-----------------------------------------------------------------------------# Compare
+println()
+println("-"^60)
+
+all_keys = [
+    "Parse (small)", "Parse (small, SS)",
+    "Parse (medium)", "Parse (medium, SS)",
+    "Write (small)", "Write (medium)",
+    "Read file (medium)",
+    "Collect tags (small)", "Collect tags (medium)",
+]
+
+for name in all_keys
+    has_dev = haskey(dev_results, name)
+    has_rel = haskey(release_results, name)
+    has_dev || has_rel || continue
+
+    println()
+    println("  $name")
+
+    if has_dev && has_rel
+        dev_med = median(dev_results[name]).time / 1e6
+        rel_med = median(release_results[name]).time / 1e6
+        change = (dev_med / rel_med - 1) * 100
+
+        pct = abs(round(change, digits=1))
+        indicator = if change < -5
+            "$(pct)% faster"
+        elseif change > 5
+            "$(pct)% slower"
+        else
+            "~same"
+        end
+
+        lpad_tag = lpad(RELEASE_TAG, 12)
+        lpad_dev = lpad("dev", 12)
+        println("    $lpad_tag  $(lpad(string(round(rel_med, digits=4), " ms"), 12))")
+        println("    $lpad_dev  $(lpad(string(round(dev_med, digits=4), " ms"), 12))  ($indicator)")
+    elseif has_dev
+        dev_med = median(dev_results[name]).time / 1e6
+        lpad_tag = lpad(RELEASE_TAG, 12)
+        lpad_dev = lpad("dev", 12)
+        println("    $lpad_tag  $(lpad("n/a", 12))")
+        println("    $lpad_dev  $(lpad(string(round(dev_med, digits=4), " ms"), 12))")
+    end
+end
+
+println()
+println("="^60)
diff --git a/src/XML.jl b/src/XML.jl
index cf450d1..24b0675 100644
--- a/src/XML.jl
+++ b/src/XML.jl
@@ -27,46 +27,26 @@ Escape the five XML predefined entities: `&` `<` `>` `'` `"`.
 """
 escape(x::AbstractString) = replace(x, escape_chars...)
 
+# Replace a numeric character reference with its Unicode character.
+# Numeric character references encode characters by code point: decimal (&#233; → é) or hex (&#xE9; → é).
+function _unescape_charref(ref::AbstractString)
+    is_hex = length(ref) > 3 && ref[3] in ('x', 'X')
+    digits = SubString(ref, is_hex ? 4 : 3, length(ref) - 1)
+    cp = tryparse(UInt32, digits; base = is_hex ? 16 : 10)
+    !isnothing(cp) && isvalid(Char, cp) ? string(Char(cp)) : ref
+end
+
+"""
+    unescape(x::AbstractString) -> String
+
+Unescape XML entities in `x`: the five predefined entities (`&amp;` `&lt;` `&gt;` `&apos;`
+`&quot;`) and numeric character references (`&#123;`, `&#xAB;`).  Each reference is processed
+exactly once (no double-unescaping).
+"""
 function unescape(x::AbstractString)
     occursin('&', x) || return string(x)
-    s = string(x)
-    io = IOBuffer(sizehint=ncodeunits(s))
-    i = 1
-    while i <= ncodeunits(s)
-        if s[i] == '&'
-            j = findnext(';', s, i + 1)
-            if !isnothing(j)
-                ref = SubString(s, i, j)
-                if ref == "&amp;"
-                    print(io, '&')
-                elseif ref == "&lt;"
-                    print(io, '<')
-                elseif ref == "&gt;"
-                    print(io, '>')
-                elseif ref == "&apos;"
-                    print(io, '\'')
-                elseif ref == "&quot;"
-                    print(io, '"')
-                elseif startswith(ref, "&#")
-                    is_hex = length(ref) > 3 && (ref[3] == 'x' || ref[3] == 'X')
-                    digits = SubString(s, i + (is_hex ? 3 : 2), j - 1)
-                    cp = tryparse(UInt32, digits; base = is_hex ? 16 : 10)
-                    if !isnothing(cp) && isvalid(Char, cp)
-                        print(io, Char(cp))
-                    else
-                        print(io, ref)
-                    end
-                else
-                    print(io, ref)
-                end
-                i = j + 1
-                continue
-            end
-        end
-        print(io, s[i])
-        i = nextind(s, i)
-    end
-    String(take!(io))
+    replace(string(x), r"&#[xX]?[0-9a-fA-F]+;" => _unescape_charref,  # one pass: output is never rescanned
+        "&lt;" => "<", "&gt;" => ">", "&apos;" => "'", "&quot;" => "\"", "&amp;" => "&")
 end
 
 #-----------------------------------------------------------------------------# NodeType
diff --git a/src/lazynode.jl b/src/lazynode.jl
index 2705a93..1f22513 100644
--- a/src/lazynode.jl
+++ b/src/lazynode.jl
@@ -17,8 +17,9 @@ struct LazyNode{S <: AbstractString}
     nodetype::NodeType
 end
 
-LazyNode(data::S, ::Type{Document}) where {S <: AbstractString} =
-    LazyNode{S}(data, Token(TOKEN_TEXT, SubString(data, 1, 0)), Document)
+function LazyNode(data::S, nt::NodeType) where {S <: AbstractString}
+    LazyNode{S}(data, Token(TOKEN_TEXT, SubString(data, 1, 0)), nt)
+end
 
 nodetype(n::LazyNode) = n.nodetype
 
diff --git a/test/Project.toml b/test/Project.toml
index c1703f7..6e6fc4f 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,5 +1,6 @@
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
 Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
+StringViews = "354b36f9-a18e-4713-926e-db85100087ba"
 Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/test/runtests.jl b/test/runtests.jl
index 148488b..b14c6e1 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2729,3 +2729,4 @@ end
 include("test_pugixml.jl")
 include("test_libexpat.jl")
 include("test_w3c.jl")
+include("test_stringviews.jl")
diff --git a/test/test_stringviews.jl b/test/test_stringviews.jl
new file mode 100644
index 0000000..f00ce99
--- /dev/null
+++ b/test/test_stringviews.jl
@@ -0,0 +1,84 @@
+using Test, XML, StringViews
+
+@testset "XMLStringViewsExt" begin
+    xml_content = """<?xml version="1.0"?><root><child id="1">hello</child><child id="2">world</child></root>"""
+    tmpfile = tempname() * ".xml"
+    write(tmpfile, xml_content)
+
+    @testset "mmap" begin
+        doc = XML.mmap(tmpfile, LazyNode)
+        @test nodetype(doc) === Document
+        ch = children(doc)
+        @test length(ch) == 2  # declaration + root
+
+        decl = ch[1]
+        @test nodetype(decl) === Declaration
+
+        root = ch[2]
+        @test nodetype(root) === Element
+        @test tag(root) == "root"
+
+        root_children = children(root)
+        @test length(root_children) == 2
+        @test tag(root_children[1]) == "child"
+        @test tag(root_children[2]) == "child"
+        @test value(children(root_children[1])[1]) == "hello"
+        @test value(children(root_children[2])[1]) == "world"
+    end
+
+    @testset "mmap attributes" begin
+        doc = XML.mmap(tmpfile, LazyNode)
+        root = children(doc)[2]
+        child1 = children(root)[1]
+        @test child1["id"] == "1"
+        child2 = children(root)[2]
+        @test child2["id"] == "2"
+    end
+
+    @testset "parse StringView" begin
+        sv = StringView(Vector{UInt8}(xml_content))
+        doc = parse(sv, LazyNode)
+        @test nodetype(doc) === Document
+        root = children(doc)[2]
+        @test tag(root) == "root"
+        @test length(children(root)) == 2
+    end
+
+    @testset "mmap with complex document" begin
+        complex_xml = """<?xml version="1.0"?>
+<catalog>
+  <book id="bk101">
+    <title>XML Developer's Guide</title>
+    <price>44.95</price>
+  </book>
+  <!-- a comment -->
+  <![CDATA[some raw data]]>
+</catalog>"""
+        tmpfile2 = tempname() * ".xml"
+        write(tmpfile2, complex_xml)
+
+        doc = XML.mmap(tmpfile2, LazyNode)
+        root = last(c for c in children(doc) if nodetype(c) === Element)
+        @test tag(root) == "catalog"
+
+        ch = children(root)
+        book = first(c for c in ch if nodetype(c) === Element)
+        @test tag(book) == "book"
+        @test book["id"] == "bk101"
+
+        title = first(c for c in children(book) if nodetype(c) === Element && tag(c) == "title")
+        @test simple_value(title) == "XML Developer's Guide"
+
+        comments = [c for c in ch if nodetype(c) === Comment]
+        @test length(comments) == 1
+        @test value(comments[1]) == " a comment "
+
+        cdatas = [c for c in ch if nodetype(c) === CData]
+        @test length(cdatas) == 1
+        @test value(cdatas[1]) == "some raw data"
+
+        rm(tmpfile2, force=true)
+    end
+
+    rm(tmpfile, force=true)
+end

From eb5caebd93ca1dff87256e63d72973ca2848e468 Mon Sep 17 00:00:00 2001
From: Josh Day <emailjoshday@gmail.com>
Date: Fri, 6 Mar 2026 21:10:10 -0500
Subject: [PATCH 12/18] Add GC.gc before tmpfile cleanup for Windows

---
 test/test_stringviews.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_stringviews.jl b/test/test_stringviews.jl
index f00ce99..051cecf 100644
--- a/test/test_stringviews.jl
+++ b/test/test_stringviews.jl
@@ -80,5 +80,6 @@ using Test, XML, StringViews
         rm(tmpfile2, force=true)
     end
 
+    GC.gc()  # release mmap before deleting on Windows
     rm(tmpfile, force=true)
 end

From b914bfe872d52fd45388481abd57bf0f73a2d984 Mon Sep 17 00:00:00 2001
From: Josh Day <emailjoshday@gmail.com>
Date: Fri, 6 Mar 2026 21:17:02 -0500
Subject: [PATCH 13/18] Bump version to v0.4.0

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 5e88acc..9cf8c46 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "XML"
 uuid = "72c71f33-b9b6-44de-8c94-c961784809e2"
-version = "0.3.8"
+version = "0.4.0"
 authors = ["Josh Day <emailjoshday@gmail.com> and contributors"]
 
 [deps]

From d76c484f1a0be251fed6a2ccf1f0a7f77ed2f448 Mon Sep 17 00:00:00 2001
From: Josh Day <emailjoshday@gmail.com>
Date: Fri, 6 Mar 2026 21:30:23 -0500
Subject: [PATCH 14/18] Use mktempdir for temp file cleanup in StringViews
 tests

---
 test/test_stringviews.jl | 117 +++++++++++++++++++--------------------
 1 file changed, 57 insertions(+), 60 deletions(-)

diff --git a/test/test_stringviews.jl b/test/test_stringviews.jl
index 051cecf..59d8ef5 100644
--- a/test/test_stringviews.jl
+++ b/test/test_stringviews.jl
@@ -1,51 +1,53 @@
 using Test, XML, StringViews
 
 @testset "XMLStringViewsExt" begin
-    xml_content = """<?xml version="1.0"?><root><child id="1">hello</child><child id="2">world</child></root>"""
-    tmpfile = tempname() * ".xml"
-    write(tmpfile, xml_content)
+    mktempdir() do dir
+        tmpfile = joinpath(dir, "simple.xml")
+        xml_content = """<?xml version="1.0"?><root><child id="1">hello</child><child id="2">world</child></root>"""
+        write(tmpfile, xml_content)
 
-    @testset "mmap" begin
-        doc = XML.mmap(tmpfile, LazyNode)
-        @test nodetype(doc) === Document
-        ch = children(doc)
-        @test length(ch) == 2  # declaration + root
+        @testset "mmap" begin
+            doc = XML.mmap(tmpfile, LazyNode)
+            @test nodetype(doc) === Document
+            ch = children(doc)
+            @test length(ch) == 2  # declaration + root
 
-        decl = ch[1]
-        @test nodetype(decl) === Declaration
+            decl = ch[1]
+            @test nodetype(decl) === Declaration
 
-        root = ch[2]
-        @test nodetype(root) === Element
-        @test tag(root) == "root"
+            root = ch[2]
+            @test nodetype(root) === Element
+            @test tag(root) == "root"
 
-        root_children = children(root)
-        @test length(root_children) == 2
-        @test tag(root_children[1]) == "child"
-        @test tag(root_children[2]) == "child"
-        @test value(children(root_children[1])[1]) == "hello"
-        @test value(children(root_children[2])[1]) == "world"
-    end
+            root_children = children(root)
+            @test length(root_children) == 2
+            @test tag(root_children[1]) == "child"
+            @test tag(root_children[2]) == "child"
+            @test value(children(root_children[1])[1]) == "hello"
+            @test value(children(root_children[2])[1]) == "world"
+        end
 
-    @testset "mmap attributes" begin
-        doc = XML.mmap(tmpfile, LazyNode)
-        root = children(doc)[2]
-        child1 = children(root)[1]
-        @test child1["id"] == "1"
-        child2 = children(root)[2]
-        @test child2["id"] == "2"
-    end
+        @testset "mmap attributes" begin
+            doc = XML.mmap(tmpfile, LazyNode)
+            root = children(doc)[2]
+            child1 = children(root)[1]
+            @test child1["id"] == "1"
+            child2 = children(root)[2]
+            @test child2["id"] == "2"
+        end
 
-    @testset "parse StringView" begin
-        sv = StringView(Vector{UInt8}(xml_content))
-        doc = parse(sv, LazyNode)
-        @test nodetype(doc) === Document
-        root = children(doc)[2]
-        @test tag(root) == "root"
-        @test length(children(root)) == 2
-    end
+        @testset "parse StringView" begin
+            sv = StringView(Vector{UInt8}(xml_content))
+            doc = parse(sv, LazyNode)
+            @test nodetype(doc) === Document
+            root = children(doc)[2]
+            @test tag(root) == "root"
+            @test length(children(root)) == 2
+        end
 
-    @testset "mmap with complex document" begin
-        complex_xml = """<?xml version="1.0"?>
+        @testset "mmap with complex document" begin
+            tmpfile2 = joinpath(dir, "complex.xml")
+            complex_xml = """<?xml version="1.0"?>
 <catalog>
   <book id="bk101">
     <title>XML Developer's Guide</title>
@@ -54,32 +56,27 @@ using Test, XML, StringViews
   <!-- a comment -->
   <![CDATA[some raw data]]>
 </catalog>"""
-        tmpfile2 = tempname() * ".xml"
-        write(tmpfile2, complex_xml)
+            write(tmpfile2, complex_xml)
 
-        doc = XML.mmap(tmpfile2, LazyNode)
-        root = last(c for c in children(doc) if nodetype(c) === Element)
-        @test tag(root) == "catalog"
+            doc = XML.mmap(tmpfile2, LazyNode)
+            root = last(c for c in children(doc) if nodetype(c) === Element)
+            @test tag(root) == "catalog"
 
-        ch = children(root)
-        book = first(c for c in ch if nodetype(c) === Element)
-        @test tag(book) == "book"
-        @test book["id"] == "bk101"
+            ch = children(root)
+            book = first(c for c in ch if nodetype(c) === Element)
+            @test tag(book) == "book"
+            @test book["id"] == "bk101"
 
-        title = first(c for c in children(book) if nodetype(c) === Element && tag(c) == "title")
-        @test simple_value(title) == "XML Developer's Guide"
+            title = first(c for c in children(book) if nodetype(c) === Element && tag(c) == "title")
+            @test simple_value(title) == "XML Developer's Guide"
 
-        comments = [c for c in ch if nodetype(c) === Comment]
-        @test length(comments) == 1
-        @test value(comments[1]) == " a comment "
+            comments = [c for c in ch if nodetype(c) === Comment]
+            @test length(comments) == 1
+            @test value(comments[1]) == " a comment "
 
-        cdatas = [c for c in ch if nodetype(c) === CData]
-        @test length(cdatas) == 1
-        @test value(cdatas[1]) == "some raw data"
-
-        rm(tmpfile2, force=true)
+            cdatas = [c for c in ch if nodetype(c) === CData]
+            @test length(cdatas) == 1
+            @test value(cdatas[1]) == "some raw data"
+        end
     end
-
-    GC.gc()  # release mmap before deleting on Windows
-    rm(tmpfile, force=true)
 end

From 41836ae8224e3866c0024768e34a803b2ec806cc Mon Sep 17 00:00:00 2001
From: Josh Day <emailjoshday@gmail.com>
Date: Sun, 8 Mar 2026 15:05:52 -0400
Subject: [PATCH 15/18] Remove StringViews extension and simplify tokenizer

---
 Project.toml             |   9 ----
 README.md                |   9 ++--
 ext/XMLStringViewsExt.jl |  26 ----------
 src/XML.jl               |  17 +++----
 src/XMLTokenizer.jl      | 101 ++++++++++++++++++---------------------
 test/Project.toml        |   1 -
 test/runtests.jl         |   1 -
 test/test_stringviews.jl |  82 -------------------------------
 8 files changed, 61 insertions(+), 185 deletions(-)
 delete mode 100644 ext/XMLStringViewsExt.jl
 delete mode 100644 test/test_stringviews.jl

diff --git a/Project.toml b/Project.toml
index 9cf8c46..65925ad 100644
--- a/Project.toml
+++ b/Project.toml
@@ -3,14 +3,5 @@ uuid = "72c71f33-b9b6-44de-8c94-c961784809e2"
 version = "0.4.0"
 authors = ["Josh Day <emailjoshday@gmail.com> and contributors"]
 
-[deps]
-Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
-
-[weakdeps]
-StringViews = "354b36f9-a18e-4713-926e-db85100087ba"
-
-[extensions]
-XMLStringViewsExt = "StringViews"
-
 [compat]
 julia = "1.9"
diff --git a/README.md b/README.md
index f59e00d..e50a591 100644
--- a/README.md
+++ b/README.md
@@ -206,12 +206,15 @@ doc = read("file.xml", LazyNode)
 
 ### Memory-mapped files
 
-For very large files, combine `LazyNode` with memory mapping via the `StringViews` extension:
+For very large files, combine `LazyNode` with memory mapping to avoid reading the entire file into heap memory:
 
 ```julia
-using XML, StringViews
+using XML, Mmap, StringViews
 
-doc = XML.mmap("very_large.xml", LazyNode)
+doc = open("very_large.xml") do io
+    sv = StringView(Mmap.mmap(io))
+    parse(sv, LazyNode)
+end
 ```
 
 <br>
diff --git a/ext/XMLStringViewsExt.jl b/ext/XMLStringViewsExt.jl
deleted file mode 100644
index 9af8b4d..0000000
--- a/ext/XMLStringViewsExt.jl
+++ /dev/null
@@ -1,26 +0,0 @@
-module XMLStringViewsExt
-
-using XML
-using StringViews: StringView
-using Mmap: Mmap
-
-"""
-    XML.mmap(filename, LazyNode) -> LazyNode
-
-Memory-map `filename` and return a `LazyNode` backed by a `StringView` over the mapped bytes.
-The file contents are not copied into Julia heap memory, making this suitable for very large
-XML files.
-
-Requires `using StringViews` to activate this method.
-"""
-function XML.mmap(filename::AbstractString, ::Type{XML.LazyNode})
-    bytes = open(filename) do io
-        Mmap.mmap(io)
-    end
-    sv = StringView(bytes)
-    XML.LazyNode(sv, XML.Document)
-end
-
-Base.parse(xml::StringView, ::Type{XML.LazyNode}) = XML.LazyNode(xml, XML.Document)
-
-end # module
diff --git a/src/XML.jl b/src/XML.jl
index 24b0675..70c788a 100644
--- a/src/XML.jl
+++ b/src/XML.jl
@@ -10,7 +10,15 @@ export
     h
 
 include("XMLTokenizer.jl")
-using .XMLTokenizer
+using .XMLTokenizer:
+    tokenize, tag_name, attr_value, pi_target, TokenKind, Token, Tokenizer, TokenizerState,
+    TOKEN_TEXT, TOKEN_OPEN_TAG, TOKEN_CLOSE_TAG, TOKEN_TAG_CLOSE, TOKEN_SELF_CLOSE,
+    TOKEN_ATTR_NAME, TOKEN_ATTR_VALUE,
+    TOKEN_CDATA_OPEN, TOKEN_CDATA_CONTENT, TOKEN_CDATA_CLOSE,
+    TOKEN_COMMENT_OPEN, TOKEN_COMMENT_CONTENT, TOKEN_COMMENT_CLOSE,
+    TOKEN_PI_OPEN, TOKEN_PI_CONTENT, TOKEN_PI_CLOSE,
+    TOKEN_XML_DECL_OPEN, TOKEN_XML_DECL_CLOSE,
+    TOKEN_DOCTYPE_OPEN, TOKEN_DOCTYPE_CONTENT, TOKEN_DOCTYPE_CLOSE
 
 #-----------------------------------------------------------------------------# escape/unescape
 const escape_chars = ('&' => "&amp;", '<' => "&lt;", '>' => "&gt;", '\'' => "&apos;", '"' => "&quot;")
@@ -197,13 +205,6 @@ end
 include("xpath.jl")
 include("lazynode.jl")
 
-"""
-    XML.mmap(filename, LazyNode) -> LazyNode
-
-Memory-map `filename` and return a `LazyNode` backed by a `StringView`.
-Requires `using StringViews` to activate.
-"""
-function mmap end
 
 #-----------------------------------------------------------------------------# _to_node
 _to_node(n::Node{String}) = n
diff --git a/src/XMLTokenizer.jl b/src/XMLTokenizer.jl
index 0f6c9d4..9b05ef5 100644
--- a/src/XMLTokenizer.jl
+++ b/src/XMLTokenizer.jl
@@ -15,15 +15,6 @@ end
 """
 module XMLTokenizer
 
-export tokenize, tag_name, attr_value, pi_target, TokenKind, Token,
-    TOKEN_TEXT,
-    TOKEN_OPEN_TAG, TOKEN_CLOSE_TAG, TOKEN_TAG_CLOSE, TOKEN_SELF_CLOSE,
-    TOKEN_ATTR_NAME, TOKEN_ATTR_VALUE,
-    TOKEN_CDATA_OPEN, TOKEN_CDATA_CONTENT, TOKEN_CDATA_CLOSE,
-    TOKEN_COMMENT_OPEN, TOKEN_COMMENT_CONTENT, TOKEN_COMMENT_CLOSE,
-    TOKEN_PI_OPEN, TOKEN_PI_CONTENT, TOKEN_PI_CLOSE,
-    TOKEN_XML_DECL_OPEN, TOKEN_XML_DECL_CLOSE,
-    TOKEN_DOCTYPE_OPEN, TOKEN_DOCTYPE_CONTENT, TOKEN_DOCTYPE_CLOSE
 
 #-----------------------------------------------------------------------# TokenKind
 @enum TokenKind::UInt8 begin
@@ -74,24 +65,24 @@ function Base.show(io::IO, t::Token)
     print(io, t.kind, ": ", repr(String(t.raw)))
 end
 
-#-----------------------------------------------------------------------# Tokenizer state
-@enum _State::UInt8 begin
-    _S_DEFAULT            # normal content mode
-    _S_TAG                # inside open tag, reading attributes
-    _S_TAG_VALUE          # expecting quoted attribute value
-    _S_CLOSE_TAG          # inside close tag, expecting >
-    _S_XML_DECL           # inside <?xml, reading pseudo-attributes
-    _S_XML_DECL_VALUE     # expecting quoted attr value in xml decl
-    _S_COMMENT            # after <!--, reading content
-    _S_CDATA              # after <![CDATA[, reading content
-    _S_PI                 # after <?target, reading content
-    _S_DOCTYPE            # after <!DOCTYPE, reading content
+#-----------------------------------------------------------------------# Tokenizer mode
+@enum _Mode::UInt8 begin
+    _M_DEFAULT            # normal content mode
+    _M_TAG                # inside open tag, reading attributes
+    _M_TAG_VALUE          # expecting quoted attribute value
+    _M_CLOSE_TAG          # inside close tag, expecting >
+    _M_XML_DECL           # inside <?xml, reading pseudo-attributes
+    _M_XML_DECL_VALUE     # expecting quoted attr value in xml decl
+    _M_COMMENT            # after <!--, reading content
+    _M_CDATA              # after <![CDATA[, reading content
+    _M_PI                 # after <?target, reading content
+    _M_DOCTYPE            # after <!DOCTYPE, reading content
 end
 
 #-----------------------------------------------------------------------# TokenizerState (immutable, SROA-friendly)
 struct TokenizerState{S <: AbstractString}
     pos::Int
-    state::_State
+    mode::_Mode
     pending::Token{S}
 end
 
@@ -99,7 +90,7 @@ end
 @inline _has_pending(st::TokenizerState) = !isempty(st.pending.raw)
 
 @inline function _init_state(data::AbstractString, pos::Int=1)
-    TokenizerState(pos, _S_DEFAULT, _no_token(data))
+    TokenizerState(pos, _M_DEFAULT, _no_token(data))
 end
 
 #-----------------------------------------------------------------------# Tokenizer (immutable iterator)
@@ -120,7 +111,7 @@ function Base.show(io::IO, t::Tokenizer)
     n = ncodeunits(t.data)
     print(io, "Tokenizer(")
     t.start > 1 && print(io, t.start, "/")
-    print(io, n, " bytes)")
+    print(io, Base.format_bytes(n), ")")
 end
 
 Base.IteratorSize(::Type{<:Tokenizer}) = Base.SizeUnknown()
@@ -168,27 +159,27 @@ end
 #-----------------------------------------------------------------------# Main dispatch
 function _next_token(data, st::TokenizerState)
     if _has_pending(st)
-        return (st.pending, TokenizerState(st.pos, st.state, _no_token(data)))
+        return (st.pending, TokenizerState(st.pos, st.mode, _no_token(data)))
     end
     pos = st.pos
     _iseof(data, pos) && return nothing
 
-    s = st.state
-    if s == _S_DEFAULT
+    s = st.mode
+    if s == _M_DEFAULT
         _peek(data, pos) == UInt8('<') ? _read_markup(data, pos) : _read_text(data, pos)
-    elseif s == _S_TAG || s == _S_XML_DECL
+    elseif s == _M_TAG || s == _M_XML_DECL
         _read_in_tag(data, pos, s)
-    elseif s == _S_TAG_VALUE || s == _S_XML_DECL_VALUE
+    elseif s == _M_TAG_VALUE || s == _M_XML_DECL_VALUE
         _read_attr_value(data, pos, s)
-    elseif s == _S_CLOSE_TAG
+    elseif s == _M_CLOSE_TAG
         _read_close_tag_end(data, pos)
-    elseif s == _S_COMMENT
+    elseif s == _M_COMMENT
         _read_comment_body(data, pos)
-    elseif s == _S_CDATA
+    elseif s == _M_CDATA
         _read_cdata_body(data, pos)
-    elseif s == _S_PI
+    elseif s == _M_PI
         _read_pi_body(data, pos)
-    else  # _S_DOCTYPE
+    else  # _M_DOCTYPE
         _read_doctype_body(data, pos)
     end
 end
@@ -200,7 +191,7 @@ function _read_text(data, pos)
         pos += 1
     end
     tok = Token(TOKEN_TEXT, @inbounds SubString(data, start, prevind(data, pos)))
-    (tok, TokenizerState(pos, _S_DEFAULT, _no_token(data)))
+    (tok, TokenizerState(pos, _M_DEFAULT, _no_token(data)))
 end
 
 function _read_markup(data, pos)
@@ -228,7 +219,7 @@ function _read_bang(data, pos, start)
         (!_iseof(data, pos) && _peek(data, pos) == UInt8('-')) || _err("expected '<!--'", start)
         pos += 1
         tok = Token(TOKEN_COMMENT_OPEN, @inbounds SubString(data, start, pos - 1))
-        return (tok, TokenizerState(pos, _S_COMMENT, _no_token(data)))
+        return (tok, TokenizerState(pos, _M_COMMENT, _no_token(data)))
     end
 
     # CDATA: <![CDATA[
@@ -240,7 +231,7 @@ function _read_bang(data, pos, start)
             pos += 1
         end
         tok = Token(TOKEN_CDATA_OPEN, @inbounds SubString(data, start, pos - 1))
-        return (tok, TokenizerState(pos, _S_CDATA, _no_token(data)))
+        return (tok, TokenizerState(pos, _M_CDATA, _no_token(data)))
     end
 
     # <!DOCTYPE ...> or other <! declaration
@@ -248,7 +239,7 @@ function _read_bang(data, pos, start)
         pos += 1
     end
     tok = Token(TOKEN_DOCTYPE_OPEN, @inbounds SubString(data, start, pos - 1))
-    (tok, TokenizerState(pos, _S_DOCTYPE, _no_token(data)))
+    (tok, TokenizerState(pos, _M_DOCTYPE, _no_token(data)))
 end
 
 #-----------------------------------------------------------------------# <? (PI / XML declaration)
@@ -265,10 +256,10 @@ function _read_pi_start(data, pos, start)
 
     if is_xml
         tok = Token(TOKEN_XML_DECL_OPEN, @inbounds SubString(data, start, pos - 1))
-        (tok, TokenizerState(pos, _S_XML_DECL, _no_token(data)))
+        (tok, TokenizerState(pos, _M_XML_DECL, _no_token(data)))
     else
         tok = Token(TOKEN_PI_OPEN, @inbounds SubString(data, start, pos - 1))
-        (tok, TokenizerState(pos, _S_PI, _no_token(data)))
+        (tok, TokenizerState(pos, _M_PI, _no_token(data)))
     end
 end
 
@@ -278,7 +269,7 @@ function _read_open_tag_start(data, pos, start)
         pos += 1
     end
     tok = Token(TOKEN_OPEN_TAG, @inbounds SubString(data, start, pos - 1))
-    (tok, TokenizerState(pos, _S_TAG, _no_token(data)))
+    (tok, TokenizerState(pos, _M_TAG, _no_token(data)))
 end
 
 function _read_close_tag_start(data, pos, start)
@@ -286,7 +277,7 @@ function _read_close_tag_start(data, pos, start)
         pos += 1
     end
     tok = Token(TOKEN_CLOSE_TAG, @inbounds SubString(data, start, pos - 1))
-    (tok, TokenizerState(pos, _S_CLOSE_TAG, _no_token(data)))
+    (tok, TokenizerState(pos, _M_CLOSE_TAG, _no_token(data)))
 end
 
 function _read_close_tag_end(data, pos)
@@ -294,31 +285,31 @@ function _read_close_tag_end(data, pos)
     _iseof(data, pos) && _err("unterminated close tag", pos)
     _peek(data, pos) == UInt8('>') || _err("expected '>'", pos)
     tok = Token(TOKEN_TAG_CLOSE, @inbounds SubString(data, pos, pos))
-    (tok, TokenizerState(pos + 1, _S_DEFAULT, _no_token(data)))
+    (tok, TokenizerState(pos + 1, _M_DEFAULT, _no_token(data)))
 end
 
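-#-----------------------------------------------------------------------# Attributes (shared by S_TAG and S_XML_DECL)
+#-----------------------------------------------------------------------# Attributes (shared by _M_TAG and _M_XML_DECL)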
-function _read_in_tag(data, pos, state)
+function _read_in_tag(data, pos, mode)
     pos = _skip_whitespace(data, pos)
     _iseof(data, pos) && _err("unterminated tag", pos)
 
     b = _peek(data, pos)
-    is_decl = (state == _S_XML_DECL)
+    is_decl = (mode == _M_XML_DECL)
 
     # Check for end delimiters
     if is_decl
         if b == UInt8('?') && _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8('>')
             tok = Token(TOKEN_XML_DECL_CLOSE, @inbounds SubString(data, pos, pos + 1))
-            return (tok, TokenizerState(pos + 2, _S_DEFAULT, _no_token(data)))
+            return (tok, TokenizerState(pos + 2, _M_DEFAULT, _no_token(data)))
         end
     else
         if b == UInt8('>')
             tok = Token(TOKEN_TAG_CLOSE, @inbounds SubString(data, pos, pos))
-            return (tok, TokenizerState(pos + 1, _S_DEFAULT, _no_token(data)))
+            return (tok, TokenizerState(pos + 1, _M_DEFAULT, _no_token(data)))
         end
         if b == UInt8('/') && _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8('>')
             tok = Token(TOKEN_SELF_CLOSE, @inbounds SubString(data, pos, pos + 1))
-            return (tok, TokenizerState(pos + 2, _S_DEFAULT, _no_token(data)))
+            return (tok, TokenizerState(pos + 2, _M_DEFAULT, _no_token(data)))
         end
     end
 
@@ -336,12 +327,12 @@ function _read_in_tag(data, pos, state)
     pos += 1
     pos = _skip_whitespace(data, pos)
 
-    next_state = is_decl ? _S_XML_DECL_VALUE : _S_TAG_VALUE
+    next_state = is_decl ? _M_XML_DECL_VALUE : _M_TAG_VALUE
     tok = Token(TOKEN_ATTR_NAME, @inbounds SubString(data, name_start, name_end))
     (tok, TokenizerState(pos, next_state, _no_token(data)))
 end
 
-function _read_attr_value(data, pos, state)
+function _read_attr_value(data, pos, mode)
     _iseof(data, pos) && _err("expected attribute value", pos)
 
     q = _peek(data, pos)
@@ -355,7 +346,7 @@ function _read_attr_value(data, pos, state)
     _iseof(data, pos) && _err("unterminated attribute value", start)
     pos += 1  # skip closing quote
 
-    next_state = (state == _S_XML_DECL_VALUE) ? _S_XML_DECL : _S_TAG
+    next_state = (mode == _M_XML_DECL_VALUE) ? _M_XML_DECL : _M_TAG
     tok = Token(TOKEN_ATTR_VALUE, @inbounds SubString(data, start, pos - 1))
     (tok, TokenizerState(pos, next_state, _no_token(data)))
 end
@@ -372,7 +363,7 @@ function _read_comment_body(data, pos)
             pos += 3
             pending = Token(TOKEN_COMMENT_CLOSE, @inbounds SubString(data, close_start, pos - 1))
             tok = Token(TOKEN_COMMENT_CONTENT, @inbounds SubString(data, start, content_end))
-            return (tok, TokenizerState(pos, _S_DEFAULT, pending))
+            return (tok, TokenizerState(pos, _M_DEFAULT, pending))
         end
         pos += 1
     end
@@ -390,7 +381,7 @@ function _read_cdata_body(data, pos)
             pos += 3
             pending = Token(TOKEN_CDATA_CLOSE, @inbounds SubString(data, close_start, pos - 1))
             tok = Token(TOKEN_CDATA_CONTENT, @inbounds SubString(data, start, content_end))
-            return (tok, TokenizerState(pos, _S_DEFAULT, pending))
+            return (tok, TokenizerState(pos, _M_DEFAULT, pending))
         end
         pos += 1
     end
@@ -406,7 +397,7 @@ function _read_pi_body(data, pos)
             pos += 2
             pending = Token(TOKEN_PI_CLOSE, @inbounds SubString(data, close_start, pos - 1))
             tok = Token(TOKEN_PI_CONTENT, @inbounds SubString(data, start, content_end))
-            return (tok, TokenizerState(pos, _S_DEFAULT, pending))
+            return (tok, TokenizerState(pos, _M_DEFAULT, pending))
         end
         pos += 1
     end
@@ -446,7 +437,7 @@ function _read_doctype_body(data, pos)
             pos += 1
             pending = Token(TOKEN_DOCTYPE_CLOSE, @inbounds SubString(data, close_start, pos - 1))
             tok = Token(TOKEN_DOCTYPE_CONTENT, @inbounds SubString(data, start, content_end))
-            return (tok, TokenizerState(pos, _S_DEFAULT, pending))
+            return (tok, TokenizerState(pos, _M_DEFAULT, pending))
         else
             pos += 1
         end
diff --git a/test/Project.toml b/test/Project.toml
index 6e6fc4f..c1703f7 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,6 +1,5 @@
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
 Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
-StringViews = "354b36f9-a18e-4713-926e-db85100087ba"
 Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/test/runtests.jl b/test/runtests.jl
index b14c6e1..148488b 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2729,4 +2729,3 @@ end
 include("test_pugixml.jl")
 include("test_libexpat.jl")
 include("test_w3c.jl")
-include("test_stringviews.jl")
diff --git a/test/test_stringviews.jl b/test/test_stringviews.jl
deleted file mode 100644
index 59d8ef5..0000000
--- a/test/test_stringviews.jl
+++ /dev/null
@@ -1,82 +0,0 @@
-using Test, XML, StringViews
-
-@testset "XMLStringViewsExt" begin
-    mktempdir() do dir
-        tmpfile = joinpath(dir, "simple.xml")
-        xml_content = """<?xml version="1.0"?><root><child id="1">hello</child><child id="2">world</child></root>"""
-        write(tmpfile, xml_content)
-
-        @testset "mmap" begin
-            doc = XML.mmap(tmpfile, LazyNode)
-            @test nodetype(doc) === Document
-            ch = children(doc)
-            @test length(ch) == 2  # declaration + root
-
-            decl = ch[1]
-            @test nodetype(decl) === Declaration
-
-            root = ch[2]
-            @test nodetype(root) === Element
-            @test tag(root) == "root"
-
-            root_children = children(root)
-            @test length(root_children) == 2
-            @test tag(root_children[1]) == "child"
-            @test tag(root_children[2]) == "child"
-            @test value(children(root_children[1])[1]) == "hello"
-            @test value(children(root_children[2])[1]) == "world"
-        end
-
-        @testset "mmap attributes" begin
-            doc = XML.mmap(tmpfile, LazyNode)
-            root = children(doc)[2]
-            child1 = children(root)[1]
-            @test child1["id"] == "1"
-            child2 = children(root)[2]
-            @test child2["id"] == "2"
-        end
-
-        @testset "parse StringView" begin
-            sv = StringView(Vector{UInt8}(xml_content))
-            doc = parse(sv, LazyNode)
-            @test nodetype(doc) === Document
-            root = children(doc)[2]
-            @test tag(root) == "root"
-            @test length(children(root)) == 2
-        end
-
-        @testset "mmap with complex document" begin
-            tmpfile2 = joinpath(dir, "complex.xml")
-            complex_xml = """<?xml version="1.0"?>
-<catalog>
-  <book id="bk101">
-    <title>XML Developer's Guide</title>
-    <price>44.95</price>
-  </book>
-  <!-- a comment -->
-  <![CDATA[some raw data]]>
-</catalog>"""
-            write(tmpfile2, complex_xml)
-
-            doc = XML.mmap(tmpfile2, LazyNode)
-            root = last(c for c in children(doc) if nodetype(c) === Element)
-            @test tag(root) == "catalog"
-
-            ch = children(root)
-            book = first(c for c in ch if nodetype(c) === Element)
-            @test tag(book) == "book"
-            @test book["id"] == "bk101"
-
-            title = first(c for c in children(book) if nodetype(c) === Element && tag(c) == "title")
-            @test simple_value(title) == "XML Developer's Guide"
-
-            comments = [c for c in ch if nodetype(c) === Comment]
-            @test length(comments) == 1
-            @test value(comments[1]) == " a comment "
-
-            cdatas = [c for c in ch if nodetype(c) === CData]
-            @test length(cdatas) == 1
-            @test value(cdatas[1]) == "some raw data"
-        end
-    end
-end

From b670267eb1670e82a32b9bd0518ddbb88070033f Mon Sep 17 00:00:00 2001
From: Josh Day <emailjoshday@gmail.com>
Date: Sun, 8 Mar 2026 15:12:46 -0400
Subject: [PATCH 16/18] Replace printstyled with print in show methods

---
 src/XML.jl      | 32 +++++++++++++-------------------
 src/lazynode.jl | 31 ++++++++++++-------------------
 2 files changed, 25 insertions(+), 38 deletions(-)

diff --git a/src/XML.jl b/src/XML.jl
index 70c788a..67f6903 100644
--- a/src/XML.jl
+++ b/src/XML.jl
@@ -332,46 +332,40 @@ end
 #-----------------------------------------------------------------------------# show (REPL)
 function Base.show(io::IO, o::Node)
     nt = o.nodetype
-    printstyled(io, nt; color=:light_green)
+    print(io, nt)
     if nt === Text
-        printstyled(io, ' ', repr(o.value))
+        print(io, ' ', repr(o.value))
     elseif nt === Element
-        printstyled(io, " <", o.tag; color=:light_cyan)
+        print(io, " <", o.tag)
         if !isnothing(o.attributes)
             for (k, v) in o.attributes
                 print(io, ' ', k, '=', '"', v, '"')
             end
         end
-        printstyled(io, '>'; color=:light_cyan)
+        print(io, '>')
         n = length(children(o))
-        n > 0 && printstyled(io, n == 1 ? " (1 child)" : " ($n children)"; color=:light_black)
+        n > 0 && print(io, n == 1 ? " (1 child)" : " ($n children)")
     elseif nt === DTD
-        printstyled(io, " <!DOCTYPE "; color=:light_cyan)
-        printstyled(io, o.value; color=:light_black)
-        printstyled(io, '>'; color=:light_cyan)
+        print(io, " <!DOCTYPE ", o.value, '>')
     elseif nt === Declaration
-        printstyled(io, " <?xml"; color=:light_cyan)
+        print(io, " <?xml")
         if !isnothing(o.attributes)
             for (k, v) in o.attributes
                 print(io, ' ', k, '=', '"', v, '"')
             end
         end
-        printstyled(io, "?>"; color=:light_cyan)
+        print(io, "?>")
     elseif nt === ProcessingInstruction
-        printstyled(io, " <?", o.tag; color=:light_cyan)
+        print(io, " <?", o.tag)
         !isnothing(o.value) && print(io, ' ', o.value)
-        printstyled(io, "?>"; color=:light_cyan)
+        print(io, "?>")
     elseif nt === Comment
-        printstyled(io, " <!--"; color=:light_cyan)
-        printstyled(io, o.value; color=:light_black)
-        printstyled(io, "-->"; color=:light_cyan)
+        print(io, " <!--", o.value, "-->")
     elseif nt === CData
-        printstyled(io, " <![CDATA["; color=:light_cyan)
-        printstyled(io, o.value; color=:light_black)
-        printstyled(io, "]]>"; color=:light_cyan)
+        print(io, " <![CDATA[", o.value, "]]>")
     elseif nt === Document
         n = length(children(o))
-        n > 0 && printstyled(io, n == 1 ? " (1 child)" : " ($n children)"; color=:light_black)
+        n > 0 && print(io, n == 1 ? " (1 child)" : " ($n children)")
     end
 end
 
diff --git a/src/lazynode.jl b/src/lazynode.jl
index 1f22513..c9b40d2 100644
--- a/src/lazynode.jl
+++ b/src/lazynode.jl
@@ -228,47 +228,40 @@ Base.read(io::IO, ::Type{LazyNode}) = parse(read(io, String), LazyNode)
 #-----------------------------------------------------------------------------# show
 function Base.show(io::IO, n::LazyNode)
     nt = n.nodetype
-    printstyled(io, "Lazy "; color=:light_black)
-    printstyled(io, nt; color=:light_green)
+    print(io, "Lazy ", nt)
     if nt === Text
-        printstyled(io, ' ', repr(value(n)))
+        print(io, ' ', repr(value(n)))
     elseif nt === Element
-        printstyled(io, " <", tag(n); color=:light_cyan)
+        print(io, " <", tag(n))
         attrs = attributes(n)
         if !isnothing(attrs)
             for (k, v) in attrs
                 print(io, ' ', k, '=', '"', v, '"')
             end
         end
-        printstyled(io, '>'; color=:light_cyan)
+        print(io, '>')
     elseif nt === DTD
-        printstyled(io, " <!DOCTYPE "; color=:light_cyan)
-        printstyled(io, value(n); color=:light_black)
-        printstyled(io, '>'; color=:light_cyan)
+        print(io, " <!DOCTYPE ", value(n), '>')
     elseif nt === Declaration
-        printstyled(io, " <?xml"; color=:light_cyan)
+        print(io, " <?xml")
         attrs = attributes(n)
         if !isnothing(attrs)
             for (k, v) in attrs
                 print(io, ' ', k, '=', '"', v, '"')
             end
         end
-        printstyled(io, "?>"; color=:light_cyan)
+        print(io, "?>")
     elseif nt === ProcessingInstruction
-        printstyled(io, " <?", tag(n); color=:light_cyan)
+        print(io, " <?", tag(n))
         v = value(n)
         !isnothing(v) && print(io, ' ', v)
-        printstyled(io, "?>"; color=:light_cyan)
+        print(io, "?>")
     elseif nt === Comment
-        printstyled(io, " <!--"; color=:light_cyan)
-        printstyled(io, value(n); color=:light_black)
-        printstyled(io, "-->"; color=:light_cyan)
+        print(io, " <!--", value(n), "-->")
     elseif nt === CData
-        printstyled(io, " <![CDATA["; color=:light_cyan)
-        printstyled(io, value(n); color=:light_black)
-        printstyled(io, "]]>"; color=:light_cyan)
+        print(io, " <![CDATA[", value(n), "]]>")
     elseif nt === Document
         n_ch = length(children(n))
-        n_ch > 0 && printstyled(io, n_ch == 1 ? " (1 child)" : " ($n_ch children)"; color=:light_black)
+        n_ch > 0 && print(io, n_ch == 1 ? " (1 child)" : " ($n_ch children)")
     end
 end

From 4a728eea43a3815a483d03f13400b6589eae226f Mon Sep 17 00:00:00 2001
From: Josh Day <emailjoshday@gmail.com>
Date: Mon, 9 Mar 2026 10:49:36 -0400
Subject: [PATCH 17/18] Revamp benchmarks and expand test suite

---
 benchmarks/Project.toml          |   1 +
 benchmarks/benchmarks.jl         |  58 ++--
 benchmarks/benchmarks_results.md |  60 ++++
 benchmarks/compare.jl            | 117 ++++----
 benchmarks/compare_results.md    |  54 ++++
 test/runtests.jl                 | 477 +++++++++++++++++++++++++++++++
 6 files changed, 700 insertions(+), 67 deletions(-)
 create mode 100644 benchmarks/benchmarks_results.md
 create mode 100644 benchmarks/compare_results.md

diff --git a/benchmarks/Project.toml b/benchmarks/Project.toml
index 0598016..043988c 100644
--- a/benchmarks/Project.toml
+++ b/benchmarks/Project.toml
@@ -2,6 +2,7 @@
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
+InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 LightXML = "9c8b4983-aa76-5018-a973-4c85ecc9e179"
 UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
 XML = "72c71f33-b9b6-44de-8c94-c961784809e2"
diff --git a/benchmarks/benchmarks.jl b/benchmarks/benchmarks.jl
index 539fce0..16096e8 100644
--- a/benchmarks/benchmarks.jl
+++ b/benchmarks/benchmarks.jl
@@ -5,7 +5,7 @@ using XMLDict: XMLDict
 using LightXML: LightXML
 using BenchmarkTools
 using DataFrames
-using UnicodePlots
+using InteractiveUtils
 
 include("XMarkGenerator.jl")
 using .XMarkGenerator
@@ -115,21 +115,55 @@ end
 @add_benchmark "Collect tags (medium)" "EzXML" ezxml_collect_tags(o.root) setup=(o = EzXML.parsexml(medium_xml))
 @add_benchmark "Collect tags (medium)" "LightXML" lightxml_collect_tags(LightXML.root(o)) setup=(o = LightXML.parse_string(medium_xml)) teardown=(LightXML.free(o))
 
-#-----------------------------------------------------------------------------# Results
-function plot_group(df, kind)
-    g = groupby(df, :kind)
-    haskey(g, (;kind)) || return
-    sub = g[(;kind)]
-    x = map(row -> "$(row.name)", eachrow(sub))
-    y = map(x -> median(x).time / 1e6, sub.bench)
-    display(barplot(x, y, title = "$kind — median time (ms)", border=:none, width=50))
-    println()
-end
+#-----------------------------------------------------------------------------# Write benchmarks_results.md
+# Format a time given in milliseconds with 3 significant digits, e.g. "0.0339 ms".
+_fmt_ms(t) = string(round(t, sigdigits=3), " ms")
 
-println("\n", "="^60)
-println("  BENCHMARK RESULTS")
-println("="^60, "\n")
+# Describe XML.jl's median time (`xml_ms`) relative to another package's (`other_ms`).
+# The percentage is the change of `xml_ms` relative to `other_ms`; ratios between
+# 0.95 and 1.05 are reported as "(~same)".
+function _compare_indicator(xml_ms, other_ms)
+    ratio = xml_ms / other_ms
+    pct = abs(round((ratio - 1) * 100, digits=1))
+    ratio > 1.05 && return "(XML.jl $(pct)% slower)"
+    ratio < 0.95 && return "(XML.jl $(pct)% faster)"
+    return "(~same)"
+end
 
-for kind in unique(df.kind)
-    plot_group(df, kind)
+outfile = joinpath(@__DIR__, "benchmarks_results.md")
+open(outfile, "w") do io
+    println(io, "# XML.jl Benchmarks\n")
+    println(io, "```")
+    # Group once up front; the grouping is the same for every kind.
+    groups = groupby(df, :kind)
+    for kind in unique(df.kind)
+        haskey(groups, (;kind)) || continue
+        sub = groups[(;kind)]
+        println(io, kind)
+        # XML.jl baseline: first row whose name starts with "XML.jl", excluding the "(SS)" variant
+        xml_row = findfirst(r -> startswith(r.name, "XML.jl") && !contains(r.name, "(SS)"), eachrow(sub))
+        xml_ms = isnothing(xml_row) ? nothing : median(sub[xml_row, :bench]).time / 1e6
+        for row in eachrow(sub)
+            ms = median(row.bench).time / 1e6
+            indicator = ""
+            # Only rows for other packages are compared against the XML.jl baseline
+            if !isnothing(xml_ms) && !startswith(row.name, "XML.jl")
+                indicator = "  " * _compare_indicator(xml_ms, ms)
+            end
+            println(io, "\t", rpad(row.name, 16), lpad(_fmt_ms(ms), 12), indicator)
+        end
+        println(io)
+    end
+    println(io, "```")
+
+    println(io, "\n```julia")
+    println(io, "versioninfo()")
+    buf = IOBuffer()
+    InteractiveUtils.versioninfo(buf)
+    for line in eachline(IOBuffer(take!(buf)))
+        println(io, "# ", line)
+    end
+    println(io, "```")
 end
+
+println("Results written to $outfile")
diff --git a/benchmarks/benchmarks_results.md b/benchmarks/benchmarks_results.md
new file mode 100644
index 0000000..d03ae50
--- /dev/null
+++ b/benchmarks/benchmarks_results.md
@@ -0,0 +1,60 @@
+# XML.jl Benchmarks
+
+```
+Parse (small)
+	XML.jl             0.0339 ms
+	XML.jl (SS)        0.0301 ms
+	EzXML              0.0254 ms  (XML.jl 33.5% slower)
+	LightXML           0.0282 ms  (XML.jl 20.1% slower)
+	XMLDict             0.204 ms  (XML.jl 83.4% faster)
+
+Parse (medium)
+	XML.jl              170.0 ms
+	XML.jl (SS)         154.0 ms
+	EzXML                91.0 ms  (XML.jl 87.0% slower)
+	LightXML             92.8 ms  (XML.jl 83.4% slower)
+	XMLDict             623.0 ms  (XML.jl 72.7% faster)
+
+Write (small)
+	XML.jl             0.0179 ms
+	EzXML              0.0107 ms  (XML.jl 68.0% slower)
+	LightXML           0.0926 ms  (XML.jl 80.6% faster)
+
+Write (medium)
+	XML.jl               81.2 ms
+	EzXML                73.2 ms  (XML.jl 11.0% slower)
+	LightXML             55.1 ms  (XML.jl 47.5% slower)
+
+Read file
+	XML.jl              180.0 ms
+	EzXML               129.0 ms  (XML.jl 39.9% slower)
+	LightXML            104.0 ms  (XML.jl 73.4% slower)
+
+Collect tags (small)
+	XML.jl           0.000597 ms
+	EzXML             0.00219 ms  (XML.jl 72.7% faster)
+	LightXML          0.00371 ms  (XML.jl 83.9% faster)
+
+Collect tags (medium)
+	XML.jl               12.2 ms
+	EzXML                28.2 ms  (XML.jl 56.9% faster)
+	LightXML             25.7 ms  (XML.jl 52.6% faster)
+
+```
+
+```julia
+versioninfo()
+# Julia Version 1.12.5
+# Commit 5fe89b8ddc1 (2026-02-09 16:05 UTC)
+# Build Info:
+#   Official https://julialang.org release
+# Platform Info:
+#   OS: macOS (arm64-apple-darwin24.0.0)
+#   CPU: 10 × Apple M1 Pro
+#   WORD_SIZE: 64
+#   LLVM: libLLVM-18.1.7 (ORCJIT, apple-m1)
+#   GC: Built with stock GC
+# Threads: 8 default, 1 interactive, 8 GC (on 8 virtual cores)
+# Environment:
+#   JULIA_NUM_THREADS = auto
+```
diff --git a/benchmarks/compare.jl b/benchmarks/compare.jl
index 979b092..7dca16e 100644
--- a/benchmarks/compare.jl
+++ b/benchmarks/compare.jl
@@ -12,7 +12,7 @@ This script:
 4. Prints a side-by-side comparison
 =#
 
-using BenchmarkTools, Serialization
+using BenchmarkTools, Serialization, InteractiveUtils
 
 BenchmarkTools.DEFAULT_PARAMETERS.seconds = 5
 BenchmarkTools.DEFAULT_PARAMETERS.samples = 10000
@@ -71,18 +71,25 @@ using XML
 
 dev_results = Dict{String, BenchmarkTools.Trial}()
 
+const SSNode = Node{SubString{String}}
+
 dev_small = parse(SMALL_XML, Node)
+dev_small_ss = parse(SMALL_XML, SSNode)
 dev_medium = parse(MEDIUM_XML, Node)
+dev_medium_ss = parse(MEDIUM_XML, SSNode)
 
-dev_results["Parse (small)"] = @benchmark parse($SMALL_XML, Node)
-dev_results["Parse (small, SS)"] = @benchmark parse($SMALL_XML, Node{SubString{String}})
-dev_results["Parse (medium)"] = @benchmark parse($MEDIUM_XML, Node)
-dev_results["Parse (medium, SS)"] = @benchmark parse($MEDIUM_XML, Node{SubString{String}})
+dev_results["Parse (small), String"] = @benchmark parse($SMALL_XML, Node)
+dev_results["Parse (small), SubString"] = @benchmark parse($SMALL_XML, SSNode)
+dev_results["Parse (medium), String"] = @benchmark parse($MEDIUM_XML, Node)
+dev_results["Parse (medium), SubString"] = @benchmark parse($MEDIUM_XML, SSNode)
 dev_results["Write (small)"] = @benchmark XML.write($dev_small)
 dev_results["Write (medium)"] = @benchmark XML.write($dev_medium)
-dev_results["Read file (medium)"] = @benchmark read($MEDIUM_FILE, Node)
-dev_results["Collect tags (small)"] = @benchmark bench_collect_tags($dev_small)
-dev_results["Collect tags (medium)"] = @benchmark bench_collect_tags($dev_medium)
+dev_results["Read file (medium), String"] = @benchmark read($MEDIUM_FILE, Node)
+dev_results["Read file (medium), SubString"] = @benchmark parse(read($MEDIUM_FILE, String), SSNode)
+dev_results["Collect tags (small), String"] = @benchmark bench_collect_tags($dev_small)
+dev_results["Collect tags (small), SubString"] = @benchmark bench_collect_tags($dev_small_ss)
+dev_results["Collect tags (medium), String"] = @benchmark bench_collect_tags($dev_medium)
+dev_results["Collect tags (medium), SubString"] = @benchmark bench_collect_tags($dev_medium_ss)
 
 println(" done")
 
@@ -158,52 +165,60 @@ println(" done")
 # Cleanup worktree
 run(pipeline(`git -C $ROOT worktree remove --force $worktree_dir`, stdout=devnull, stderr=devnull))
 
-#-----------------------------------------------------------------------------# Compare
-println()
-println("-"^60)
-
-all_keys = [
-    "Parse (small)", "Parse (small, SS)",
-    "Parse (medium)", "Parse (medium, SS)",
-    "Write (small)", "Write (medium)",
-    "Read file (medium)",
-    "Collect tags (small)", "Collect tags (medium)",
+#-----------------------------------------------------------------------------# Write compare_results.md
+_fmt_ms(t) = "$(round(t; sigdigits=3)) ms"  # round to 3 significant digits and append " ms"
+
+function _compare_indicator(dev_ms, rel_ms)
+    delta = (dev_ms / rel_ms - 1) * 100  # signed % change of dev relative to release
+    (delta < -5 || delta > 5) || return "(~same)"  # anything within ±5% is reported as unchanged
+    return "($(abs(round(delta, digits=1)))% $(delta < 0 ? "faster" : "slower"))"
+end
+
+groups = [
+    ("Parse (small)",        "Parse (small)",        ["Parse (small), String", "Parse (small), SubString"]),
+    ("Parse (medium)",       "Parse (medium)",       ["Parse (medium), String", "Parse (medium), SubString"]),
+    ("Write (small)",        "Write (small)",        ["Write (small)"]),
+    ("Write (medium)",       "Write (medium)",       ["Write (medium)"]),
+    ("Read file (medium)",   "Read file (medium)",   ["Read file (medium), String", "Read file (medium), SubString"]),
+    ("Collect tags (small)", "Collect tags (small)",  ["Collect tags (small), String", "Collect tags (small), SubString"]),
+    ("Collect tags (medium)","Collect tags (medium)", ["Collect tags (medium), String", "Collect tags (medium), SubString"]),
 ]
 
-for name in all_keys
-    has_dev = haskey(dev_results, name)
-    has_rel = haskey(release_results, name)
-    has_dev || has_rel || continue
-
-    println()
-    println("  $name")
-
-    if has_dev && has_rel
-        dev_med = median(dev_results[name]).time / 1e6
-        rel_med = median(release_results[name]).time / 1e6
-        change = (dev_med / rel_med - 1) * 100
-
-        pct = abs(round(change, digits=1))
-        indicator = if change < -5
-            "$(pct)% faster"
-        elseif change > 5
-            "$(pct)% slower"
-        else
-            "~same"
+outfile = joinpath(@__DIR__, "compare_results.md")
+open(outfile, "w") do io
+    println(io, "# XML.jl Benchmark Comparison: dev vs $RELEASE_TAG\n")
+    println(io, "```")
+    for (title, rel_key, dev_keys) in groups
+        rel_ms = haskey(release_results, rel_key) ? median(release_results[rel_key]).time / 1e6 : nothing
+        any(k -> haskey(dev_results, k), dev_keys) || (isnothing(rel_ms) && continue)
+
+        println(io, title)
+        if !isnothing(rel_ms)
+            println(io, "\t", rpad(RELEASE_TAG, 16), lpad(_fmt_ms(rel_ms), 12))
         end
-
-        lpad_tag = lpad(RELEASE_TAG, 12)
-        lpad_dev = lpad("dev", 12)
-        println("    $lpad_tag  $(lpad(string(round(rel_med, digits=4), " ms"), 12))")
-        println("    $lpad_dev  $(lpad(string(round(dev_med, digits=4), " ms"), 12))  ($indicator)")
-    elseif has_dev
-        dev_med = median(dev_results[name]).time / 1e6
-        lpad_tag = lpad(RELEASE_TAG, 12)
-        lpad_dev = lpad("dev", 12)
-        println("    $lpad_tag  $(lpad("n/a", 12))")
-        println("    $lpad_dev  $(lpad(string(round(dev_med, digits=4), " ms"), 12))")
+        for dk in dev_keys
+            haskey(dev_results, dk) || continue
+            dev_ms = median(dev_results[dk]).time / 1e6
+            label = occursin(", ", dk) ? split(dk, ", "; limit=2)[2] : "dev"
+            ms_str = lpad(_fmt_ms(dev_ms), 12)
+            if isnothing(rel_ms)
+                println(io, "\t", rpad(label, 16), ms_str)
+            else
+                println(io, "\t", rpad(label, 16), ms_str, "  ", _compare_indicator(dev_ms, rel_ms))
+            end
+        end
+        println(io)
+    end
+    println(io, "```")
+
+    println(io, "\n```julia")
+    println(io, "versioninfo()")
+    buf = IOBuffer()
+    InteractiveUtils.versioninfo(buf)
+    for line in eachline(IOBuffer(take!(buf)))
+        println(io, "# ", line)
     end
+    println(io, "```")
 end
 
-println()
-println("="^60)
+println("Results written to $outfile")
diff --git a/benchmarks/compare_results.md b/benchmarks/compare_results.md
new file mode 100644
index 0000000..135035e
--- /dev/null
+++ b/benchmarks/compare_results.md
@@ -0,0 +1,54 @@
+# XML.jl Benchmark Comparison: dev vs v0.3.8
+
+```
+Parse (small)
+	v0.3.8              0.116 ms
+	String             0.0351 ms  (69.7% faster)
+	SubString          0.0309 ms  (73.4% faster)
+
+Parse (medium)
+	v0.3.8              736.0 ms
+	String              170.0 ms  (76.9% faster)
+	SubString           162.0 ms  (77.9% faster)
+
+Write (small)
+	v0.3.8             0.0257 ms
+	dev                0.0212 ms  (17.7% faster)
+
+Write (medium)
+	v0.3.8              154.0 ms
+	dev                  84.6 ms  (44.9% faster)
+
+Read file (medium)
+	v0.3.8              714.0 ms
+	String              177.0 ms  (75.2% faster)
+	SubString           171.0 ms  (76.1% faster)
+
+Collect tags (small)
+	v0.3.8           0.000527 ms
+	String           0.000614 ms  (16.5% slower)
+	SubString         0.00177 ms  (235.1% slower)
+
+Collect tags (medium)
+	v0.3.8               25.0 ms
+	String               10.9 ms  (56.4% faster)
+	SubString            16.0 ms  (36.0% faster)
+
+```
+
+```julia
+versioninfo()
+# Julia Version 1.12.5
+# Commit 5fe89b8ddc1 (2026-02-09 16:05 UTC)
+# Build Info:
+#   Official https://julialang.org release
+# Platform Info:
+#   OS: macOS (arm64-apple-darwin24.0.0)
+#   CPU: 10 × Apple M1 Pro
+#   WORD_SIZE: 64
+#   LLVM: libLLVM-18.1.7 (ORCJIT, apple-m1)
+#   GC: Built with stock GC
+# Threads: 8 default, 1 interactive, 8 GC (on 8 virtual cores)
+# Environment:
+#   JULIA_NUM_THREADS = auto
+```
diff --git a/test/runtests.jl b/test/runtests.jl
index 148488b..c3419ce 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2486,6 +2486,46 @@ end
     end
 end
 
+#==============================================================================#
+#                        DEPRECATIONS / REMOVED API                            #
+#==============================================================================#
+@testset "Deprecations and Removed API" begin  # each call below must raise an ErrorException
+    node = Element("test")
+    node2 = Element("other")  # second operand for the two-argument nodes_equal check
+
+    @testset "XML.next errors" begin
+        @test_throws ErrorException XML.next(node)
+    end
+
+    @testset "XML.prev errors" begin
+        @test_throws ErrorException XML.prev(node)
+    end
+
+    @testset "XML.nodes_equal errors" begin
+        @test_throws ErrorException XML.nodes_equal(node, node2)
+    end
+
+    @testset "XML.escape! errors" begin  # both the one- and two-argument forms are covered
+        @test_throws ErrorException XML.escape!(node)
+        @test_throws ErrorException XML.escape!(node, false)
+    end
+
+    @testset "XML.unescape! errors" begin  # both the one- and two-argument forms are covered
+        @test_throws ErrorException XML.unescape!(node)
+        @test_throws ErrorException XML.unescape!(node, false)
+    end
+
+    @testset "XML.Raw errors" begin  # with and without an argument
+        @test_throws ErrorException XML.Raw()
+        @test_throws ErrorException XML.Raw("arg")
+    end
+
+    @testset "simplevalue binding redirects to simple_value" begin
+        el = Element("x", "val")
+        @test XML.simplevalue(el) == simple_value(el)  # old name must give the same result as the new one
+    end
+end
+
 #==============================================================================#
 #                              XPATH                                           #
 #==============================================================================#
@@ -2726,6 +2766,443 @@ end
     end
 end
 
+#==============================================================================#
+#                              LAZYNODE                                        #
+#==============================================================================#
+@testset "LazyNode" begin
+    @testset "parse and nodetype" begin
+        doc = parse("<root/>", LazyNode)
+        @test nodetype(doc) == Document
+
+        doc2 = parse(LazyNode, "<root/>")
+        @test nodetype(doc2) == Document
+    end
+
+    @testset "read from IO" begin
+        xml = """<?xml version="1.0"?><root>hello</root>"""
+        doc = read(IOBuffer(xml), LazyNode)
+        @test nodetype(doc) == Document
+    end
+
+    @testset "read from file" begin
+        path = joinpath(@__DIR__, "data", "books.xml")
+        isfile(path) && let doc = read(path, LazyNode)  # no `return`: it would abort the outer testset too
+            @test nodetype(doc) == Document
+            @test length(children(doc)) > 0
+        end
+    end
+
+    @testset "Document children" begin
+        xml = """<?xml version="1.0"?><root><child/></root>"""
+        doc = parse(xml, LazyNode)
+        ch = children(doc)
+        @test length(ch) == 2
+        @test nodetype(ch[1]) == Declaration
+        @test nodetype(ch[2]) == Element
+    end
+
+    @testset "Document with all prolog node types" begin
+        xml = """<?xml version="1.0"?><!DOCTYPE root SYSTEM "r.dtd"><!-- comment --><?pi data?><root/>"""
+        doc = parse(xml, LazyNode)
+        ch = children(doc)
+        types = map(nodetype, ch)
+        @test Declaration in types
+        @test DTD in types
+        @test Comment in types
+        @test ProcessingInstruction in types
+        @test Element in types
+    end
+
+    @testset "Element tag" begin
+        doc = parse("<root/>", LazyNode)
+        @test tag(doc[1]) == "root"
+    end
+
+    @testset "tag returns nothing for non-element/PI" begin
+        doc = parse("<root>text</root>", LazyNode)
+        text_node = children(doc[1])[1]
+        @test nodetype(text_node) == Text
+        @test tag(text_node) === nothing
+    end
+
+    @testset "Element attributes" begin
+        doc = parse("""<root a="1" b="2"/>""", LazyNode)
+        attrs = attributes(doc[1])
+        @test attrs isa Dict
+        @test attrs["a"] == "1"
+        @test attrs["b"] == "2"
+    end
+
+    @testset "Element with no attributes" begin
+        doc = parse("<root/>", LazyNode)
+        @test attributes(doc[1]) === nothing
+    end
+
+    @testset "attributes returns nothing for non-element" begin
+        doc = parse("<root>text</root>", LazyNode)
+        @test attributes(children(doc[1])[1]) === nothing
+    end
+
+    @testset "attributes unescape entity references" begin
+        doc = parse("""<x a="a&amp;b"/>""", LazyNode)
+        @test doc[1]["a"] == "a&b"
+    end
+
+    @testset "Declaration attributes" begin
+        doc = parse("""<?xml version="1.0" encoding="UTF-8"?><root/>""", LazyNode)
+        decl = doc[1]
+        @test nodetype(decl) == Declaration
+        attrs = attributes(decl)
+        @test attrs["version"] == "1.0"
+        @test attrs["encoding"] == "UTF-8"
+    end
+
+    @testset "get with default" begin
+        doc = parse("""<x a="1"/>""", LazyNode)
+        el = doc[1]
+        @test get(el, "a", "nope") == "1"
+        @test get(el, "b", "nope") == "nope"
+    end
+
+    @testset "get on non-element returns default" begin
+        doc = parse("<root>text</root>", LazyNode)
+        text_node = children(doc[1])[1]
+        @test get(text_node, "a", "default") == "default"
+    end
+
+    @testset "getindex with string key" begin
+        doc = parse("""<x a="1"/>""", LazyNode)
+        @test doc[1]["a"] == "1"
+        @test_throws KeyError doc[1]["nonexistent"]
+    end
+
+    @testset "haskey" begin
+        doc = parse("""<x a="1"/>""", LazyNode)
+        @test haskey(doc[1], "a") == true
+        @test haskey(doc[1], "b") == false
+    end
+
+    @testset "keys" begin
+        doc = parse("""<x a="1" b="2"/>""", LazyNode)
+        @test keys(doc[1]) == ["a", "b"]
+    end
+
+    @testset "keys on element with no attributes" begin
+        doc = parse("<x/>", LazyNode)
+        @test isempty(keys(doc[1]))
+    end
+
+    @testset "keys on non-element" begin
+        doc = parse("<root>text</root>", LazyNode)
+        @test keys(children(doc[1])[1]) == ()
+    end
+
+    @testset "Text value" begin
+        doc = parse("<root>hello</root>", LazyNode)
+        ch = children(doc[1])
+        @test nodetype(ch[1]) == Text
+        @test value(ch[1]) == "hello"
+    end
+
+    @testset "Text value unescapes entities" begin
+        doc = parse("<root>&amp; &lt; &gt;</root>", LazyNode)
+        @test value(children(doc[1])[1]) == "& < >"
+    end
+
+    @testset "Comment value" begin
+        doc = parse("<root><!-- a comment --></root>", LazyNode)
+        c = children(doc[1])[1]
+        @test nodetype(c) == Comment
+        @test value(c) == " a comment "
+    end
+
+    @testset "CData value" begin
+        doc = parse("<root><![CDATA[raw <data>]]></root>", LazyNode)
+        cd = children(doc[1])[1]
+        @test nodetype(cd) == CData
+        @test value(cd) == "raw <data>"
+    end
+
+    @testset "DTD value" begin
+        doc = parse("""<!DOCTYPE greeting SYSTEM "hello.dtd"><greeting/>""", LazyNode)
+        dtd = doc[1]
+        @test nodetype(dtd) == DTD
+        @test contains(value(dtd), "greeting")
+    end
+
+    @testset "ProcessingInstruction tag and value" begin
+        doc = parse("<?mypi some data?><root/>", LazyNode)
+        pi = doc[1]
+        @test nodetype(pi) == ProcessingInstruction
+        @test tag(pi) == "mypi"
+        @test value(pi) == "some data"
+    end
+
+    @testset "ProcessingInstruction with no content" begin
+        doc = parse("<?target?><root/>", LazyNode)
+        pi = doc[1]
+        @test tag(pi) == "target"
+        @test value(pi) === nothing
+    end
+
+    @testset "value returns nothing for Element/Document" begin
+        doc = parse("<root/>", LazyNode)
+        @test value(doc) === nothing
+        @test value(doc[1]) === nothing
+    end
+
+    @testset "Element children" begin
+        doc = parse("<root><a/><b/><c/></root>", LazyNode)
+        root = doc[1]
+        @test length(children(root)) == 3
+        @test tag(children(root)[1]) == "a"
+        @test tag(children(root)[2]) == "b"
+        @test tag(children(root)[3]) == "c"
+    end
+
+    @testset "self-closing element has no children" begin
+        doc = parse("<root><br/></root>", LazyNode)
+        br = children(doc[1])[1]
+        @test isempty(children(br))
+    end
+
+    @testset "non-element children returns empty tuple" begin
+        doc = parse("<root>text</root>", LazyNode)
+        text_node = children(doc[1])[1]
+        @test children(text_node) == ()
+    end
+
+    @testset "nested elements" begin
+        doc = parse("<a><b><c>deep</c></b></a>", LazyNode)
+        @test tag(doc[1]) == "a"
+        @test tag(doc[1][1]) == "b"
+        @test tag(doc[1][1][1]) == "c"
+        @test simple_value(doc[1][1][1]) == "deep"
+    end
+
+    @testset "mixed content children" begin
+        xml = "<root>text<!-- comment --><![CDATA[cdata]]><?pi data?><child/></root>"
+        doc = parse(xml, LazyNode)
+        ch = children(doc[1])
+        types = map(nodetype, ch)
+        @test Text in types
+        @test Comment in types
+        @test CData in types
+        @test ProcessingInstruction in types
+        @test Element in types
+    end
+
+    @testset "integer indexing" begin
+        doc = parse("<root><a/><b/><c/></root>", LazyNode)
+        @test tag(doc[1][1]) == "a"
+        @test tag(doc[1][2]) == "b"
+        @test tag(doc[1][3]) == "c"
+    end
+
+    @testset "colon indexing" begin
+        doc = parse("<root><a/><b/></root>", LazyNode)
+        all = doc[1][:]
+        @test length(all) == 2
+    end
+
+    @testset "lastindex" begin
+        doc = parse("<root><a/><b/><c/></root>", LazyNode)
+        @test tag(doc[1][end]) == "c"
+    end
+
+    @testset "only" begin
+        doc = parse("<root><only/></root>", LazyNode)
+        @test tag(only(doc[1])) == "only"
+    end
+
+    @testset "length" begin
+        doc = parse("<root><a/><b/><c/></root>", LazyNode)
+        @test length(doc[1]) == 3
+    end
+
+    @testset "is_simple" begin
+        doc = parse("<root><simple>text</simple><complex><child/></complex></root>", LazyNode)
+        simple = children(doc[1])[1]
+        complex = children(doc[1])[2]
+        @test is_simple(simple)
+        @test !is_simple(complex)
+    end
+
+    @testset "is_simple with attributes" begin
+        doc = parse("""<root><x a="1">text</x></root>""", LazyNode)
+        @test !is_simple(children(doc[1])[1])
+    end
+
+    @testset "is_simple with CData child" begin
+        doc = parse("<root><x><![CDATA[data]]></x></root>", LazyNode)
+        @test is_simple(children(doc[1])[1])
+    end
+
+    @testset "is_simple returns false for non-element" begin
+        doc = parse("<root>text</root>", LazyNode)
+        @test !is_simple(children(doc[1])[1])
+    end
+
+    @testset "simple_value" begin
+        doc = parse("<root><x>hello</x></root>", LazyNode)
+        @test simple_value(children(doc[1])[1]) == "hello"
+    end
+
+    @testset "simple_value errors on non-simple" begin
+        doc = parse("<root><x><y/></x></root>", LazyNode)
+        @test_throws ErrorException simple_value(children(doc[1])[1])
+    end
+
+    @testset "simple_value errors on non-element" begin
+        doc = parse("<root>text</root>", LazyNode)
+        @test_throws ErrorException simple_value(children(doc[1])[1])
+    end
+
+    @testset "show Document" begin
+        doc = parse("<root><a/></root>", LazyNode)
+        s = sprint(show, doc)
+        @test contains(s, "Lazy")
+        @test contains(s, "Document")
+        @test contains(s, "1 child")
+    end
+
+    @testset "show Document multiple children" begin
+        doc = parse("<!-- c --><root/>", LazyNode)
+        s = sprint(show, doc)
+        @test contains(s, "2 children")
+    end
+
+    @testset "show Element" begin
+        doc = parse("""<root a="1"/>""", LazyNode)
+        s = sprint(show, doc[1])
+        @test contains(s, "Lazy Element")
+        @test contains(s, "<root")
+    end
+
+    @testset "show Text" begin
+        doc = parse("<root>hello</root>", LazyNode)
+        s = sprint(show, children(doc[1])[1])
+        @test contains(s, "Lazy Text")
+        @test contains(s, "hello")
+    end
+
+    @testset "show Comment" begin
+        doc = parse("<root><!-- test --></root>", LazyNode)
+        s = sprint(show, children(doc[1])[1])
+        @test contains(s, "Lazy Comment")
+        @test contains(s, "<!--")
+    end
+
+    @testset "show CData" begin
+        doc = parse("<root><![CDATA[data]]></root>", LazyNode)
+        s = sprint(show, children(doc[1])[1])
+        @test contains(s, "Lazy CData")
+        @test contains(s, "<![CDATA[")
+    end
+
+    @testset "show DTD" begin
+        doc = parse("<!DOCTYPE html><html/>", LazyNode)
+        s = sprint(show, doc[1])
+        @test contains(s, "Lazy DTD")
+        @test contains(s, "<!DOCTYPE")
+    end
+
+    @testset "show Declaration" begin
+        doc = parse("""<?xml version="1.0"?><root/>""", LazyNode)
+        s = sprint(show, doc[1])
+        @test contains(s, "Lazy Declaration")
+        @test contains(s, "<?xml")
+    end
+
+    @testset "show ProcessingInstruction" begin
+        doc = parse("<?target data?><root/>", LazyNode)
+        s = sprint(show, doc[1])
+        @test contains(s, "Lazy ProcessingInstruction")
+        @test contains(s, "<?target")
+    end
+
+    @testset "show ProcessingInstruction without content" begin
+        doc = parse("<?target?><root/>", LazyNode)
+        s = sprint(show, doc[1])
+        @test contains(s, "<?target?>")
+    end
+
+    @testset "LazyNode agrees with Node on books.xml" begin
+        path = joinpath(@__DIR__, "data", "books.xml")
+        if isfile(path)  # guard with `if`: a bare `return` would also abort the enclosing testset
+            eager = read(path, Node)
+            lazy = read(path, LazyNode)
+
+            # Same top-level structure
+            eager_ch = children(eager)
+            lazy_ch = children(lazy)
+            @test length(eager_ch) == length(lazy_ch)
+            @test map(nodetype, eager_ch) == map(nodetype, lazy_ch)
+
+            # Find root element in both
+            eager_root = first(filter(x -> nodetype(x) == Element, eager_ch))
+            lazy_root = first(filter(x -> nodetype(x) == Element, lazy_ch))
+            @test tag(eager_root) == tag(lazy_root)
+
+            # Same number of book elements
+            eager_books = filter(x -> nodetype(x) == Element, children(eager_root))
+            lazy_books = filter(x -> nodetype(x) == Element, children(lazy_root))
+            @test length(eager_books) == length(lazy_books)
+
+            # First book has same attributes and child values
+            eb1 = eager_books[1]
+            lb1 = lazy_books[1]
+            @test eb1["id"] == lb1["id"]
+
+            eager_author = first(filter(x -> nodetype(x) == Element && tag(x) == "author", children(eb1)))
+            lazy_author = first(filter(x -> nodetype(x) == Element && tag(x) == "author", children(lb1)))
+            @test simple_value(eager_author) == simple_value(lazy_author)
+        end
+    end
+
+    @testset "complex document" begin
+        xml = """<?xml version="1.0"?>
+<!DOCTYPE root SYSTEM "root.dtd">
+<!-- comment -->
+<?pi data?>
+<root attr="val">
+    text content
+    <child>inner</child>
+    <![CDATA[cdata content]]>
+    <!-- inner comment -->
+    <?inner-pi inner data?>
+    <empty/>
+</root>"""
+        doc = parse(xml, LazyNode)
+        @test nodetype(doc) == Document
+
+        typed = filter(x -> nodetype(x) != Text, children(doc))
+        @test nodetype(typed[1]) == Declaration
+        @test nodetype(typed[2]) == DTD
+        @test nodetype(typed[3]) == Comment
+        @test nodetype(typed[4]) == ProcessingInstruction
+        @test nodetype(typed[5]) == Element
+
+        root = typed[5]
+        @test tag(root) == "root"
+        @test root["attr"] == "val"
+
+        inner = children(root)
+        inner_types = map(nodetype, inner)
+        @test Text in inner_types
+        @test Element in inner_types
+        @test CData in inner_types
+        @test Comment in inner_types
+        @test ProcessingInstruction in inner_types
+
+        child_els = filter(x -> nodetype(x) == Element, inner)
+        @test length(child_els) == 2
+        @test tag(child_els[1]) == "child"
+        @test simple_value(child_els[1]) == "inner"
+        @test tag(child_els[2]) == "empty"
+    end
+end
+
 include("test_pugixml.jl")
 include("test_libexpat.jl")
 include("test_w3c.jl")

From 2f71f9a56a59fcb0e8638cd0ac38b8df17c4c0b7 Mon Sep 17 00:00:00 2001
From: Josh Day <emailjoshday@gmail.com>
Date: Thu, 2 Apr 2026 16:49:15 -0400
Subject: [PATCH 18/18] Add Attributes type and performance optimizations

---
 benchmarks/dict_benchmarks.jl | 71 +++++++++++++++++++++++++++++++++++
 src/XML.jl                    | 66 ++++++++++++++++++++++++--------
 src/XMLTokenizer.jl           | 40 ++++++++++----------
 src/lazynode.jl               |  2 +-
 src/xpath.jl                  |  7 +++-
 test/runtests.jl              |  4 +-
 6 files changed, 150 insertions(+), 40 deletions(-)
 create mode 100644 benchmarks/dict_benchmarks.jl

diff --git a/benchmarks/dict_benchmarks.jl b/benchmarks/dict_benchmarks.jl
new file mode 100644
index 0000000..7dd90a3
--- /dev/null
+++ b/benchmarks/dict_benchmarks.jl
@@ -0,0 +1,71 @@
+using XML
+using BenchmarkTools
+
+BenchmarkTools.DEFAULT_PARAMETERS.seconds = 5
+
+#-----------------------------------------------------------------------------# Setup
+sizes = [2, 5, 10, 20]
+
+function make_xml(n::Int)  # self-closing <root> element carrying attr1..attrN
+    attr_text = sprint(io -> foreach(i -> print(io, " attr", i, "=\"value", i, "\""), 1:n))
+    return string("<root", attr_text, "/>")
+end
+
+function make_pairs(n::Int)  # "attrK" => "valueK" for K in 1:n, mirroring make_xml's attributes
+    return collect(Pair{String,String}, ("attr$i" => "value$i" for i in 1:n))
+end
+
+pt(t) = BenchmarkTools.prettytime(t)  # shorthand for formatting the median times printed in the tables
+
+function printrow(n, op, t_dict, t_attr)
+    # pct > 0 means Attributes beat Dict; an exact tie prints "same" rather than "-0.0% slower".
+    pct = round(100 * (t_dict - t_attr) / t_dict, digits=1)
+    label = pct > 0 ? "$(pct)% faster" : pct < 0 ? "$(-pct)% slower" : "same"
+    println(rpad("$n attrs", 10), " | ", rpad(op, 22), " | ", rpad("Dict $(pt(t_dict))", 22),
+            " | ", rpad("Attributes $(pt(t_attr))", 26), " | ", label)
+end
+
+#-----------------------------------------------------------------------------# Benchmarks
+println("=" ^ 110)
+println("  Attributes vs Dict Benchmarks")
+println("=" ^ 110)
+println(rpad("Size", 10), " | ", rpad("Operation", 22), " | ",
+        rpad("Dict", 22), " | ", rpad("Attributes", 26), " | Change")
+println("-" ^ 110)
+
+for n in sizes
+    pairs = make_pairs(n)  # the same pairs feed both containers
+    d = Dict(pairs)
+    a = XML.Attributes(pairs)
+    key_mid = "attr$(n ÷ 2 + 1)"
+    key_last = "attr$n"
+
+    tests = [  # (operation label, Dict benchmark thunk, Attributes benchmark thunk)
+        ("construct",       () -> @benchmark(Dict($pairs)),               () -> @benchmark(XML.Attributes($pairs))),
+        ("getindex [mid]",  () -> @benchmark($d[$key_mid]),               () -> @benchmark($a[$key_mid])),
+        ("getindex [last]", () -> @benchmark($d[$key_last]),              () -> @benchmark($a[$key_last])),
+        ("get [miss]",      () -> @benchmark(get($d, "nope", nothing)),   () -> @benchmark(get($a, "nope", nothing))),
+        ("haskey [hit]",    () -> @benchmark(haskey($d, $key_mid)),       () -> @benchmark(haskey($a, $key_mid))),
+        ("keys",            () -> @benchmark(collect(keys($d))),          () -> @benchmark(keys($a))),
+        ("iterate",         () -> @benchmark(sum(length(v) for (_,v) in $d)), () -> @benchmark(sum(length(v) for (_,v) in $a))),
+    ]
+
+    for (op, bench_dict, bench_attr) in tests  # each thunk runs its benchmark only when called here
+        t_dict = median(bench_dict()).time
+        t_attr = median(bench_attr()).time
+        printrow(n, op, t_dict, t_attr)
+    end
+    println("-" ^ 110)
+end
+
+#-----------------------------------------------------------------------------# End-to-end: attributes() call on parsed Node
+println()
+println(rpad("Size", 10), " | ", rpad("Operation", 22), " | Time")
+println("-" ^ 50)
+for n in sizes
+    doc = parse(make_xml(n), Node)
+    el = doc[1]  # make_xml emits a lone <root .../>, so the first child is that element
+    t = median(@benchmark(attributes($el))).time  # median time of the attributes(el) call
+    println(rpad("$n attrs", 10), " | ", rpad("attributes(node)", 22), " | ", pt(t))
+end
+println()
diff --git a/src/XML.jl b/src/XML.jl
index 67f6903..f36905f 100644
--- a/src/XML.jl
+++ b/src/XML.jl
@@ -1,7 +1,7 @@
 module XML
 
 export
-    Node, LazyNode, NodeType,
+    Node, LazyNode, NodeType, Attributes,
     CData, Comment, Declaration, Document, DTD, Element, ProcessingInstruction, Text,
     nodetype, tag, attributes, value, children,
     is_simple, simple_value,
@@ -53,7 +53,8 @@ exactly once (no double-unescaping).
 """
 function unescape(x::AbstractString)
     occursin('&', x) || return string(x)
-    s = replace(string(x), r"&#[xX]?[0-9a-fA-F]+;" => _unescape_charref)
+    s = string(x)
+    occursin("&#", s) && (s = replace(s, r"&#[xX]?[0-9a-fA-F]+;" => _unescape_charref))
     replace(s, "&lt;" => "<", "&gt;" => ">", "&apos;" => "'", "&quot;" => "\"", "&amp;" => "&")
 end
 
@@ -82,6 +83,42 @@ NodeTypes can be used to construct XML.Nodes:
 """
 @enum NodeType::UInt8 CData Comment Declaration Document DTD Element ProcessingInstruction Text
 
+#-----------------------------------------------------------------------------# Attributes
+"""
+    Attributes{S} <: AbstractDict{S, S}
+
+An insertion-ordered dictionary of XML attributes backed by a `Vector{Pair{S, S}}`.
+Returned by [`attributes`](@ref).  No mutating methods (`setindex!`, `delete!`, ...)
+are defined alongside it.
+
+Iteration, `keys`, and `values` follow the order of the backing vector.  Lookups
+(`getindex`, `get`, `haskey`) scan the entries linearly and return the first match;
+a missing key yields the default for `get` and a `KeyError` for `getindex`, as with `Dict`.
+"""
+struct Attributes{S} <: AbstractDict{S, S}
+    entries::Vector{Pair{S, S}}
+end
+
+Base.length(a::Attributes) = length(a.entries)
+Base.iterate(a::Attributes, state...) = iterate(a.entries, state...)
+
+# Lookup primitive: the first value stored under `key`, else the result of calling `f()`.
+# Also provides the `get(f, attrs, key)` / `do`-block form that `Dict` supports.
+function Base.get(f::Base.Callable, a::Attributes, key)
+    for (k, v) in a.entries
+        isequal(k, key) && return v
+    end
+    return f()
+end
+
+# The remaining lookups are thin wrappers over the primitive above.
+Base.get(a::Attributes, key, default) = get(() -> default, a, key)
+Base.getindex(a::Attributes, key) = get(() -> throw(KeyError(key)), a, key)
+Base.haskey(a::Attributes, key) = any(p -> isequal(first(p), key), a.entries)
+
+Base.keys(a::Attributes) = first.(a.entries)
+Base.values(a::Attributes) = last.(a.entries)
+
 #-----------------------------------------------------------------------------# Node
 struct Node{S}
     nodetype::NodeType
@@ -116,17 +153,17 @@ nodetype(o::Node) = o.nodetype
 tag(o::Node) = o.tag
 
 """
-    attributes(node::Node) -> Union{Nothing, Dict{String, String}}
+    attributes(node::Node) -> Union{Nothing, Attributes{String}}
 
-Return the attributes of an `Element` or `Declaration` node as a `Dict`, or `nothing` if the
-node has no attributes.
+Return the attributes of an `Element` or `Declaration` node as an [`Attributes`](@ref) dict,
+or `nothing` if the node has no attributes.
 
 !!! note "Changed in v0.4"
     In previous versions, `attributes` returned an `OrderedDict` from OrderedCollections.jl.
-    It now returns a standard `Dict`.  Attribute order is preserved internally but not exposed
-    by this function.  Use `node["key"]` for key-based access and `keys(node)` for ordered keys.
+    It now returns an [`Attributes`](@ref), an ordered `AbstractDict` backed by a
+    `Vector{Pair}`.
 """
-attributes(o::Node) = isnothing(o.attributes) ? nothing : Dict(o.attributes)
+attributes(o::Node) = isnothing(o.attributes) ? nothing : Attributes(o.attributes)
 
 value(o::Node) = o.value
 children(o::Node) = something(o.children, ())
@@ -492,9 +529,11 @@ end
 _to(::Type{String}, s::AbstractString) = String(s)
 _to(::Type{SubString{String}}, s::SubString{String}) = s
 
+_nothingify(v::Vector) = ifelse(isempty(v), nothing, v)  # an empty vector becomes `nothing`
+
 function _parse(xml::String, ::Type{S}, convert_text::F) where {S, F}
     tags = S[]
-    attrs_stack = Vector{Union{Nothing, Vector{Pair{S,S}}}}()
+    attrs_stack = Vector{Pair{S,S}}[]
     children_stack = Vector{Vector{Node{S}}}()
     push!(children_stack, Node{S}[])
 
@@ -512,14 +551,14 @@ function _parse(xml::String, ::Type{S}, convert_text::F) where {S, F}
 
         elseif k === TOKEN_OPEN_TAG
             push!(tags, _to(S, tag_name(token)))
-            push!(attrs_stack, nothing)
+            push!(attrs_stack, Pair{S,S}[])
             push!(children_stack, Node{S}[])
 
         elseif k === TOKEN_SELF_CLOSE
             t = pop!(tags)
             a = pop!(attrs_stack)
             pop!(children_stack)
-            push!(last(children_stack), Node{S}(Element, t, a, nothing, nothing))
+            push!(last(children_stack), Node{S}(Element, t, _nothingify(a), nothing, nothing))
 
         elseif k === TOKEN_TAG_CLOSE
             in_close_tag && (in_close_tag = false)
@@ -531,7 +570,7 @@ function _parse(xml::String, ::Type{S}, convert_text::F) where {S, F}
             t == close_name || error("Mismatched tags: expected </$t>, got </$close_name>.")
             a = pop!(attrs_stack)
             c = pop!(children_stack)
-            push!(last(children_stack), Node{S}(Element, t, a, nothing, isempty(c) ? nothing : c))
+            push!(last(children_stack), Node{S}(Element, t, _nothingify(a), nothing, isempty(c) ? nothing : c))
             in_close_tag = true
 
         elseif k === TOKEN_ATTR_NAME
@@ -544,9 +583,6 @@ function _parse(xml::String, ::Type{S}, convert_text::F) where {S, F}
                 any(p -> first(p) == name, decl_attrs) && error("Duplicate attribute: $name")
                 push!(decl_attrs, name => val)
             elseif !isempty(attrs_stack)
-                if isnothing(last(attrs_stack))
-                    attrs_stack[end] = Pair{S,S}[]
-                end
                 any(p -> first(p) == name, last(attrs_stack)) && error("Duplicate attribute: $name")
                 push!(last(attrs_stack), name => val)
             end
diff --git a/src/XMLTokenizer.jl b/src/XMLTokenizer.jl
index 9b05ef5..eee117b 100644
--- a/src/XMLTokenizer.jl
+++ b/src/XMLTokenizer.jl
@@ -138,16 +138,16 @@ end
 end
 
 @inline function _skip_whitespace(data, pos)
-    while !_iseof(data, pos) && _is_whitespace(_peek(data, pos))
+    @inbounds while !_iseof(data, pos) && _is_whitespace(_peek(data, pos))
         pos += 1
     end
     pos
 end
 
 function _skip_quoted(data, pos)
-    q = _peek(data, pos)
+    q = @inbounds _peek(data, pos)
     pos += 1
-    while !_iseof(data, pos)
+    @inbounds while !_iseof(data, pos)
         _peek(data, pos) == q && return pos + 1
         pos += 1
     end
@@ -187,7 +187,7 @@ end
 #-----------------------------------------------------------------------# S_DEFAULT tokens
 function _read_text(data, pos)
     start = pos
-    while !_iseof(data, pos) && _peek(data, pos) != UInt8('<')
+    @inbounds while !_iseof(data, pos) && _peek(data, pos) != UInt8('<')
         pos += 1
     end
     tok = Token(TOKEN_TEXT, @inbounds SubString(data, start, prevind(data, pos)))
@@ -235,7 +235,7 @@ function _read_bang(data, pos, start)
     end
 
     # <!DOCTYPE ...> or other <! declaration
-    while !_iseof(data, pos) && _is_name_byte(_peek(data, pos))
+    @inbounds while !_iseof(data, pos) && _is_name_byte(_peek(data, pos))
         pos += 1
     end
     tok = Token(TOKEN_DOCTYPE_OPEN, @inbounds SubString(data, start, pos - 1))
@@ -245,7 +245,7 @@ end
 #-----------------------------------------------------------------------# <? (PI / XML declaration)
 function _read_pi_start(data, pos, start)
     name_start = pos
-    while !_iseof(data, pos) && _is_name_byte(_peek(data, pos))
+    @inbounds while !_iseof(data, pos) && _is_name_byte(_peek(data, pos))
         pos += 1
     end
 
@@ -265,7 +265,7 @@ end
 
 #-----------------------------------------------------------------------# Tags
 function _read_open_tag_start(data, pos, start)
-    while !_iseof(data, pos) && _is_name_byte(_peek(data, pos))
+    @inbounds while !_iseof(data, pos) && _is_name_byte(_peek(data, pos))
         pos += 1
     end
     tok = Token(TOKEN_OPEN_TAG, @inbounds SubString(data, start, pos - 1))
@@ -273,7 +273,7 @@ function _read_open_tag_start(data, pos, start)
 end
 
 function _read_close_tag_start(data, pos, start)
-    while !_iseof(data, pos) && _is_name_byte(_peek(data, pos))
+    @inbounds while !_iseof(data, pos) && _is_name_byte(_peek(data, pos))
         pos += 1
     end
     tok = Token(TOKEN_CLOSE_TAG, @inbounds SubString(data, start, pos - 1))
@@ -315,7 +315,7 @@ function _read_in_tag(data, pos, mode)
 
     # Attribute name
     name_start = pos
-    while !_iseof(data, pos) && _is_name_byte(_peek(data, pos))
+    @inbounds while !_iseof(data, pos) && _is_name_byte(_peek(data, pos))
         pos += 1
     end
     name_end = pos - 1
@@ -340,7 +340,7 @@ function _read_attr_value(data, pos, mode)
 
     start = pos
     pos += 1  # skip opening quote
-    while !_iseof(data, pos) && _peek(data, pos) != q
+    @inbounds while !_iseof(data, pos) && _peek(data, pos) != q
         pos += 1
     end
     _iseof(data, pos) && _err("unterminated attribute value", start)
@@ -354,15 +354,15 @@ end
 #-----------------------------------------------------------------------# Content bodies (comment, CDATA, PI, DOCTYPE)
 function _read_comment_body(data, pos)
     start = pos
-    while !_iseof(data, pos)
+    @inbounds while !_iseof(data, pos)
         if _peek(data, pos) == UInt8('-') &&
            _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8('-') &&
            _canpeek(data, pos, 2) && _peek(data, pos + 2) == UInt8('>')
             content_end = prevind(data, pos)
             close_start = pos
             pos += 3
-            pending = Token(TOKEN_COMMENT_CLOSE, @inbounds SubString(data, close_start, pos - 1))
-            tok = Token(TOKEN_COMMENT_CONTENT, @inbounds SubString(data, start, content_end))
+            pending = Token(TOKEN_COMMENT_CLOSE, SubString(data, close_start, pos - 1))
+            tok = Token(TOKEN_COMMENT_CONTENT, SubString(data, start, content_end))
             return (tok, TokenizerState(pos, _M_DEFAULT, pending))
         end
         pos += 1
@@ -372,15 +372,15 @@ end
 
 function _read_cdata_body(data, pos)
     start = pos
-    while !_iseof(data, pos)
+    @inbounds while !_iseof(data, pos)
         if _peek(data, pos) == UInt8(']') &&
            _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8(']') &&
            _canpeek(data, pos, 2) && _peek(data, pos + 2) == UInt8('>')
             content_end = prevind(data, pos)
             close_start = pos
             pos += 3
-            pending = Token(TOKEN_CDATA_CLOSE, @inbounds SubString(data, close_start, pos - 1))
-            tok = Token(TOKEN_CDATA_CONTENT, @inbounds SubString(data, start, content_end))
+            pending = Token(TOKEN_CDATA_CLOSE, SubString(data, close_start, pos - 1))
+            tok = Token(TOKEN_CDATA_CONTENT, SubString(data, start, content_end))
             return (tok, TokenizerState(pos, _M_DEFAULT, pending))
         end
         pos += 1
@@ -390,13 +390,13 @@ end
 
 function _read_pi_body(data, pos)
     start = pos
-    while !_iseof(data, pos)
+    @inbounds while !_iseof(data, pos)
         if _peek(data, pos) == UInt8('?') && _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8('>')
             content_end = prevind(data, pos)
             close_start = pos
             pos += 2
-            pending = Token(TOKEN_PI_CLOSE, @inbounds SubString(data, close_start, pos - 1))
-            tok = Token(TOKEN_PI_CONTENT, @inbounds SubString(data, start, content_end))
+            pending = Token(TOKEN_PI_CLOSE, SubString(data, close_start, pos - 1))
+            tok = Token(TOKEN_PI_CONTENT, SubString(data, start, content_end))
             return (tok, TokenizerState(pos, _M_DEFAULT, pending))
         end
         pos += 1
@@ -407,7 +407,7 @@ end
 function _read_doctype_body(data, pos)
     start = pos
     depth = 0
-    while !_iseof(data, pos)
+    @inbounds while !_iseof(data, pos)
         b = _peek(data, pos)
         if b == UInt8('-') && _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8('-') &&
                 pos >= 2 &&
diff --git a/src/lazynode.jl b/src/lazynode.jl
index c9b40d2..2f7fea5 100644
--- a/src/lazynode.jl
+++ b/src/lazynode.jl
@@ -78,7 +78,7 @@ function attributes(n::LazyNode)
         result === nothing && break
         push!(attrs, name => unescape(attr_value(result[1])))
     end
-    isempty(attrs) ? nothing : Dict(attrs)
+    isempty(attrs) ? nothing : Attributes(attrs)
 end
 
 function Base.get(n::LazyNode, key::AbstractString, default)
diff --git a/src/xpath.jl b/src/xpath.jl
index b0a9725..e56b38d 100644
--- a/src/xpath.jl
+++ b/src/xpath.jl
@@ -118,6 +118,9 @@ end
 
 #-----------------------------------------------------------------------------# Predicate evaluation
 
+const _RE_ATTR_PRED = r"^@([A-Za-z_:][\w.\-:]*)$"
+const _RE_ATTR_VAL_PRED = r"^@([A-Za-z_:][\w.\-:]*)\s*=\s*['\"]([^'\"]*)['\"]$"
+
 function _eval_predicate(predicate::AbstractString, nodes::Vector{Node{S}}, root::Node{S}) where S
     s = strip(predicate)
 
@@ -135,14 +138,14 @@ function _eval_predicate(predicate::AbstractString, nodes::Vector{Node{S}}, root
     end
 
     # [@attr] — has attribute
-    m = match(r"^@([A-Za-z_:][\w.\-:]*)$", s)
+    m = match(_RE_ATTR_PRED, s)
     if !isnothing(m)
         attr_name = m.captures[1]
         return filter(n -> n.nodetype === Element && haskey(n, attr_name), nodes)
     end
 
     # [@attr='value'] or [@attr="value"]
-    m = match(r"^@([A-Za-z_:][\w.\-:]*)\s*=\s*['\"]([^'\"]*)['\"]$", s)
+    m = match(_RE_ATTR_VAL_PRED, s)
     if !isnothing(m)
         attr_name = m.captures[1]
         attr_val = m.captures[2]
diff --git a/test/runtests.jl b/test/runtests.jl
index c3419ce..49632a0 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -442,7 +442,7 @@ end
     @testset "multiple attributes accessible via attributes()" begin
         doc = parse("""<x first="1" second="2" third="3"/>""", Node)
         attrs = attributes(doc[1])
-        @test attrs isa Dict
+        @test attrs isa Attributes
         @test attrs["first"] == "1"
         @test attrs["second"] == "2"
         @test attrs["third"] == "3"
@@ -2828,7 +2828,7 @@ end
     @testset "Element attributes" begin
         doc = parse("""<root a="1" b="2"/>""", LazyNode)
         attrs = attributes(doc[1])
-        @test attrs isa Dict
+        @test attrs isa Attributes
         @test attrs["a"] == "1"
         @test attrs["b"] == "2"
     end