diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 5e073ac..b52ab78 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -26,7 +26,7 @@ jobs:
- os: macOS-latest
arch: x86
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v2
with:
version: ${{ matrix.version }}
@@ -41,9 +41,13 @@ jobs:
${{ runner.os }}-test-${{ env.cache-name }}-
${{ runner.os }}-test-
${{ runner.os }}-
+ - uses: actions/cache@v4
+ with:
+ path: test/data/w3c
+ key: w3c-xmlconf-v20130923
- uses: julia-actions/julia-buildpkg@v1
- uses: julia-actions/julia-runtest@v1
- uses: julia-actions/julia-processcoverage@v1
- - uses: codecov/codecov-action@v1
+ - uses: codecov/codecov-action@v5
with:
- file: lcov.info
+ files: lcov.info
diff --git a/.gitignore b/.gitignore
index b000475..929dfc2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,7 @@
*Manifest.toml
-*generated_xsd.jl
-*.xml
*.gz
+*.tar
*.DS_Store
+*.claude
+test/data/w3c/
+benchmarks/data/
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..3f00ed1
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,157 @@
+# Changelog
+
+All notable changes to XML.jl will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+
+### Added
+- New streaming tokenizer (`XMLTokenizer` module) for fine-grained XML token iteration.
+- XPath support via `xpath(node, path)`.
+- `test/test_libxml2_testcases.jl`: 243 test cases borrowed from the [libxml2](https://github.com/GNOME/libxml2) test suite covering CDATA, comments, processing instructions, attributes, namespaces, DTD internal subsets, entity references, whitespace handling, Unicode, error cases, and real-world document patterns.
+
+### Fixed
+- **Tokenizer: multi-byte UTF-8 in attribute values** — Parsing attribute values containing multi-byte UTF-8 characters (e.g., ` `) could produce a `StringIndexError` because `attr_value()` used byte arithmetic (`ncodeunits - 1`) instead of `prevind` to strip quotes. The same issue existed in `_read_attr_value!`.
+- **Tokenizer: quotes inside DTD comments** — A `"` or `'` character inside a `<!-- ... -->` comment within a DTD internal subset caused the tokenizer to misinterpret it as a quoted string delimiter, leading to an "Unterminated quoted string" error. The DOCTYPE body parser now correctly skips comment content.
+
+## [0.3.8]
+
+### Fixed
+- `XML.write` now respects `xml:space="preserve"` and suppresses indentation for elements with this attribute ([#49]).
+
+## [0.3.7]
+
+### Fixed
+- Resolved remaining issues from [#45] and fixed [#46] (whitespace preservation edge cases) ([#47]).
+
+## [0.3.6]
+
+### Added
+- `XML.write` respects `xml:space="preserve"` on elements, suppressing automatic indentation ([#45]).
+
+### Fixed
+- `String` type ambiguity on Julia nightly resolved ([#38]).
+
+## [0.3.5]
+
+### Fixed
+- `depth` and `parent` functions corrected to work properly with the DOM tree API ([#37]).
+- `escape` updated to no longer be idempotent — every `&` is now escaped, matching spec behavior ([#32], addressing [#31]).
+- `pushfirst!` support added for `Node` children ([#29]).
+
+## [0.3.4]
+
+### Fixed
+- Fixed [#26].
+- CI updated to use `julia-actions/cache@v4` and `lts` Julia version.
+
+## [0.3.3]
+
+### Added
+- `h` constructor for concise element creation (e.g., `h.div("hello"; class="main")`).
+
+### Fixed
+- Path definition error in README example ([#20]).
+
+## [0.3.2]
+
+### Fixed
+- Minor typos.
+
+## [0.3.1]
+
+### Added
+- Julia 1.6 compatibility ([#16]).
+
+### Changed
+- Smarter escaping logic.
+
+## [0.3.0]
+
+### Changed
+- Attribute internal representation changed from `Dict` to `OrderedDict` (later reverted to `Vector{Pair}`).
+
+## [0.2.3]
+
+### Fixed
+- Parse method fix.
+
+## [0.2.2]
+
+### Added
+- DTD parsing via `parse_dtd`.
+- `is_simple` and `simple_value` exports.
+- `setindex!` methods for modifying attributes.
+- `unescape` function.
+
+### Fixed
+- DOCTYPE parsing made case-insensitive.
+
+## [0.2.1]
+
+### Fixed
+- Write output fixes.
+
+## [0.2.0]
+
+### Changed
+- Major rewrite: introduced `NodeType` enum, `Node{S}` parametric struct, callable `NodeType` constructors, and `XML.write`.
+- Processing instruction support.
+- Benchmarks added.
+
+## [0.1.3]
+
+### Changed
+- Improved print output for `AbstractXMLNode`.
+
+## [0.1.2]
+
+### Added
+- AbstractTrees 0.4 compatibility ([#5]).
+
+## [0.1.1]
+
+### Added
+- `Node` implementation with `print_tree`.
+- Color output in REPL display.
+- Stopped stripping whitespace from text nodes.
+
+## [0.1.0]
+
+- Initial release.
+
+[Unreleased]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.8...HEAD
+[0.3.8]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.7...v0.3.8
+[0.3.7]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.6...v0.3.7
+[0.3.6]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.5...v0.3.6
+[0.3.5]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.4...v0.3.5
+[0.3.4]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.3...v0.3.4
+[0.3.3]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.2...v0.3.3
+[0.3.2]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.1...v0.3.2
+[0.3.1]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.0...v0.3.1
+[0.3.0]: https://github.com/JuliaComputing/XML.jl/compare/v0.2.3...v0.3.0
+[0.2.3]: https://github.com/JuliaComputing/XML.jl/compare/v0.2.2...v0.2.3
+[0.2.2]: https://github.com/JuliaComputing/XML.jl/compare/v0.2.1...v0.2.2
+[0.2.1]: https://github.com/JuliaComputing/XML.jl/compare/v0.2.0...v0.2.1
+[0.2.0]: https://github.com/JuliaComputing/XML.jl/compare/v0.1.3...v0.2.0
+[0.1.3]: https://github.com/JuliaComputing/XML.jl/compare/v0.1.2...v0.1.3
+[0.1.2]: https://github.com/JuliaComputing/XML.jl/compare/v0.1.1...v0.1.2
+[0.1.1]: https://github.com/JuliaComputing/XML.jl/compare/v0.1.0...v0.1.1
+[0.1.0]: https://github.com/JuliaComputing/XML.jl/releases/tag/v0.1.0
+
+[#5]: https://github.com/JuliaComputing/XML.jl/pull/5
+[#16]: https://github.com/JuliaComputing/XML.jl/pull/16
+[#20]: https://github.com/JuliaComputing/XML.jl/pull/20
+[#26]: https://github.com/JuliaComputing/XML.jl/issues/26
+[#29]: https://github.com/JuliaComputing/XML.jl/pull/29
+[#31]: https://github.com/JuliaComputing/XML.jl/issues/31
+[#32]: https://github.com/JuliaComputing/XML.jl/pull/32
+[#37]: https://github.com/JuliaComputing/XML.jl/pull/37
+[#38]: https://github.com/JuliaComputing/XML.jl/pull/38
+[#43]: https://github.com/JuliaComputing/XML.jl/issues/43
+[#45]: https://github.com/JuliaComputing/XML.jl/pull/45
+[#46]: https://github.com/JuliaComputing/XML.jl/issues/46
+[#47]: https://github.com/JuliaComputing/XML.jl/pull/47
+[#49]: https://github.com/JuliaComputing/XML.jl/pull/49
diff --git a/Project.toml b/Project.toml
index 49b96c0..65925ad 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,12 +1,7 @@
name = "XML"
uuid = "72c71f33-b9b6-44de-8c94-c961784809e2"
+version = "0.4.0"
authors = ["Josh Day and contributors"]
-version = "0.3.8"
-
-[deps]
-Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
-OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
[compat]
-OrderedCollections = "1.4, 1.5"
-julia = "1.6"
+julia = "1.9"
diff --git a/README.md b/README.md
index ddb1156..e50a591 100644
--- a/README.md
+++ b/README.md
@@ -4,39 +4,8 @@
Read and write XML in pure Julia.
-
-
-# Introduction
-
-This package offers fast data structures for reading and writing XML files with a consistent interface:
-
-### `Node`/`LazyNode` Interface:
-
-```
-nodetype(node) → XML.NodeType (an enum type)
-tag(node) → String or Nothing
-attributes(node) → OrderedDict{String, String} or Nothing
-value(node) → String or Nothing
-children(node) → Vector{typeof(node)}
-is_simple(node) → Bool (whether node is simple .e.g. item )
-simple_value(node) → e.g. "item" from item )
-```
-
-
-
-### Extended Interface for `LazyNode`
-
-```
-depth(node) → Int
-next(node) → typeof(node)
-prev(node) → typeof(node)
-parent(node) → typeof(node)
-```
-
-
-
# Quickstart
```julia
@@ -58,79 +27,76 @@ doc[end][2] # Second child of root
# Node Element (6 children)
```
-
-
-# Data Structures that Represent XML Nodes
+
-## Preliminary: `NodeType`
+# `Node` Interface
-- Each item in an XML DOM is classified by its `NodeType`.
-- Every `XML.jl` struct defines a `nodetype(x)` method that returns its `NodeType`.
+Every node in the XML DOM is represented by `Node`, a single type parametrized on its string storage.
-| NodeType | XML Representation | `Node` Constructor |
-|----------|--------------------|------------------|
-| `Document` | An entire document | `Document(children...)`
-| `DTD` | `` | `DTD(...) `
-| `Declaration` | `` | `Declaration(; attrs...)`
-| `ProcessingInstruction` | `` | `ProcessingInstruction(tag; attrs...)`
-| `Comment` | `` | `Comment(text)`
-| `CData` | `` | `CData(text)`
-| `Element` | ` children... ` | `Element(tag, children...; attrs...)`
-| `Text` | the `text` part of `text ` | `Text(text)`
+```
+nodetype(node) -> XML.NodeType (an enum)
+tag(node) -> String or Nothing
+attributes(node) -> Dict{String, String} or Nothing
+value(node) -> String or Nothing
+children(node) -> Vector{Node}
+is_simple(node) -> Bool (e.g. <tag>text</tag>)
+simple_value(node) -> e.g. "text" from <tag>text</tag>
+```
-## `Node`: Probably What You're Looking For
-
-- `read`-ing a `Node` loads the entire XML DOM in memory.
-- See the table above for convenience constructors.
-- `Node`s have some additional methods that aid in construction/mutation:
-
-```julia
-# Add a child:
-push!(parent::Node, child::Node)
+## `NodeType`
-# Replace a child:
-parent[2] = child
+Each item in an XML DOM is classified by its `NodeType`:
-# Add/change an attribute:
-node["key"] = value
+| NodeType | XML Representation | Constructor |
+|----------|--------------------|-------------|
+| `Document` | An entire document | `Document(children...)` |
+| `DTD` | `<!DOCTYPE ...>` | `DTD(...)` |
+| `Declaration` | `<?xml attributes... ?>` | `Declaration(; attrs...)` |
+| `ProcessingInstruction` | `<?tag attributes... ?>` | `ProcessingInstruction(tag; attrs...)` |
+| `Comment` | `<!-- text -->` | `Comment(text)` |
+| `CData` | `<![CDATA[text]]>` | `CData(text)` |
+| `Element` | `<tag attributes... > children... </tag>` | `Element(tag, children...; attrs...)` |
+| `Text` | the `text` part of `<tag>text</tag>` | `Text(text)` |
-node["key"]
-```
+
-- `Node` is an immutable type. However, you can easily create a copy with one or more field values changed by using the `Node(::Node, children...; attrs...)` constructor where `children` are appended to the source node's children and `attrs` are appended to the node's attributes.
+## Mutation
```julia
-node = XML.Element("tag", "child")
-# Node Element (1 child)
+push!(parent, child) # Add a child
+parent[2] = child # Replace a child
+node["key"] = "value" # Add/change an attribute
+node["key"] # Get an attribute
+```
-simple_value(node)
-# "child"
+
-node2 = Node(node, "added"; id="my-id")
-# Node Element (2 children)
+## Tree Navigation
-node2.children
-# 2-element Vector{Node}:
-# Node Text "child"
-# Node Text "added"
+```julia
+depth(child, root) # Depth of child relative to root
+parent(child, root) # Parent of child within root's tree
+siblings(child, root) # Siblings of child within root's tree
```
-### Writing `Element` `Node`s with `XML.h`
+
+
+## Writing Elements with `XML.h`
Similar to [Cobweb.jl](https://github.com/JuliaComputing/Cobweb.jl#-creating-nodes-with-cobwebh), `XML.h` enables you to write elements with a simpler syntax:
```julia
using XML: h
-julia> node = h.parent(
- h.child("first child content", id="id1"),
- h.child("second child content", id="id2")
- )
+node = h.parent(
+ h.child("first child content", id="id1"),
+ h.child("second child content", id="id2")
+)
# Node Element (2 children)
-julia> print(XML.write(node))
+print(XML.write(node))
#
# first child content
# second child content
@@ -139,111 +105,193 @@ julia> print(XML.write(node))
-## `XML.LazyNode`: For Fast Iteration through an XML File
+# Reading
-A lazy data structure that just keeps track of the position in the raw data (`Vector{UInt8}`) to read from.
+```julia
+# From a file:
+read(filename, Node)
-- You can iterate over a `LazyNode` to "read" through an XML file:
+# From a string:
+parse(str, Node)
+```
+
+
+
+# Writing
```julia
-doc = read(filename, LazyNode)
-
-foreach(println, doc)
-# LazyNode Declaration
-# LazyNode Element
-# LazyNode Element
-# LazyNode Element
-# LazyNode Text "Gambardella, Matthew"
-# LazyNode Element
-# ⋮
+XML.write(filename::String, node) # write to file
+XML.write(io::IO, node) # write to stream
+XML.write(node) # return String
```
-
+`XML.write` respects `xml:space="preserve"` on elements, suppressing automatic indentation.
-# Reading
+
-```julia
-# Reading from file:
-read(filename, Node)
-read(filename, LazyNode)
+# XPath
-# Parsing from string:
-parse(Node, str)
-parse(LazyNode, str)
+Query nodes using a subset of XPath 1.0 via `xpath(node, path)`:
+```julia
+doc = parse("""
+<root>
+  <a id="1"><b>hello</b></a>
+  <a id="2"><b>world</b></a>
+</root>
+""", Node)
+
+root = doc[end]
+
+xpath(root, "//b") # All <b> descendants
+xpath(root, "a[@id='2']/b") # <b> inside <a id="2">
+xpath(root, "a[1]") # First <a> child
+xpath(root, "//b/text()") # Text nodes inside all <b>s
```
-
+### Supported syntax
+
+| Expression | Description |
+|------------|-------------|
+| `/` | Root / path separator |
+| `tag` | Child element by name |
+| `*` | Any child element |
+| `//` | Descendant-or-self (recursive) |
+| `.` | Current node |
+| `..` | Parent node |
+| `[n]` | Positional predicate (1-based) |
+| `[@attr]` | Has-attribute predicate |
+| `[@attr='v']` | Attribute-value predicate |
+| `text()` | Text node children |
+| `node()` | All node children |
+| `@attr` | Attribute value (returns strings) |
-# Writing
+
+
+# Streaming Tokenizer
+
+For large files or when you need fine-grained control, `XML.XMLTokenizer` provides a streaming tokenizer that yields tokens without building a DOM:
```julia
-XML.write(filename::String, node) # write to file
+using XML.XMLTokenizer
+
+for token in tokenize("text ")
+ println(token.kind, " => ", repr(String(token.raw)))
+end
+# TOKEN_OPEN_TAG => " ">"
+# TOKEN_OPEN_TAG => " "attr"
+# TOKEN_ATTR_VALUE => "\"val\""
+# TOKEN_TAG_CLOSE => ">"
+# TOKEN_TEXT => "text"
+# TOKEN_CLOSE_TAG => " ">"
+# TOKEN_CLOSE_TAG => " ">"
+```
-XML.write(io::IO, node) # write to stream
+
+
+# `LazyNode`
+
+For read-only access without building a full DOM tree, use `LazyNode`. It stores only a reference to the source string and re-tokenizes on demand, using significantly less memory:
-XML.write(node) # String
+```julia
+doc = parse(xml_string, LazyNode)
+doc = read("file.xml", LazyNode)
```
+`LazyNode` supports the same read-only interface as `Node`: `nodetype`, `tag`, `attributes`, `value`, `children`, `is_simple`, `simple_value`, plus integer and string indexing.
-
+### Memory-mapped files
-# Performance
+For very large files, combine `LazyNode` with memory mapping to avoid reading the entire file into heap memory:
-- XML.jl performs comparatively to [EzXML.jl](https://github.com/JuliaIO/EzXML.jl), which wraps the C library [libxml2](https://gitlab.gnome.org/GNOME/libxml2/-/wikis/home).
-- See the `benchmarks/suite.jl` for the code to produce these results.
-- The following output was generated in a Julia session with the following `versioninfo`:
+```julia
+using XML, Mmap, StringViews
-```
-julia> versioninfo()
-Julia Version 1.9.4
-Commit 8e5136fa297 (2023-11-14 08:46 UTC)
-Build Info:
- Official https://julialang.org/ release
-Platform Info:
- OS: macOS (arm64-apple-darwin22.4.0)
- CPU: 10 × Apple M1 Pro
- WORD_SIZE: 64
- LIBM: libopenlibm
- LLVM: libLLVM-14.0.6 (ORCJIT, apple-m1)
- Threads: 8 on 8 virtual cores
+doc = open("very_large.xml") do io
+ sv = StringView(Mmap.mmap(io))
+ parse(sv, LazyNode)
+end
```
+
-### Reading an XML File
+# Benchmarks
-```
- XML.LazyNode 0.009583
- XML.Node ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 1071.32
- EzXML.readxml ■■■■■■■■■ 284.346
- XMLDict.xml_dict ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 1231.47
-```
+Benchmark source: [benchmarks.jl](benchmarks/benchmarks.jl). Test data: `books.xml` (small, ~4 KB) and a generated XMark auction XML (medium, ~14 MB).
-### Writing an XML File
-```
- Write: XML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 289.638
- Write: EzXML ■■■■■■■■■■■■■ 93.4631
-```
-### Lazily Iterating over Each Node
-```
- LazyNode ■■■■■■■■■ 51.752
- EzXML.StreamReader ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 226.271
```
+ Parse (small) — median time (ms)
-### Collecting All Names/Tags in an XML File
-```
- XML.LazyNode ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 210.482
- EzXML.StreamReader ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 276.238
- EzXML.readxml ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 263.269
-```
+ XML.jl ■■■■■■■ 0.041
+ XML.jl (SS) ■■■■■■ 0.034
+ EzXML ■■■■■ 0.030
+ LightXML ■■■■■■ 0.033
+ XMLDict ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 0.232
+
+
+ Parse (medium) — median time (ms)
+
+ XML.jl ■■■■■■■■■■■■ 194.2
+ XML.jl (SS) ■■■■■■■■■■ 172.8
+ EzXML ■■■■■■ 105.8
+ LightXML ■■■■■■ 105.0
+ XMLDict ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 687.7
+
+
+ Write (small) — median time (ms)
+
+ XML.jl ■■■■■■■■ 0.021
+ EzXML ■■■■ 0.012
+ LightXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 0.110
+
+
+ Write (medium) — median time (ms)
+
+ XML.jl ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 93.2
+ EzXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 84.6
+ LightXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■ 60.4
-
-
-# Possible Gotchas
+ Read file — median time (ms)
-- XML.jl doesn't automatically escape special characters (`<`, `>`, `&`, `"`, and `'` ) for you. However, we provide utility functions for doing the conversions back and forth:
- - `XML.escape(::String)` and `XML.unescape(::String)`
- - `XML.escape!(::Node)` and `XML.unescape!(::Node)`.
+ XML.jl ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 214.1
+ EzXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■ 143.1
+ LightXML ■■■■■■■■■■■■■■■■■■■■■■■ 121.9
+
+
+ Collect tags (small) — median time (ms)
+
+ XML.jl ■■■■■■ 0.000698
+ EzXML ■■■■■■■■■■■■■■■■■■■■■■■ 0.00255
+ LightXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 0.00430
+
+
+ Collect tags (medium) — median time (ms)
+
+ XML.jl ■■■■■■■■■■■■■■■■■■■ 12.6
+ EzXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 20.5
+ LightXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 27.6
+```
+
+```julia
+versioninfo()
+# Julia Version 1.12.5
+# Commit 5fe89b8ddc1 (2026-02-09 16:05 UTC)
+# Build Info:
+# Official https://julialang.org release
+# Platform Info:
+# OS: macOS (arm64-apple-darwin24.0.0)
+# CPU: 10 × Apple M1 Pro
+# WORD_SIZE: 64
+# LLVM: libLLVM-18.1.7 (ORCJIT, apple-m1)
+# GC: Built with stock GC
+# Threads: 8 default, 1 interactive, 8 GC (on 8 virtual cores)
+# Environment:
+# JULIA_NUM_THREADS = auto
+```
diff --git a/benchmarks/Project.toml b/benchmarks/Project.toml
index ed90996..043988c 100644
--- a/benchmarks/Project.toml
+++ b/benchmarks/Project.toml
@@ -2,7 +2,8 @@
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
-OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
+InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
+LightXML = "9c8b4983-aa76-5018-a973-4c85ecc9e179"
UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
XML = "72c71f33-b9b6-44de-8c94-c961784809e2"
XMLDict = "228000da-037f-5747-90a9-8195ccbf91a5"
diff --git a/benchmarks/XMarkGenerator.jl b/benchmarks/XMarkGenerator.jl
new file mode 100644
index 0000000..7f780a0
--- /dev/null
+++ b/benchmarks/XMarkGenerator.jl
@@ -0,0 +1,377 @@
+"""
+ XMarkGenerator
+
+XMark-inspired XML benchmark data generator. Produces well-formed XML documents modeling an
+internet auction site, following the XMark benchmark DTD structure.
+
+    include("XMarkGenerator.jl")
+ using .XMarkGenerator
+
+ xml = generate_xmark(1.0) # return String (~14 MB)
+ generate_xmark("out.xml", 5.0) # write to file (~68 MB)
+ generate_xmark(stdout, 0.1; seed=123) # write to IO (~1.4 MB)
+"""
+module XMarkGenerator
+
+using Random
+
+export generate_xmark
+
+#-----------------------------------------------------------------# Word lists
+const WORDS = [
+ "about", "above", "across", "after", "again", "against", "along", "already", "also",
+ "always", "among", "another", "answer", "around", "asked", "away", "back", "because",
+ "become", "been", "before", "began", "behind", "being", "below", "between", "body",
+ "book", "both", "brought", "build", "built", "business", "came", "cannot", "carry",
+ "cause", "certain", "change", "children", "city", "close", "come", "complete", "could",
+ "country", "course", "cover", "current", "dark", "days", "deep", "development",
+ "different", "direction", "does", "done", "door", "down", "draw", "during", "each",
+ "early", "earth", "east", "education", "effort", "eight", "either", "else", "end",
+ "enough", "even", "every", "example", "experience", "face", "fact", "family", "feel",
+ "field", "find", "first", "five", "follow", "food", "force", "form", "found", "four",
+ "from", "full", "gave", "general", "give", "going", "gone", "good", "government",
+ "great", "green", "ground", "group", "grow", "half", "hand", "happen", "hard", "have",
+ "head", "help", "here", "high", "himself", "hold", "home", "hope", "house", "however",
+ "hundred", "idea", "important", "inch", "include", "increase", "island", "just", "keep",
+ "kind", "knew", "know", "land", "large", "last", "later", "learn", "left", "less",
+ "letter", "life", "light", "like", "line", "list", "little", "live", "long", "look",
+ "lost", "made", "main", "make", "many", "mark", "matter", "mean", "might", "mind",
+ "miss", "money", "morning", "most", "mother", "move", "much", "music", "must", "name",
+ "near", "need", "never", "next", "night", "nothing", "notice", "number", "often",
+ "once", "only", "open", "order", "other", "over", "page", "paper", "part", "past",
+ "pattern", "people", "perhaps", "period", "person", "picture", "place", "plan", "plant",
+ "play", "point", "position", "possible", "power", "present", "problem", "produce",
+ "product", "program", "public", "pull", "purpose", "question", "quite", "reach", "read",
+ "real", "receive", "record", "remember", "rest", "result", "right", "river", "room",
+ "round", "rule", "same", "school", "second", "seem", "sentence", "service", "seven",
+ "several", "shall", "short", "should", "show", "side", "since", "sing", "size", "small",
+ "social", "some", "song", "soon", "south", "space", "stand", "start", "state", "still",
+ "stood", "story", "strong", "study", "such", "sure", "system", "table", "take", "tell",
+ "test", "their", "them", "then", "there", "these", "thing", "think", "those", "thought",
+ "three", "through", "time", "together", "took", "toward", "travel", "tree", "true",
+ "turn", "under", "unit", "until", "upon", "usually", "value", "very", "voice", "walk",
+ "want", "watch", "water", "well", "went", "were", "west", "what", "where", "which",
+ "while", "white", "whole", "will", "with", "without", "woman", "word", "work", "world",
+ "would", "write", "year", "young",
+]
+const FIRST_NAMES = ["James", "John", "Robert", "Michael", "William", "David", "Richard",
+ "Joseph", "Thomas", "Charles", "Mary", "Patricia", "Jennifer", "Linda", "Barbara",
+ "Elizabeth", "Susan", "Jessica", "Sarah", "Karen"]
+const LAST_NAMES = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller",
+ "Davis", "Rodriguez", "Martinez", "Wilson", "Anderson", "Taylor", "Thomas", "Hernandez",
+ "Moore", "Martin", "Jackson", "Thompson", "White"]
+const COUNTRIES = ["United States", "Germany", "France", "Japan", "Australia", "Brazil",
+ "Canada", "India", "China", "Mexico", "Argentina", "Spain", "Italy", "United Kingdom",
+ "Netherlands", "Sweden", "Norway", "Finland", "Denmark", "Belgium"]
+const CITIES = ["New York", "London", "Paris", "Tokyo", "Sydney", "Berlin", "Rome",
+ "Madrid", "Amsterdam", "Toronto", "Moscow", "Beijing", "Seoul", "Mumbai", "Cairo",
+ "Dublin", "Prague", "Vienna", "Warsaw", "Budapest"]
+const STREETS = ["Main", "Oak", "Elm", "Maple", "Pine", "Cedar", "Birch", "Walnut",
+ "Cherry", "Ash", "Spruce", "Willow", "Poplar", "Laurel", "Juniper"]
+const EDUCATIONS = ["High School", "College", "Graduate", "Associate", "Master", "Doctorate"]
+const GENDERS = ["male", "female"]
+const PAYMENTS = ["Creditcard", "Money order", "Personal check", "Cash"]
+const SHIPPING = ["Will ship only within country", "Will ship internationally",
+ "Buyer pays fixed shipping costs", "Free shipping", "See description for shipping"]
+const REGIONS = ["africa", "asia", "australia", "europe", "namerica", "samerica"]
+
+#-----------------------------------------------------------------# Random data helpers
+rand_word(rng) = rand(rng, WORDS)  # one randomly chosen entry of WORDS
+rand_date(rng) = string(rand(rng, 1999:2025), "/", lpad(rand(rng, 1:12), 2, '0'), "/", lpad(rand(rng, 1:28), 2, '0'))  # "YYYY/MM/DD"; day is drawn from 1:28 so it is valid for every month
+rand_time(rng) = string(lpad(rand(rng, 0:23), 2, '0'), ":", lpad(rand(rng, 0:59), 2, '0'), ":", lpad(rand(rng, 0:59), 2, '0'))  # "HH:MM:SS", each field zero-padded to two digits
+rand_price(rng) = string(rand(rng, 1:9999), ".", lpad(rand(rng, 0:99), 2, '0'))  # integer part 1..9999, two-digit fraction
+rand_phone(rng) = string("+", rand(rng, 1:99), " (", rand(rng, 100:999), ") ", rand(rng, 1000000:9999999))  # "+CC (AAA) NNNNNNN"
+rand_zip(rng) = string(lpad(rand(rng, 0:99999), 5, '0'))  # five digits, zero-padded on the left
+rand_cc(rng) = join(rand(rng, 1000:9999, 4), " ")  # four space-separated numbers, each in 1000..9999
+rand_email(rng) = string(lowercase(rand(rng, FIRST_NAMES)), rand(rng, 1:999), "@", lowercase(rand(rng, LAST_NAMES)), ".com")  # <first><n>@<last>.com, names lower-cased
+
+#-----------------------------------------------------------------# XML writing helpers
+function xml_escape_char(io::IO, c::Char)  # print `c` to `io`, replacing XML-special characters with entity references
+    if c == '&'; print(io, "&amp;")        # must be an entity: a bare '&' would start a reference
+    elseif c == '<'; print(io, "&lt;")     # a bare '<' would start a tag
+    elseif c == '>'; print(io, "&gt;")
+    elseif c == '"'; print(io, "&quot;")   # keeps the result usable inside double-quoted attribute values
+    else; print(io, c)                     # every other character is written unchanged
+    end
+end
+
+# Write `s` to `io`, passing each character through `xml_escape_char` in order.
+function write_escaped(io::IO, s::AbstractString)
+    foreach(ch -> xml_escape_char(io, ch), s)
+    return nothing
+end
+
+function write_text_content(rng, io; min_words=10, max_words=50)  # print a random run of words to `io` on the current line
+    n = rand(rng, min_words:max_words)  # word count, inclusive of both bounds
+    for i in 1:n
+        i > 1 && print(io, ' ')  # single-space separator between words, none before the first
+        w = rand_word(rng)
+        r = rand(rng)  # one draw per word selects which of the four output forms is used
+        if r < 0.03
+            print(io, "", w, " ")  # NOTE(review): the three special branches print identical literals here — presumably each wrapped `w` in distinct markup; confirm nothing was stripped
+        elseif r < 0.06
+            print(io, "", w, " ")
+        elseif r < 0.08
+            print(io, "", w, " ")
+        else
+            print(io, w)  # remaining draws (r >= 0.08): the bare word
+        end
+    end
+end
+
+function write_description(rng, io, indent)  # print a description block; every emitted line is prefixed with `indent`
+    println(io, indent, "")  # NOTE(review): literals in this function carry only whitespace — presumably open/close tags were here; confirm
+    if rand(rng) < 0.7  # one draw picks the single-paragraph form ...
+        print(io, indent, "  ")
+        write_text_content(rng, io; min_words=15, max_words=80)
+        println(io, " ")
+    else  # ... otherwise a list of 2 to 6 shorter paragraphs
+        println(io, indent, "  ")
+        for _ in 1:rand(rng, 2:6)
+            print(io, indent, "    ")
+            write_text_content(rng, io; min_words=8, max_words=40)
+            println(io, " ")
+        end
+        println(io, indent, "  ")
+    end
+    println(io, indent, " ")
+end
+
+function write_annotation(rng, io, indent, n_people)  # print an annotation block; NOTE(review): `n_people` is not read in the visible body — confirm
+    println(io, indent, "")
+    println(io, indent, "  ")
+    write_description(rng, io, string(indent, "  "))  # nested description, indented two further spaces
+    println(io, indent, "  ", rand(rng, 1:10), " ")  # a random integer in 1..10
+    println(io, indent, " ")
+end
+
+#-----------------------------------------------------------------# Section writers
+function write_item(rng, io, id, n_categories)  # print one item record to `io`; draws from `rng` in a fixed order
+    featured = rand(rng) < 0.1 ? " featured=\"yes\"" : ""  # NOTE(review): `featured`, `id` and `n_categories` are not interpolated below — presumably the markup that used them was stripped; confirm
+    println(io, "    - ")
+    println(io, "      ", rand(rng, CITIES), " ")  # re-joined: this literal was split over two physical lines, which broke the hunk and put a newline in the output
+    println(io, "      ", rand(rng, 1:50), " ")
+    println(io, "      ", rand_word(rng), " ", rand_word(rng), " ", rand_word(rng), " ")  # three random words
+    println(io, "      ", rand(rng, PAYMENTS), " ")
+    write_description(rng, io, "      ")
+    println(io, "      ", rand(rng, SHIPPING), " ")
+    for _ in 1:rand(rng, 1:3)  # 1 to 3 repeated lines
+        println(io, "      ")
+    end
+    println(io, "      ")
+    for _ in 1:rand(rng, 0:5)  # 0 to 5 message blocks: two e-mail addresses, a date, then free text
+        println(io, "        ")
+        println(io, "          ", rand_email(rng), " ")
+        println(io, "          ", rand_email(rng), " ")
+        println(io, "          ", rand_date(rng), " ")
+        print(io, "          ")
+        write_text_content(rng, io; min_words=10, max_words=60)
+        println(io, " ")
+        println(io, "        ")
+    end
+    println(io, "      ")
+    println(io, "    ")
+end
+
+function write_categories(rng, io, n)  # print `n` category records, each with a two-word line and a description
+    println(io, "  ")
+    for i in 1:n  # NOTE(review): `i` is not used in the visible body — presumably it formed the category id; confirm
+        println(io, "    ")
+        println(io, "      ", rand_word(rng), " ", rand_word(rng), " ")
+        write_description(rng, io, "      ")
+        println(io, "    ")
+    end
+    println(io, "  ")
+end
+
+function write_catgraph(rng, io, n_edges, n_categories)  # print `n_edges` edge lines between randomly chosen categories
+    println(io, "  ")
+    for _ in 1:n_edges
+        from = string("category",rand(rng, 1:n_categories))  # endpoint ids of the form "category<k>", 1 <= k <= n_categories
+        to = string("category",rand(rng, 1:n_categories))
+        println(io, "    ")  # NOTE(review): `from`/`to` are computed but not printed here — presumably they were attributes of this line; confirm
+    end
+    println(io, "  ")
+end
+
+function write_people(rng, io, n, n_categories, n_open)  # print `n` person records; each optional part is gated by its own rand(rng) draw
+    println(io, "  ")
+    for i in 1:n
+        println(io, "    ")
+        println(io, "      ", rand(rng, FIRST_NAMES), " ", rand(rng, LAST_NAMES), " ")
+        println(io, "      ", rand_email(rng), " ")
+        if rand(rng) < 0.8  # phone number
+            println(io, "      ", rand_phone(rng), " ")
+        end
+        if rand(rng) < 0.7  # postal address block
+            println(io, "      ")
+            println(io, "        ", rand(rng, 1:9999), " ", rand(rng, STREETS), " St ")
+            println(io, "        ", rand(rng, CITIES), " ")
+            println(io, "        ", rand(rng, COUNTRIES), " ")
+            if rand(rng) < 0.5
+                println(io, "        ", rand_word(rng), " ")
+            end
+            println(io, "        ", rand_zip(rng), " ")
+            println(io, "      ")
+        end
+        if rand(rng) < 0.5  # personal web page URL built from a last and a first name
+            println(io, "      http://www.", lowercase(rand(rng, LAST_NAMES)), ".com/~",
+                    lowercase(rand(rng, FIRST_NAMES)), " ")
+        end
+        if rand(rng) < 0.6  # credit-card number
+            println(io, "      ", rand_cc(rng), " ")
+        end
+        if rand(rng) < 0.7  # profile block
+            income = rand(rng) < 0.8 ? string(" income=\"", rand(rng, 10000.0:0.01:250000.0), "\"") : ""  # NOTE(review): `income` is built but not interpolated in the next println — confirm
+            println(io, "      ")
+            for _ in 1:rand(rng, 0:4)  # NOTE(review): 0 to 4 identical lines; `n_categories` is unused in the visible body — presumably these lines referenced a category; confirm
+                println(io, "        ")
+            end
+            if rand(rng) < 0.8
+                println(io, "        ", rand(rng, EDUCATIONS), " ")
+            end
+            if rand(rng) < 0.7
+                println(io, "        ", rand(rng, GENDERS), " ")
+            end
+            println(io, "        ", rand_word(rng), " ")
+            if rand(rng) < 0.8
+                println(io, "        ", rand(rng, 18:85), " ")  # integer in 18..85
+            end
+            println(io, "      ")
+        end
+        if n_open > 0 && rand(rng) < 0.3  # short-circuits: no draw is consumed when n_open == 0
+            println(io, "      ")
+            for _ in 1:rand(rng, 1:5)
+                println(io, "        ")
+            end
+            println(io, "      ")
+        end
+        println(io, "    ")
+    end
+    println(io, "  ")
+end
+
+function write_open_auctions(rng, io, n, n_items, n_people)  # print `n` open-auction records; NOTE(review): `n_items` and `i` are unused in the visible body — confirm
+    println(io, "  ")
+    for i in 1:n
+        println(io, "    ")
+        println(io, "      ", rand_price(rng), " ")
+        if rand(rng) < 0.5  # optional second price line
+            println(io, "      ", rand_price(rng), " ")
+        end
+        for _ in 1:rand(rng, 0:12)  # 0 to 12 blocks, each with a date, a time and a price
+            println(io, "      ")
+            println(io, "        ", rand_date(rng), " ")
+            println(io, "        ", rand_time(rng), " ")
+            println(io, "        ")
+            println(io, "        ", rand_price(rng), " ")
+            println(io, "      ")
+        end
+        println(io, "      ", rand_price(rng), " ")
+        if rand(rng) < 0.3
+            println(io, "      ", rand(rng, ["Yes", "No"]), " ")
+        end
+        println(io, "      ")
+        println(io, "      ")
+        write_annotation(rng, io, "      ", n_people)
+        println(io, "      ", rand(rng, 1:10), " ")
+        println(io, "      ", rand(rng, ["Regular", "Featured"]), " ")
+        println(io, "      ")
+        println(io, "        ", rand_date(rng), " ")  # two independently drawn dates; nothing orders the first before the second
+        println(io, "        ", rand_date(rng), " ")
+        println(io, "      ")
+        println(io, "    ")
+    end
+    println(io, "  ")
+end
+
+function write_closed_auctions(rng, io, n, n_open, n_items, n_people)  # print `n` closed-auction records
+    println(io, "  ")
+    for i in 1:n
+        println(io, "    ")
+        println(io, "      ")
+        println(io, "      ")
+        # Use item IDs that don't overlap with open auctions
+        item_id = n_open + i
+        item_id = item_id <= n_items ? item_id : rand(rng, 1:n_items)  # past the last item: fall back to a random item (this fallback may overlap an open auction)
+        println(io, "      ")  # NOTE(review): `item_id` is not interpolated here — presumably this line referenced it; confirm
+        println(io, "      ", rand_price(rng), " ")
+        println(io, "      ", rand_date(rng), " ")
+        println(io, "      ", rand(rng, 1:10), " ")
+        println(io, "      ", rand(rng, ["Regular", "Featured"]), " ")
+        if rand(rng) < 0.7  # annotation is optional
+            write_annotation(rng, io, "      ", n_people)
+        end
+        println(io, "    ")
+    end
+    println(io, "  ")
+end
+
+#-----------------------------------------------------------------# Main entry points
+"""
+ generate_xmark([io_or_filename], factor; seed=42)
+
+Generate an XMark-style auction XML document. `factor` scales all entity counts linearly.
+
+Approximate output sizes (may vary slightly):
+- `factor=0.1` → ~1.4 MB
+- `factor=1.0` → ~14 MB
+- `factor=2.0` → ~27 MB
+- `factor=5.0` → ~68 MB
+"""
+function generate_xmark(io::IO, factor::Real; seed::Int=42)  # core method: write the whole document to `io`; returns `nothing`
+    factor > 0 || throw(ArgumentError("factor must be positive, got $factor"))
+    rng = Xoshiro(seed)  # a single seeded generator feeds every writer, so output is reproducible per (factor, seed)
+
+    n_per_region = max(1, round(Int, 500 * factor))  # every count scales linearly with `factor` and is at least 1
+    n_people = max(1, round(Int, 5000 * factor))
+    n_categories = max(1, round(Int, 200 * factor))
+    n_open = max(1, round(Int, 2000 * factor))
+    n_closed = max(1, round(Int, 1500 * factor))
+    n_edges = max(1, round(Int, 1000 * factor))
+    n_items = n_per_region * length(REGIONS)  # one batch of items per region (was a hard-coded 6)
+
+    # Clamp auctions to available items
+    n_open = min(n_open, n_items)
+    n_closed = min(n_closed, max(1, n_items - n_open))
+
+    println(io, "")  # NOTE(review): these literals are empty or whitespace-only — presumably the XML prolog and root tag; confirm
+    println(io, "")
+
+    # Regions with items
+    println(io, "  ")
+    item_id = 0  # running id across all regions
+    for region in REGIONS
+        println(io, "    <", region, ">")
+        for _ in 1:n_per_region
+            item_id += 1
+            write_item(rng, io, item_id, n_categories)
+        end
+        println(io, "    </", region, ">")  # closing tag: the "</" was missing, so the region element was never closed
+    end
+    println(io, "  ")
+
+    write_categories(rng, io, n_categories)
+    write_catgraph(rng, io, n_edges, n_categories)
+    write_people(rng, io, n_people, n_categories, n_open)
+    write_open_auctions(rng, io, n_open, n_items, n_people)
+    write_closed_auctions(rng, io, n_closed, n_open, n_items, n_people)
+
+    println(io, " ")
+    nothing
+end
+
+# File method: stream the generated document into `filename` (opened with mode "w") and return the path.
+function generate_xmark(filename::AbstractString, factor::Real; seed::Int=42)
+    emit(io) = generate_xmark(io, factor; seed=seed)
+    open(emit, filename, "w")
+    return filename
+end
+
+# String method: build the whole document in an in-memory buffer and return it as a `String`.
+function generate_xmark(factor::Real; seed::Int=42)
+    render(buf) = generate_xmark(buf, factor; seed=seed)
+    return sprint(render)
+end
+
+end # module
diff --git a/benchmarks/benchmarks.jl b/benchmarks/benchmarks.jl
new file mode 100644
index 0000000..16096e8
--- /dev/null
+++ b/benchmarks/benchmarks.jl
@@ -0,0 +1,161 @@
+using XML
+using XML: Element, nodetype, tag, children
+using EzXML: EzXML
+using XMLDict: XMLDict
+using LightXML: LightXML
+using BenchmarkTools
+using DataFrames
+using InteractiveUtils
+
+include("XMarkGenerator.jl")
+using .XMarkGenerator
+
+BenchmarkTools.DEFAULT_PARAMETERS.seconds = 10
+BenchmarkTools.DEFAULT_PARAMETERS.samples = 20000
+
+#-----------------------------------------------------------------------------# Test data
+# Small file (~120 lines)
+small_file = joinpath(@__DIR__, "..", "test", "data", "books.xml")
+small_xml = read(small_file, String)
+
+# Medium file (generated XMark auction XML, ~14 MB)
+medium_file = joinpath(@__DIR__, "data", "xmark.xml")
+if !isfile(medium_file)
+ mkpath(dirname(medium_file))
+ @info "Generating XMark benchmark XML..."
+ generate_xmark(medium_file, 1.0)
+end
+medium_xml = read(medium_file, String)
+
+df = DataFrame(kind=String[], name=String[], bench=BenchmarkTools.Trial[])
+# Run `@benchmark` on `expr...` (the expression plus any `setup=`/`teardown=` arguments), log "kind - name", and append a (kind, name, bench) row to the global `df`.
+macro add_benchmark(kind, name, expr...)
+ esc(:(let
+ @info string($kind, " - ", $name)
+ bench = @benchmark $(expr...)
+ push!(df, (; kind=$kind, name=$name, bench))
+ end))
+end
+
+const SSNode = Node{SubString{String}}
+
+#-----------------------------------------------------------------------------# Parse (small)
+@add_benchmark "Parse (small)" "XML.jl" parse($small_xml, Node)
+@add_benchmark "Parse (small)" "XML.jl (SS)" parse($small_xml, SSNode)
+@add_benchmark "Parse (small)" "EzXML" EzXML.parsexml($small_xml)
+@add_benchmark "Parse (small)" "LightXML" LightXML.parse_string($small_xml)
+@add_benchmark "Parse (small)" "XMLDict" XMLDict.xml_dict($small_xml)
+
+#-----------------------------------------------------------------------------# Parse (medium)
+@add_benchmark "Parse (medium)" "XML.jl" parse($medium_xml, Node)
+@add_benchmark "Parse (medium)" "XML.jl (SS)" parse($medium_xml, SSNode)
+@add_benchmark "Parse (medium)" "EzXML" EzXML.parsexml($medium_xml)
+@add_benchmark "Parse (medium)" "LightXML" LightXML.parse_string($medium_xml)
+@add_benchmark "Parse (medium)" "XMLDict" XMLDict.xml_dict($medium_xml)
+
+#-----------------------------------------------------------------------------# Write (small)
+@add_benchmark "Write (small)" "XML.jl" XML.write(o) setup=(o = parse(small_xml, Node))
+@add_benchmark "Write (small)" "EzXML" sprint(print, o) setup=(o = EzXML.parsexml(small_xml))
+@add_benchmark "Write (small)" "LightXML" LightXML.save_file(o, f) setup=(o = LightXML.parse_string(small_xml); f = tempname()) teardown=(LightXML.free(o); rm(f, force=true))
+
+#-----------------------------------------------------------------------------# Write (medium)
+@add_benchmark "Write (medium)" "XML.jl" XML.write(o) setup=(o = parse(medium_xml, Node))
+@add_benchmark "Write (medium)" "EzXML" sprint(print, o) setup=(o = EzXML.parsexml(medium_xml))
+@add_benchmark "Write (medium)" "LightXML" LightXML.save_file(o, f) setup=(o = LightXML.parse_string(medium_xml); f = tempname()) teardown=(LightXML.free(o); rm(f, force=true))
+
+#-----------------------------------------------------------------------------# Read from file
+@add_benchmark "Read file" "XML.jl" read($medium_file, Node)
+@add_benchmark "Read file" "EzXML" EzXML.readxml($medium_file)
+@add_benchmark "Read file" "LightXML" LightXML.parse_file($medium_file)
+
+#-----------------------------------------------------------------------------# Collect element tags
+function xml_collect_tags(node)
+    tags = String[]  # filled in pre-order by the recursive helper
+    _xml_collect_tags!(tags, node)
+    return tags
+end
+function _xml_collect_tags!(out, node)
+    for c in children(node)
+        nodetype(c) === Element || continue  # only element children are recorded and descended into
+        push!(out, tag(c))
+        _xml_collect_tags!(out, c)
+    end
+    return nothing
+end
+
+function ezxml_collect_tags(node::EzXML.Node)
+    names = String[]  # element names, appended in pre-order by the helper
+    _ezxml_collect_tags!(names, node)
+    return names
+end
+function _ezxml_collect_tags!(out, node::EzXML.Node)
+    foreach(EzXML.eachelement(node)) do el
+        push!(out, el.name)  # record the element before descending into it
+        _ezxml_collect_tags!(out, el)
+    end
+end
+
+function lightxml_collect_tags(root::LightXML.XMLElement)
+    names = String[]  # element names, appended in pre-order by the helper
+    _lightxml_collect_tags!(names, root)
+    return names
+end
+function _lightxml_collect_tags!(out, el::LightXML.XMLElement)
+    foreach(LightXML.child_elements(el)) do kid
+        push!(out, LightXML.name(kid))  # record the element before descending into it
+        _lightxml_collect_tags!(out, kid)
+    end
+end
+
+@add_benchmark "Collect tags (small)" "XML.jl" xml_collect_tags(o) setup=(o = parse(small_xml, Node))
+@add_benchmark "Collect tags (small)" "EzXML" ezxml_collect_tags(o.root) setup=(o = EzXML.parsexml(small_xml))
+@add_benchmark "Collect tags (small)" "LightXML" lightxml_collect_tags(LightXML.root(o)) setup=(o = LightXML.parse_string(small_xml)) teardown=(LightXML.free(o))
+
+@add_benchmark "Collect tags (medium)" "XML.jl" xml_collect_tags(o) setup=(o = parse(medium_xml, Node))
+@add_benchmark "Collect tags (medium)" "EzXML" ezxml_collect_tags(o.root) setup=(o = EzXML.parsexml(medium_xml))
+@add_benchmark "Collect tags (medium)" "LightXML" lightxml_collect_tags(LightXML.root(o)) setup=(o = LightXML.parse_string(medium_xml)) teardown=(LightXML.free(o))
+
+#-----------------------------------------------------------------------------# Write benchmarks_results.md
+_fmt_ms(t) = "$(round(t; sigdigits=3)) ms"  # three significant digits followed by the unit
+
+function _compare_indicator(xml_ms, other_ms)
+    rel = xml_ms / other_ms  # XML.jl time relative to the other package; > 1 means XML.jl took longer
+    pct = abs(round(100 * (rel - 1), digits=1))
+    rel < 0.95 ? "(XML.jl $(pct)% faster)" : rel > 1.05 ? "(XML.jl $(pct)% slower)" : "(~same)"
+end
+
+outfile = joinpath(@__DIR__, "benchmarks_results.md")
+open(outfile, "w") do io
+    println(io, "# XML.jl Benchmarks\n")
+    println(io, "```")
+    g = groupby(df, :kind)  # the grouping does not depend on `kind`: build it once instead of per iteration
+    for kind in unique(df.kind)
+        haskey(g, (;kind)) || continue
+        sub = g[(;kind)]
+        println(io, kind)
+        # Baseline: the first row named "XML.jl…" that is not the SubString ("(SS)") variant
+        xml_row = findfirst(r -> startswith(r.name, "XML.jl") && !contains(r.name, "(SS)"), eachrow(sub))
+        xml_ms = isnothing(xml_row) ? nothing : median(sub[xml_row, :bench]).time / 1e6
+        for row in eachrow(sub)
+            ms = median(row.bench).time / 1e6
+            indicator = ""
+            if !isnothing(xml_ms) && !startswith(row.name, "XML.jl")
+                indicator = " " * _compare_indicator(xml_ms, ms)
+            end
+            println(io, "\t", rpad(row.name, 16), lpad(_fmt_ms(ms), 12), indicator)
+        end
+        println(io)
+    end
+    println(io, "```")
+    # Append the `versioninfo()` output as a commented-out block
+    println(io, "\n```julia")
+    println(io, "versioninfo()")
+    buf = IOBuffer()
+    InteractiveUtils.versioninfo(buf)
+    for line in eachline(IOBuffer(take!(buf)))
+        println(io, "# ", line)
+    end
+    println(io, "```")
+end
+
+println("Results written to $outfile")
diff --git a/benchmarks/benchmarks_results.md b/benchmarks/benchmarks_results.md
new file mode 100644
index 0000000..d03ae50
--- /dev/null
+++ b/benchmarks/benchmarks_results.md
@@ -0,0 +1,60 @@
+# XML.jl Benchmarks
+
+```
+Parse (small)
+ XML.jl 0.0339 ms
+ XML.jl (SS) 0.0301 ms
+ EzXML 0.0254 ms (XML.jl 33.5% slower)
+ LightXML 0.0282 ms (XML.jl 20.1% slower)
+ XMLDict 0.204 ms (XML.jl 83.4% faster)
+
+Parse (medium)
+ XML.jl 170.0 ms
+ XML.jl (SS) 154.0 ms
+ EzXML 91.0 ms (XML.jl 87.0% slower)
+ LightXML 92.8 ms (XML.jl 83.4% slower)
+ XMLDict 623.0 ms (XML.jl 72.7% faster)
+
+Write (small)
+ XML.jl 0.0179 ms
+ EzXML 0.0107 ms (XML.jl 68.0% slower)
+ LightXML 0.0926 ms (XML.jl 80.6% faster)
+
+Write (medium)
+ XML.jl 81.2 ms
+ EzXML 73.2 ms (XML.jl 11.0% slower)
+ LightXML 55.1 ms (XML.jl 47.5% slower)
+
+Read file
+ XML.jl 180.0 ms
+ EzXML 129.0 ms (XML.jl 39.9% slower)
+ LightXML 104.0 ms (XML.jl 73.4% slower)
+
+Collect tags (small)
+ XML.jl 0.000597 ms
+ EzXML 0.00219 ms (XML.jl 72.7% faster)
+ LightXML 0.00371 ms (XML.jl 83.9% faster)
+
+Collect tags (medium)
+ XML.jl 12.2 ms
+ EzXML 28.2 ms (XML.jl 56.9% faster)
+ LightXML 25.7 ms (XML.jl 52.6% faster)
+
+```
+
+```julia
+versioninfo()
+# Julia Version 1.12.5
+# Commit 5fe89b8ddc1 (2026-02-09 16:05 UTC)
+# Build Info:
+# Official https://julialang.org release
+# Platform Info:
+# OS: macOS (arm64-apple-darwin24.0.0)
+# CPU: 10 × Apple M1 Pro
+# WORD_SIZE: 64
+# LLVM: libLLVM-18.1.7 (ORCJIT, apple-m1)
+# GC: Built with stock GC
+# Threads: 8 default, 1 interactive, 8 GC (on 8 virtual cores)
+# Environment:
+# JULIA_NUM_THREADS = auto
+```
diff --git a/benchmarks/compare.jl b/benchmarks/compare.jl
new file mode 100644
index 0000000..7dca16e
--- /dev/null
+++ b/benchmarks/compare.jl
@@ -0,0 +1,224 @@
+#= Compare current dev XML.jl against the last released version.
+
+Usage:
+ julia benchmarks/compare.jl [tag]
+
+`tag` defaults to the latest git tag (e.g. v0.3.8).
+
+This script:
+1. Runs benchmarks using the current (dev) code
+2. Checks out the release tag into a temp worktree
+3. Runs the same benchmarks against that version
+4. Prints a side-by-side comparison
+=#
+
+using BenchmarkTools, Serialization, InteractiveUtils
+
+BenchmarkTools.DEFAULT_PARAMETERS.seconds = 5
+BenchmarkTools.DEFAULT_PARAMETERS.samples = 10000
+
+const ROOT = dirname(@__DIR__)
+
+const RELEASE_TAG = if length(ARGS) >= 1
+    ARGS[1]
+else
+    # Default: the last `v*` tag in version order. Fail with an actionable message when the repository has none.
+    tags = filter!(t -> startswith(t, "v"), readlines(`git -C $ROOT tag --sort=version:refname`))
+    isempty(tags) ? error("no `v*` git tag found in $ROOT; pass the release tag explicitly") : last(tags)
+end
+
+const SMALL_FILE = joinpath(ROOT, "test", "data", "books.xml")
+const SMALL_XML = read(SMALL_FILE, String)
+
+# Generate medium file if needed
+include(joinpath(ROOT, "benchmarks", "XMarkGenerator.jl"))
+using .XMarkGenerator
+const MEDIUM_FILE = joinpath(ROOT, "benchmarks", "data", "xmark.xml")
+if !isfile(MEDIUM_FILE)
+ mkpath(dirname(MEDIUM_FILE))
+ @info "Generating XMark benchmark XML..."
+ generate_xmark(MEDIUM_FILE, 1.0)
+end
+const MEDIUM_XML = read(MEDIUM_FILE, String)
+
+#-----------------------------------------------------------------------------# Helpers
+function _collect_tags!(out, node)
+    for c in XML.children(node)
+        XML.nodetype(c) === XML.Element || continue  # only element children are recorded and descended into
+        push!(out, XML.tag(c))
+        _collect_tags!(out, c)
+    end
+    return nothing
+end
+
+function bench_collect_tags(node)
+    tags = String[]  # filled in pre-order by the recursive helper
+    _collect_tags!(tags, node)
+    return tags
+end
+
+#-----------------------------------------------------------------------------# Run dev benchmarks
+println("="^60)
+println(" XML.jl Benchmark Comparison")
+println(" Current (dev) vs $RELEASE_TAG")
+println("="^60)
+println()
+
+print("Running dev benchmarks...")
+flush(stdout)
+
+using XML
+
+dev_results = Dict{String, BenchmarkTools.Trial}()
+
+const SSNode = Node{SubString{String}}
+
+dev_small = parse(SMALL_XML, Node)
+dev_small_ss = parse(SMALL_XML, SSNode)
+dev_medium = parse(MEDIUM_XML, Node)
+dev_medium_ss = parse(MEDIUM_XML, SSNode)
+
+dev_results["Parse (small), String"] = @benchmark parse($SMALL_XML, Node)
+dev_results["Parse (small), SubString"] = @benchmark parse($SMALL_XML, SSNode)
+dev_results["Parse (medium), String"] = @benchmark parse($MEDIUM_XML, Node)
+dev_results["Parse (medium), SubString"] = @benchmark parse($MEDIUM_XML, SSNode)
+dev_results["Write (small)"] = @benchmark XML.write($dev_small)
+dev_results["Write (medium)"] = @benchmark XML.write($dev_medium)
+dev_results["Read file (medium), String"] = @benchmark read($MEDIUM_FILE, Node)
+dev_results["Read file (medium), SubString"] = @benchmark parse(read($MEDIUM_FILE, String), SSNode)
+dev_results["Collect tags (small), String"] = @benchmark bench_collect_tags($dev_small)
+dev_results["Collect tags (small), SubString"] = @benchmark bench_collect_tags($dev_small_ss)
+dev_results["Collect tags (medium), String"] = @benchmark bench_collect_tags($dev_medium)
+dev_results["Collect tags (medium), SubString"] = @benchmark bench_collect_tags($dev_medium_ss)
+
+println(" done")
+
+#-----------------------------------------------------------------------------# Run release benchmarks via temp worktree + separate process
+print("Setting up $RELEASE_TAG worktree...")
+flush(stdout)
+
+worktree_dir = mktempdir()
+run(pipeline(`git -C $ROOT worktree add $worktree_dir $RELEASE_TAG`, stdout=devnull, stderr=devnull))
+println(" done")
+
+release_results_file = joinpath(worktree_dir, "_results.jls")
+
+release_script = joinpath(worktree_dir, "_bench.jl")
+write(release_script, """
+using Pkg
+Pkg.activate(; temp=true)
+Pkg.develop(path=$(repr(worktree_dir)))
+Pkg.add("BenchmarkTools")
+Pkg.add("Serialization")
+
+using BenchmarkTools, Serialization, XML
+
+BenchmarkTools.DEFAULT_PARAMETERS.seconds = 5
+BenchmarkTools.DEFAULT_PARAMETERS.samples = 10000
+
+small_xml = read($(repr(SMALL_FILE)), String)
+medium_xml = read($(repr(MEDIUM_FILE)), String)
+results = Dict{String, BenchmarkTools.Trial}()
+
+results["Parse (small)"] = @benchmark parse(\$small_xml, Node)
+
+try
+ SSNode = Node{SubString{String}}
+ results["Parse (small, SS)"] = @benchmark parse(\$small_xml, SSNode)
+ results["Parse (medium, SS)"] = @benchmark parse(\$medium_xml, SSNode)
+catch
+end
+
+results["Parse (medium)"] = @benchmark parse(\$medium_xml, Node)
+
+small_node = parse(small_xml, Node)
+medium_node = parse(medium_xml, Node)
+results["Write (small)"] = @benchmark XML.write(\$small_node)
+results["Write (medium)"] = @benchmark XML.write(\$medium_node)
+results["Read file (medium)"] = @benchmark read($(repr(MEDIUM_FILE)), Node)
+
+function _collect_tags!(out, node)
+ for c in XML.children(node)
+ if XML.nodetype(c) === XML.Element
+ push!(out, XML.tag(c))
+ _collect_tags!(out, c)
+ end
+ end
+end
+function bench_collect_tags(node)
+ out = String[]
+ _collect_tags!(out, node)
+ out
+end
+results["Collect tags (small)"] = @benchmark bench_collect_tags(\$small_node)
+results["Collect tags (medium)"] = @benchmark bench_collect_tags(\$medium_node)
+
+serialize($(repr(release_results_file)), results)
+""")
+print("Running $RELEASE_TAG benchmarks..."); flush(stdout)
+# Run the release script with the same Julia binary as this process; `finally` removes the worktree even if the run fails.
+release_results = try
+    run(pipeline(`$(Base.julia_cmd()) $release_script`, stdout=devnull, stderr=devnull))
+    deserialize(release_results_file)
+finally
+    run(pipeline(`git -C $ROOT worktree remove --force $worktree_dir`, stdout=devnull, stderr=devnull))
+end
+println(" done")
+
+#-----------------------------------------------------------------------------# Write compare_results.md
+_fmt_ms(t) = "$(round(t; sigdigits=3)) ms"  # three significant digits followed by the unit
+
+function _compare_indicator(dev_ms, rel_ms)
+    delta = (dev_ms / rel_ms - 1) * 100  # signed % change of dev relative to the release; negative means dev is faster
+    pct = abs(round(delta, digits=1))
+    delta > 5 ? "($(pct)% slower)" : delta < -5 ? "($(pct)% faster)" : "(~same)"
+end
+
+groups = [
+ ("Parse (small)", "Parse (small)", ["Parse (small), String", "Parse (small), SubString"]),
+ ("Parse (medium)", "Parse (medium)", ["Parse (medium), String", "Parse (medium), SubString"]),
+ ("Write (small)", "Write (small)", ["Write (small)"]),
+ ("Write (medium)", "Write (medium)", ["Write (medium)"]),
+ ("Read file (medium)", "Read file (medium)", ["Read file (medium), String", "Read file (medium), SubString"]),
+ ("Collect tags (small)", "Collect tags (small)", ["Collect tags (small), String", "Collect tags (small), SubString"]),
+ ("Collect tags (medium)","Collect tags (medium)", ["Collect tags (medium), String", "Collect tags (medium), SubString"]),
+]
+
+# Render one block per benchmark group: the release timing first, then each dev variant.
+outfile = joinpath(@__DIR__, "compare_results.md")
+open(outfile, "w") do io
+    println(io, "# XML.jl Benchmark Comparison: dev vs $RELEASE_TAG\n")
+    println(io, "```")
+    for (title, rel_key, dev_keys) in groups
+        # Release median in ms, or `nothing` when the release run has no entry for this group
+        rel_ms = haskey(release_results, rel_key) ? median(release_results[rel_key]).time / 1e6 : nothing
+        measured = filter(k -> haskey(dev_results, k), dev_keys)
+        # Skip the group only when neither the release nor the dev run measured it
+        isempty(measured) && isnothing(rel_ms) && continue
+
+        println(io, title)
+        isnothing(rel_ms) || println(io, "\t", rpad(RELEASE_TAG, 16), lpad(_fmt_ms(rel_ms), 12))
+
+        for dk in measured
+            dev_ms = median(dev_results[dk]).time / 1e6
+            # The row label is the part after ", " in the key; keys without it are shown as "dev"
+            label = occursin(", ", dk) ? split(dk, ", "; limit=2)[2] : "dev"
+            # The relative-change suffix needs a release baseline; without one only the timing is printed
+            suffix = isnothing(rel_ms) ? "" : " " * _compare_indicator(dev_ms, rel_ms)
+            println(io, "\t", rpad(label, 16), lpad(_fmt_ms(dev_ms), 12), suffix)
+        end
+        println(io)
+    end
+    println(io, "```")
+
+    # Append the `versioninfo()` output as a commented-out block
+    println(io, "\n```julia")
+    println(io, "versioninfo()")
+    info = sprint(InteractiveUtils.versioninfo)
+    for line in eachline(IOBuffer(info))
+        println(io, "# ", line)
+    end
+    println(io, "```")
+end
+
+println("Results written to $outfile")
diff --git a/benchmarks/compare_results.md b/benchmarks/compare_results.md
new file mode 100644
index 0000000..135035e
--- /dev/null
+++ b/benchmarks/compare_results.md
@@ -0,0 +1,54 @@
+# XML.jl Benchmark Comparison: dev vs v0.3.8
+
+```
+Parse (small)
+ v0.3.8 0.116 ms
+ String 0.0351 ms (69.7% faster)
+ SubString 0.0309 ms (73.4% faster)
+
+Parse (medium)
+ v0.3.8 736.0 ms
+ String 170.0 ms (76.9% faster)
+ SubString 162.0 ms (77.9% faster)
+
+Write (small)
+ v0.3.8 0.0257 ms
+ dev 0.0212 ms (17.7% faster)
+
+Write (medium)
+ v0.3.8 154.0 ms
+ dev 84.6 ms (44.9% faster)
+
+Read file (medium)
+ v0.3.8 714.0 ms
+ String 177.0 ms (75.2% faster)
+ SubString 171.0 ms (76.1% faster)
+
+Collect tags (small)
+ v0.3.8 0.000527 ms
+ String 0.000614 ms (16.5% slower)
+ SubString 0.00177 ms (235.1% slower)
+
+Collect tags (medium)
+ v0.3.8 25.0 ms
+ String 10.9 ms (56.4% faster)
+ SubString 16.0 ms (36.0% faster)
+
+```
+
+```julia
+versioninfo()
+# Julia Version 1.12.5
+# Commit 5fe89b8ddc1 (2026-02-09 16:05 UTC)
+# Build Info:
+# Official https://julialang.org release
+# Platform Info:
+# OS: macOS (arm64-apple-darwin24.0.0)
+# CPU: 10 × Apple M1 Pro
+# WORD_SIZE: 64
+# LLVM: libLLVM-18.1.7 (ORCJIT, apple-m1)
+# GC: Built with stock GC
+# Threads: 8 default, 1 interactive, 8 GC (on 8 virtual cores)
+# Environment:
+# JULIA_NUM_THREADS = auto
+```
diff --git a/benchmarks/dict_benchmarks.jl b/benchmarks/dict_benchmarks.jl
new file mode 100644
index 0000000..7dd90a3
--- /dev/null
+++ b/benchmarks/dict_benchmarks.jl
@@ -0,0 +1,71 @@
+using XML
+using BenchmarkTools
+
+BenchmarkTools.DEFAULT_PARAMETERS.seconds = 5
+
+#-----------------------------------------------------------------------------# Setup
+sizes = [2, 5, 10, 20]
+
+function make_xml(n::Int)
+    attrs = join((" attr$i=\"value$i\"" for i in 1:n))
+    "<el" * attrs * "/>"  # one self-closing element carrying the `n` attributes, e.g. <el attr1="value1"/>
+end
+
+function make_pairs(n::Int)
+    collect(Pair{String,String}, ("attr$k" => "value$k" for k in 1:n))  # attr1 => value1, …, attrN => valueN
+end
+# Shorthand for `BenchmarkTools.prettytime`, used to format the median times printed in the tables below.
+pt(t) = BenchmarkTools.prettytime(t)
+
+function printrow(n, op, t_dict, t_attr)
+    # Time saved by `Attributes` relative to `Dict`, in percent (positive means `Attributes` is faster).
+    pct = round(100 * (t_dict - t_attr) / t_dict, digits=1)
+    label = pct > 0 ? "$(pct)% faster" : pct < 0 ? "$(-pct)% slower" : "~same"  # a zero difference used to print "-0.0% slower"
+    println(rpad("$n attrs", 10), " | ", rpad(op, 22), " | ", rpad("Dict $(pt(t_dict))", 22), " | ",
+        rpad("Attributes $(pt(t_attr))", 26), " | ", label)
+end
+
+#-----------------------------------------------------------------------------# Benchmarks
+println("=" ^ 110)
+println(" Attributes vs Dict Benchmarks")
+println("=" ^ 110)
+println(rpad("Size", 10), " | ", rpad("Operation", 22), " | ",
+ rpad("Dict", 22), " | ", rpad("Attributes", 26), " | Change")
+println("-" ^ 110)
+
+for n in sizes
+ pairs = make_pairs(n)
+ d = Dict(pairs)
+ a = XML.Attributes(pairs)
+ key_mid = "attr$(n ÷ 2 + 1)"
+ key_last = "attr$n"
+ # Each entry is (label, Dict thunk, Attributes thunk); a thunk runs its `@benchmark` only when it is called in the loop below.
+ tests = [
+ ("construct", () -> @benchmark(Dict($pairs)), () -> @benchmark(XML.Attributes($pairs))),
+ ("getindex [mid]", () -> @benchmark($d[$key_mid]), () -> @benchmark($a[$key_mid])),
+ ("getindex [last]", () -> @benchmark($d[$key_last]), () -> @benchmark($a[$key_last])),
+ ("get [miss]", () -> @benchmark(get($d, "nope", nothing)), () -> @benchmark(get($a, "nope", nothing))),
+ ("haskey [hit]", () -> @benchmark(haskey($d, $key_mid)), () -> @benchmark(haskey($a, $key_mid))),
+ ("keys", () -> @benchmark(collect(keys($d))), () -> @benchmark(keys($a))),
+ ("iterate", () -> @benchmark(sum(length(v) for (_,v) in $d)), () -> @benchmark(sum(length(v) for (_,v) in $a))),
+ ]
+ # Run both variants of every operation and print one row comparing their median times.
+ for (op, bench_dict, bench_attr) in tests
+ t_dict = median(bench_dict()).time
+ t_attr = median(bench_attr()).time
+ printrow(n, op, t_dict, t_attr)
+ end
+ println("-" ^ 110)
+end
+
+#-----------------------------------------------------------------------------# End-to-end: attributes() call on parsed Node
+println()
+println(rpad("Size", 10), " | ", rpad("Operation", 22), " | Time")
+println("-" ^ 50)
+for n in sizes
+ doc = parse(make_xml(n), Node)
+ el = doc[1]
+ t = median(@benchmark(attributes($el))).time
+ println(rpad("$n attrs", 10), " | ", rpad("attributes(node)", 22), " | ", pt(t))
+end
+println()
diff --git a/benchmarks/suite.jl b/benchmarks/suite.jl
deleted file mode 100644
index e06dc61..0000000
--- a/benchmarks/suite.jl
+++ /dev/null
@@ -1,74 +0,0 @@
-using Pkg
-Pkg.activate(@__DIR__)
-
-using XML
-using EzXML: EzXML
-using XMLDict: XMLDict
-using BenchmarkTools
-using DataFrames
-using UnicodePlots
-using OrderedCollections: OrderedDict
-
-
-BenchmarkTools.DEFAULT_PARAMETERS.seconds = 10
-BenchmarkTools.DEFAULT_PARAMETERS.samples = 20000
-
-
-# nasa.xml was downloaded from:
-# http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/www/repository.html#nasa
-file = joinpath(@__DIR__, "nasa.xml")
-
-df = DataFrame(kind=String[], name=String[], bench=BenchmarkTools.Trial[])
-
-macro add_benchmark(kind, name, expr...)
- esc(:(let
- @info string($kind, " - ", $name)
- bench = @benchmark $(expr...)
- push!(df, (; kind=$kind, name=$name, bench))
- end))
-end
-
-#-----------------------------------------------------------------------------# Write
-@add_benchmark "Write" "XML.write" XML.write($(tempname()), o) setup = (o = read(file, Node))
-@add_benchmark "Write" "EzXML.writexml" EzXML.write($(tempname()), o) setup = (o = EzXML.readxml(file))
-
-#-----------------------------------------------------------------------------# Read
-@add_benchmark "Read" "XML.LazyNode" read($file, LazyNode)
-@add_benchmark "Read" "XML.Node" read($file, Node)
-@add_benchmark "Read" "EzXML.readxml" EzXML.readxml($file)
-@add_benchmark "Read" "XMLDict.xml_dict" XMLDict.xml_dict(read($file, String))
-
-#-----------------------------------------------------------------------------# Lazy Iteration
-@add_benchmark "Lazy Iteration" "LazyNode" for x in read($file, LazyNode); end
-@add_benchmark "Lazy Iteration" "EzXML.StreamReader" (reader = open(EzXML.StreamReader, $file); for x in reader; end; close(reader))
-
-#-----------------------------------------------------------------------------# Lazy Iteration: Collect Tags
-@add_benchmark "Collect Tags" "LazyNode" [tag(x) for x in o] setup = (o = read(file, LazyNode))
-@add_benchmark "Collect Tags" "EzXML.StreamReader" [r.name for x in r if x == EzXML.READER_ELEMENT] setup=(r=open(EzXML.StreamReader, file)) teardown=(close(r))
-
-function get_tags(o::EzXML.Node)
- out = String[]
- for node in EzXML.eachelement(o)
- push!(out, node.name)
- for tag in get_tags(node)
- push!(out, tag)
- end
- end
- out
-end
-@add_benchmark "Collect Tags" "EzXML.readxml" get_tags(o.root) setup=(o = EzXML.readxml(file))
-
-
-#-----------------------------------------------------------------------------# Plots
-function plot(df, kind)
- g = groupby(df, :kind)
- sub = g[(;kind)]
- x = map(row -> "$(row.name)", eachrow(sub))
- y = map(x -> median(x).time / 1000^2, sub.bench)
- display(barplot(x, y, title = "$kind Time (ms)", border=:none, width=50))
-end
-
-plot(df, "Read")
-plot(df, "Write")
-plot(df, "Lazy Iteration")
-plot(df, "Collect Tags")
diff --git a/src/XML.jl b/src/XML.jl
index 273bfda..f36905f 100644
--- a/src/XML.jl
+++ b/src/XML.jl
@@ -1,31 +1,61 @@
module XML
-using Mmap
-using OrderedCollections: OrderedDict
-
export
- # Core Types:
- Node, LazyNode,
- # Interface:
- children, nodetype, tag, attributes, value, is_simple, simplevalue, simple_value,
- # Extended Interface for LazyNode:
- parent, depth, next, prev
+ Node, LazyNode, NodeType, Attributes,
+ CData, Comment, Declaration, Document, DTD, Element, ProcessingInstruction, Text,
+ nodetype, tag, attributes, value, children,
+ is_simple, simple_value,
+ depth, siblings,
+ xpath,
+ h
+
+include("XMLTokenizer.jl")
+using .XMLTokenizer:
+ tokenize, tag_name, attr_value, pi_target, TokenKind, Token, Tokenizer, TokenizerState,
+ TOKEN_TEXT, TOKEN_OPEN_TAG, TOKEN_CLOSE_TAG, TOKEN_TAG_CLOSE, TOKEN_SELF_CLOSE,
+ TOKEN_ATTR_NAME, TOKEN_ATTR_VALUE,
+ TOKEN_CDATA_OPEN, TOKEN_CDATA_CONTENT, TOKEN_CDATA_CLOSE,
+ TOKEN_COMMENT_OPEN, TOKEN_COMMENT_CONTENT, TOKEN_COMMENT_CLOSE,
+ TOKEN_PI_OPEN, TOKEN_PI_CONTENT, TOKEN_PI_CLOSE,
+ TOKEN_XML_DECL_OPEN, TOKEN_XML_DECL_CLOSE,
+ TOKEN_DOCTYPE_OPEN, TOKEN_DOCTYPE_CONTENT, TOKEN_DOCTYPE_CLOSE
#-----------------------------------------------------------------------------# escape/unescape
-const escape_chars = ('&' => "&", '<' => "<", '>' => ">", "'" => "'", '"' => """)
-function unescape(x::AbstractString)
- result = x
- for (pat, r) in reverse.(escape_chars)
- result = replace(result, pat => r)
- end
- return result
+# Replacement table used by `escape`: each special character maps to its predefined entity.
+const escape_chars = ('&' => "&amp;", '<' => "&lt;", '>' => "&gt;", '\'' => "&apos;", '"' => "&quot;")
+"""
+    escape(x::AbstractString) -> String
+
+Escape the five characters that have predefined XML entities: `&` → `&amp;`, `<` → `&lt;`, `>` → `&gt;`, `'` → `&apos;`, `"` → `&quot;`.
+
+!!! note "Changed in v0.4"
+    `escape` is no longer idempotent. In previous versions, already-escaped sequences like
+    `&amp;` were left untouched. Now every `&` is escaped, so `escape("&amp;")` produces
+    `"&amp;amp;"`. Call `escape` only on raw, unescaped text.
+"""
+escape(x::AbstractString) = replace(x, escape_chars...)
+
+# Decode one numeric character reference into the character it names. `ref` is the whole reference:
+# decimal (`&#233;` → é) or hex (`&#xE9;` → é). A reference that cannot be decoded is returned unchanged.
+function _unescape_charref(ref::AbstractString)
+    hex = length(ref) > 3 && ref[3] in ('x', 'X')
+    payload = SubString(ref, hex ? 4 : 3, length(ref) - 1)
+    codepoint = tryparse(UInt32, payload; base = hex ? 16 : 10)
+    (isnothing(codepoint) || !isvalid(Char, codepoint)) ? ref : string(Char(codepoint))
end
-function escape(x::String)
- result = replace(x, r"&(?!amp;|quot;|apos;|gt;|lt;)" => "&")
- for (pat, r) in escape_chars[2:end]
- result = replace(result, pat => r)
- end
- return result
+
+"""
+    unescape(x::AbstractString) -> String
+
+Unescape XML entities in `x`: the five predefined entities (`&amp;` `&lt;` `&gt;` `&apos;`
+`&quot;`) and numeric character references (`&#123;`, `&#xAB;`). Each reference is processed
+exactly once (no double-unescaping).
+"""
+function unescape(x::AbstractString)
+    occursin('&', x) || return string(x)
+    s = string(x)
+    # One simultaneous pass over all patterns, so text produced by a replacement is never rescanned (`&#38;lt;` stays `&lt;`).
+    replace(s, r"&#[xX]?[0-9a-fA-F]+;" => _unescape_charref, "&lt;" => "<", "&gt;" => ">", "&apos;" => "'", "&quot;" => "\"", "&amp;" => "&")
end
#-----------------------------------------------------------------------------# NodeType
@@ -34,9 +64,9 @@ end
- Document # prolog & root Element
- DTD #
- Declaration #
- - ProcessingInstruction #
+ - ProcessingInstruction #
- Comment #
- - CData #
+ - CData #
- Element # children...
- Text # text
@@ -45,381 +75,952 @@ NodeTypes can be used to construct XML.Nodes:
Document(children...)
DTD(value)
Declaration(; attributes)
- ProcessingInstruction(tag, attributes)
+ ProcessingInstruction(tag, content)
Comment(text)
CData(text)
Element(tag, children...; attributes)
Text(text)
"""
-@enum(NodeType, CData, Comment, Declaration, Document, DTD, Element, ProcessingInstruction, Text)
+@enum NodeType::UInt8 CData Comment Declaration Document DTD Element ProcessingInstruction Text
+#-----------------------------------------------------------------------------# Attributes
+"""
+ Attributes{S} <: AbstractDict{S, S}
-#-----------------------------------------------------------------------------# includes
-include("raw.jl")
-include("dtd.jl")
+An ordered dictionary of XML attributes backed by a `Vector{Pair{S, S}}`.
+Returned by [`attributes`](@ref). Preserves insertion order and supports the
+full `AbstractDict` interface (`get`, `haskey`, `keys`, `values`, iteration, etc.).
+"""
+struct Attributes{S} <: AbstractDict{S, S}
+ entries::Vector{Pair{S, S}}
+end
-abstract type AbstractXMLNode end
+Base.length(a::Attributes) = length(a.entries)
+Base.iterate(a::Attributes, state...) = iterate(a.entries, state...)
-#-----------------------------------------------------------------------------# LazyNode
-"""
- LazyNode(file::AbstractString)
- LazyNode(data::XML.Raw)
+function Base.getindex(a::Attributes, key::AbstractString)
+    # Linear scan in insertion order; the first entry whose name equals `key` wins.
+    hit = findfirst(entry -> first(entry) == key, a.entries)
+    hit === nothing && throw(KeyError(key))
+    return last(a.entries[hit])
+end
-A Lazy representation of an XML node.
-"""
-mutable struct LazyNode <: AbstractXMLNode
- raw::Raw
- tag::Union{Nothing, String}
- attributes::Union{Nothing, OrderedDict{String, String}}
- value::Union{Nothing, String}
+function Base.get(a::Attributes, key::AbstractString, default)
+ for (k, v) in a.entries
+ k == key && return v
+ end
+ default
end
-LazyNode(raw::Raw) = LazyNode(raw, nothing, nothing, nothing)
-function Base.getproperty(o::LazyNode, x::Symbol)
- x === :raw && return getfield(o, :raw)
- x === :nodetype && return nodetype(o.raw)
- x === :tag && return isnothing(getfield(o, x)) ? setfield!(o, x, tag(o.raw)) : getfield(o, x)
- x === :attributes && return isnothing(getfield(o, x)) ? setfield!(o, x, attributes(o.raw)) : getfield(o, x)
- x === :value && return isnothing(getfield(o, x)) ? setfield!(o, x, value(o.raw)) : getfield(o, x)
- x === :depth && return depth(o.raw)
- x === :children && return LazyNode.(children(o.raw))
- error("type LazyNode has no field $(x)")
+function Base.haskey(a::Attributes, key::AbstractString)
+ any(p -> first(p) == key, a.entries)
+end
+
+Base.keys(a::Attributes) = first.(a.entries)
+Base.values(a::Attributes) = last.(a.entries)
+
+#-----------------------------------------------------------------------------# Node
+struct Node{S}
+ nodetype::NodeType
+ tag::Union{Nothing, S}
+ attributes::Union{Nothing, Vector{Pair{S, S}}}
+ value::Union{Nothing, S}
+ children::Union{Nothing, Vector{Node{S}}}
+
+ function Node{S}(nodetype::NodeType, tag, attributes, value, children) where {S}
+ if nodetype in (Text, Comment, CData, DTD)
+ isnothing(tag) && isnothing(attributes) && !isnothing(value) && isnothing(children) ||
+ error("$nodetype nodes only accept a value.")
+ elseif nodetype === Element
+ !isnothing(tag) && isnothing(value) ||
+ error("Element nodes require a tag and no value.")
+ elseif nodetype === Declaration
+ isnothing(tag) && isnothing(value) && isnothing(children) ||
+ error("Declaration nodes only accept attributes.")
+ elseif nodetype === ProcessingInstruction
+ !isnothing(tag) && isnothing(attributes) && isnothing(children) ||
+ error("ProcessingInstruction nodes require a tag and only accept a value.")
+ elseif nodetype === Document
+ isnothing(tag) && isnothing(attributes) && isnothing(value) ||
+ error("Document nodes only accept children.")
+ end
+ new{S}(nodetype, tag, attributes, value, children)
+ end
end
-Base.propertynames(o::LazyNode) = (:raw, :nodetype, :tag, :attributes, :value, :depth, :children)
-Base.show(io::IO, o::LazyNode) = _show_node(io, o)
+#-----------------------------------------------------------------------------# interface
+nodetype(o::Node) = o.nodetype
+tag(o::Node) = o.tag
+
+"""
+ attributes(node::Node) -> Union{Nothing, Attributes{String}}
+
+Return the attributes of an `Element` or `Declaration` node as an [`Attributes`](@ref) dict,
+or `nothing` if the node has no attributes.
-Base.read(io::IO, ::Type{LazyNode}) = LazyNode(read(io, Raw))
-Base.read(filename::AbstractString, ::Type{LazyNode}) = LazyNode(read(filename, Raw))
-Base.parse(x::AbstractString, ::Type{LazyNode}) = LazyNode(parse(x, Raw))
+!!! note "Changed in v0.4"
+ In previous versions, `attributes` returned an `OrderedDict` from OrderedCollections.jl.
+ It now returns an [`Attributes`](@ref), an ordered `AbstractDict` backed by a
+ `Vector{Pair}`.
+"""
+attributes(o::Node) = isnothing(o.attributes) ? nothing : Attributes(o.attributes)
+
+value(o::Node) = o.value
+children(o::Node) = something(o.children, ())
+
+is_simple(o::Node) = o.nodetype === Element &&
+ (isnothing(o.attributes) || isempty(o.attributes)) &&
+ !isnothing(o.children) && length(o.children) == 1 &&
+ o.children[1].nodetype in (Text, CData)
+
+simple_value(o::Node) = is_simple(o) ? o.children[1].value :
+ error("`simple_value` is only defined for simple nodes.")
+
+#-----------------------------------------------------------------------------# tree navigation
+
+"""
+ parent(child::Node, root::Node) -> Node
-children(o::LazyNode) = LazyNode.(children(o.raw))
-parent(o::LazyNode) = LazyNode(parent(o.raw))
-depth(o::LazyNode) = depth(o.raw)
+Return the parent of `child` within the tree rooted at `root`.
-Base.IteratorSize(::Type{LazyNode}) = Base.SizeUnknown()
-Base.eltype(::Type{LazyNode}) = LazyNode
+Since `Node` does not store parent pointers, this performs a tree search from `root`.
+Throws an error if `child` is not found or if `child === root`.
+"""
+function Base.parent(child::Node, root::Node)
+ child === root && error("Root node has no parent.")
+ result = _find_parent(child, root)
+ isnothing(result) && error("Node not found in tree.")
+ result
+end
-function Base.iterate(o::LazyNode, state=o)
- n = next(state)
- return isnothing(n) ? nothing : (n, n)
+function _find_parent(child::Node, current::Node)  # depth-first, pre-order search for the node holding `child`
+    for kid in children(current)
+        kid === child && return current  # NOTE(review): `Node` is an immutable struct, so `===` presumably also matches an equal-content leaf — confirm
+        found = _find_parent(child, kid)
+        found === nothing || return found
+    end
+    return nothing  # `child` does not occur below `current`
end
-function next(o::LazyNode)
- n = next(o.raw)
- isnothing(n) && return nothing
- n.type === RawElementClose ? next(LazyNode(n)) : LazyNode(n)
+"""
+ depth(child::Node, root::Node) -> Int
+
+Return the depth of `child` within the tree rooted at `root` (root has depth 0).
+
+Since `Node` does not store parent pointers, this performs a tree search from `root`.
+Throws an error if `child` is not found in the tree.
+"""
+function depth(child::Node, root::Node)
+ child === root && return 0
+ result = _find_depth(child, root, 0)
+ isnothing(result) && error("Node not found in tree.")
+ result
end
-function prev(o::LazyNode)
- n = prev(o.raw)
- isnothing(n) && return nothing
- n.type === RawElementClose ? prev(LazyNode(n)) : LazyNode(n)
+
+function _find_depth(child::Node, current::Node, d::Int)  # `d` is the depth of `current`
+    for kid in children(current)
+        kid === child && return d + 1  # direct child: one level below `current`
+        deeper = _find_depth(child, kid, d + 1)
+        deeper === nothing || return deeper
+    end
+    return nothing  # not found in this subtree
end
-#-----------------------------------------------------------------------------# Node
"""
- Node(nodetype, tag, attributes, value, children)
- Node(node::Node; kw...) # copy node with keyword overrides
- Node(node::LazyNode) # un-lazy the LazyNode
+ siblings(child::Node, root::Node) -> Vector{Node}
+
+Return the siblings of `child` (other children of the same parent) within the tree rooted
+at `root`. The returned vector does not include `child` itself.
-A representation of an XML DOM node. For simpler construction, use `(::NodeType)(args...)`
+Throws an error if `child` is the root or is not found in the tree.
"""
-struct Node <: AbstractXMLNode
- nodetype::NodeType
- tag::Union{Nothing, String}
- attributes::Union{Nothing, OrderedDict{String, String}}
- value::Union{Nothing, String}
- children::Union{Nothing, Vector{Node}}
-
- function Node(nodetype::NodeType, tag=nothing, attributes=nothing, value=nothing, children=nothing)
- new(nodetype,
- isnothing(tag) ? nothing : string(tag),
- isnothing(attributes) ? nothing : OrderedDict(string(k) => string(v) for (k, v) in pairs(attributes)),
- isnothing(value) ? nothing : string(value),
- isnothing(children) ? nothing :
- children isa Node ? [children] :
- children isa Vector{Node} ? children :
- children isa Vector ? map(Node, children) :
- children isa Tuple ? map(Node, collect(children)) :
- [Node(children)]
- )
+function siblings(child::Node, root::Node)
+ p = parent(child, root)
+ [c for c in children(p) if c !== child]
+end
+
+include("xpath.jl")
+include("lazynode.jl")
+
+
+#-----------------------------------------------------------------------------# _to_node
+_to_node(n::Node{String}) = n
+_to_node(n::Node) = throw(ArgumentError("Expected Node{String}, got $(typeof(n))"))
+_to_node(x) = Node{String}(Text, nothing, nothing, string(x), nothing)
+
+#-----------------------------------------------------------------------------# NodeType constructors
+function (T::NodeType)(args...; attrs...)  # e.g. Element("a", "text"; href="x") builds a Node{String}
+    S = String
+    if T in (Text, Comment, CData, DTD)
+        length(args) == 1 || error("$T nodes require exactly one value argument.")
+        !isempty(attrs) && error("$T nodes do not accept attributes.")
+        Node{S}(T, nothing, nothing, string(only(args)), nothing)
+    elseif T === Element
+        isempty(args) && error("Element nodes require at least a tag.")
+        t = string(first(args))
+        a = Pair{S,S}[string(k) => string(v) for (k, v) in pairs(attrs)]  # `string`, not `String`: values may be numbers/symbols (`String(1)` throws)
+        c = Node{S}[_to_node(x) for x in args[2:end]]  # non-Node children become Text nodes
+        Node{S}(T, t, a, nothing, c)
+    elseif T === Declaration
+        !isempty(args) && error("Declaration nodes only accept keyword attributes.")
+        a = isempty(attrs) ? nothing : Pair{S,S}[string(k) => string(v) for (k, v) in pairs(attrs)]
+        Node{S}(T, nothing, a, nothing, nothing)
+    elseif T === ProcessingInstruction
+        length(args) >= 1 || error("ProcessingInstruction nodes require a target.")
+        length(args) <= 2 || error("ProcessingInstruction nodes accept a target and optional content.")
+        !isempty(attrs) && error("ProcessingInstruction nodes do not accept attributes.")
+        t = string(args[1])
+        v = length(args) == 2 ? string(args[2]) : nothing  # content is optional
+        Node{S}(T, t, nothing, v, nothing)
+    elseif T === Document
+        !isempty(attrs) && error("Document nodes do not accept attributes.")
+        c = Node{S}[_to_node(x) for x in args]
+        Node{S}(T, nothing, nothing, nothing, c)
+    end
end
-function Node(o::Node, x...; kw...)
- attrs = !isnothing(kw) ?
- merge(
- OrderedDict(string(k) => string(v) for (k, v) in pairs(kw)),
- isnothing(o.attributes) ? OrderedDict{String,String}() : o.attributes
- ) :
- o.attributes
- children = isempty(x) ? o.children : vcat(isnothing(o.children) ? [] : o.children, collect(x))
- Node(o.nodetype, o.tag, attrs, o.value, children)
+#-----------------------------------------------------------------------------# equality
+_eq(::Nothing, ::Nothing) = true
+_eq(::Nothing, b) = isempty(b)
+_eq(a, ::Nothing) = isempty(a)
+_eq(a, b) = a == b
+
+# Attribute equality is order-insensitive per XML spec
+function _attrs_eq(a, b)
+    a_empty = isnothing(a) || isempty(a)
+    b_empty = isnothing(b) || isempty(b)
+    a_empty && b_empty && return true
+    (a_empty != b_empty) && return false
+    length(a) != length(b) && return false
+    # Check containment in both directions so a repeated pair on one side cannot mask a
+    # missing one; this keeps `==` symmetric even for hand-built vectors with duplicates.
+    all(p -> p in b, a) || return false
+    all(p -> p in a, b)
end
-function Node(node::LazyNode)
- nodetype = node.nodetype
- tag = node.tag
- attributes = node.attributes
- value = node.value
- c = XML.children(node)
- Node(nodetype, tag, attributes, value, isempty(c) ? nothing : map(Node, c))
+function Base.:(==)(a::Node, b::Node)
+ a.nodetype == b.nodetype &&
+ a.tag == b.tag &&
+ _attrs_eq(a.attributes, b.attributes) &&
+ a.value == b.value &&
+ _eq(a.children, b.children)
end
-Node(data::Raw) = Node(LazyNode(data))
+#-----------------------------------------------------------------------------# indexing
+Base.getindex(o::Node, i::Integer) = children(o)[i]
+Base.getindex(o::Node, ::Colon) = children(o)
+Base.lastindex(o::Node) = lastindex(children(o))
+Base.only(o::Node) = only(children(o))
+Base.length(o::Node) = length(children(o))
-# Anything that's not Vector{UInt8} or a (Lazy)Node is converted to a Text Node
-Node(x) = Node(Text, nothing, nothing, string(x), nothing)
+function Base.get(o::Node, key::AbstractString, default)
+ isnothing(o.attributes) && return default
+ for (k, v) in o.attributes
+ k == key && return v
+ end
+ default
+end
-h(tag::Union{Symbol, String}, children...; kw...) = Node(Element, tag, kw, nothing, children)
-Base.getproperty(::typeof(h), tag::Symbol) = h(tag)
-(o::Node)(children...; kw...) = Node(o, Node.(children)...; kw...)
+const _MISSING_ATTR = gensym(:missing_attr)
-# NOT in-place for Text Nodes
-function escape!(o::Node, warn::Bool=true)
- if o.nodetype == Text
- warn && @warn "escape!() called on a Text Node creates a new node."
- return Text(escape(o.value))
+function Base.getindex(o::Node, key::AbstractString)
+ val = get(o, key, _MISSING_ATTR)
+ val === _MISSING_ATTR && throw(KeyError(key))
+ val
+end
+
+function Base.haskey(o::Node, key::AbstractString)
+ get(o, key, _MISSING_ATTR) !== _MISSING_ATTR
+end
+
+Base.keys(o::Node) = isnothing(o.attributes) ? () : first.(o.attributes)
+
+#-----------------------------------------------------------------------------# mutation
+function Base.setindex!(o::Node, val, i::Integer)
+ isnothing(o.children) && error("Node has no children.")
+ o.children[i] = _to_node(val)
+end
+
+function Base.setindex!(o::Node, val, key::AbstractString)
+ isnothing(o.attributes) && error("Node has no attributes.")
+ v = string(val)
+ for i in eachindex(o.attributes)
+ if first(o.attributes[i]) == key
+ o.attributes[i] = key => v
+ return v
+ end
end
- isnothing(o.children) && return o
- map!(x -> escape!(x, false), o.children, o.children)
- o
+ push!(o.attributes, key => v)
+ v
end
-function unescape!(o::Node, warn::Bool=true)
- if o.nodetype == Text
- warn && @warn "unescape!() called on a Text Node creates a new node."
- return Text(unescape(o.value))
+
+function Base.push!(a::Node, b)
+ isnothing(a.children) && error("Node does not accept children.")
+ push!(a.children, _to_node(b))
+ a
+end
+
+function Base.pushfirst!(a::Node, b)
+ isnothing(a.children) && error("Node does not accept children.")
+ pushfirst!(a.children, _to_node(b))
+ a
+end
+
+#-----------------------------------------------------------------------------# show (REPL)
+function Base.show(io::IO, o::Node)
+ nt = o.nodetype
+ print(io, nt)
+ if nt === Text
+ print(io, ' ', repr(o.value))
+ elseif nt === Element
+ print(io, " <", o.tag)
+ if !isnothing(o.attributes)
+ for (k, v) in o.attributes
+ print(io, ' ', k, '=', '"', v, '"')
+ end
+ end
+ print(io, '>')
+ n = length(children(o))
+ n > 0 && print(io, n == 1 ? " (1 child)" : " ($n children)")
+ elseif nt === DTD
+ print(io, " ')
+ elseif nt === Declaration
+ print(io, " ")
+ elseif nt === ProcessingInstruction
+ print(io, " ", o.tag)
+ !isnothing(o.value) && print(io, ' ', o.value)
+ print(io, "?>")
+ elseif nt === Comment
+ print(io, " ")
+ elseif nt === CData
+ print(io, " ")
+ elseif nt === Document
+ n = length(children(o))
+ n > 0 && print(io, n == 1 ? " (1 child)" : " ($n children)")
+ end
+end
+
+#-----------------------------------------------------------------------------# show (text/xml)
+
+# Write XML-escaped content directly to IO (single pass, no intermediate string)
+function _write_escaped(io::IO, s::String)
+    start = 1                  # first byte of the pending run that needs no escaping
+    i = 1
+    n = ncodeunits(s)
+    @inbounds while i <= n
+        b = codeunit(s, i)     # the five specials are ASCII, so byte scanning is UTF-8 safe
+        esc = if b == UInt8('&'); "&amp;"
+        elseif b == UInt8('<'); "&lt;"
+        elseif b == UInt8('>'); "&gt;"
+        elseif b == UInt8('"'); "&quot;"
+        elseif b == UInt8('\''); "&apos;"
+        else
+            i += 1
+            continue
+        end
+        i > start && GC.@preserve s Base.unsafe_write(io, pointer(s, start), (i - start) % UInt)  # flush the run before the special
+        print(io, esc)
+        i += 1
+        start = i
end
- isnothing(o.children) && return o
- map!(x -> unescape!(x, false), o.children, o.children)
- o
+    start <= n && GC.@preserve s Base.unsafe_write(io, pointer(s, start), (n - start + 1) % UInt)  # flush the tail
+    nothing
end
+# Precomputed runs of 0.._MAX_CACHED_INDENT spaces so common indents allocate nothing
+const _MAX_CACHED_INDENT = 64
+const _INDENT_STRINGS = map(k -> repeat(" ", k), 0:_MAX_CACHED_INDENT)
+@inline function _indent_str(n::Int)
+    # Widths outside the cached range are built on demand.
+    return (0 <= n <= _MAX_CACHED_INDENT) ? (@inbounds _INDENT_STRINGS[n + 1]) : " " ^ n
+end
+
+function _print_attrs(io::IO, attributes)
+ isnothing(attributes) && return
+ for (k, v) in attributes
+ print(io, ' ', k, "=\"")
+ _write_escaped(io, v)
+ print(io, '"')
+ end
+end
-Base.read(filename::AbstractString, ::Type{Node}) = Node(read(filename, Raw))
-Base.read(io::IO, ::Type{Node}) = Node(read(io, Raw))
-Base.parse(x::AbstractString, ::Type{Node}) = Node(parse(x, Raw))
+function _write_xml(io::IO, node::Node, depth::Int=0, indent::Int=2, preserve::Bool=false)
+ pad = preserve ? "" : _indent_str(indent * depth)
+ nt = node.nodetype
+ if nt === Text
+ _write_escaped(io, node.value)
+ elseif nt === Element
+ # Check xml:space on this element
+ child_preserve = preserve
+ if !isnothing(node.attributes)
+ for (k, v) in node.attributes
+ k == "xml:space" && (child_preserve = v == "preserve")
+ end
+ end
+ print(io, pad, '<', node.tag)
+ _print_attrs(io, node.attributes)
+ ch = node.children
+ if isnothing(ch) || isempty(ch)
+ print(io, "/>")
+ elseif length(ch) == 1 && only(ch).nodetype === Text
+ print(io, '>')
+ _write_xml(io, only(ch), 0, 0, child_preserve)
+ print(io, "", node.tag, '>')
+ else
+ child_preserve ? print(io, '>') : println(io, '>')
+ for child in ch
+ _write_xml(io, child, depth + 1, indent, child_preserve)
+ child_preserve || println(io)
+ end
+ print(io, child_preserve ? "" : pad, "", node.tag, '>')
+ end
+ elseif nt === Declaration
+ print(io, pad, "")
+ elseif nt === ProcessingInstruction
+ print(io, pad, "", node.tag)
+ isnothing(node.value) || print(io, ' ', node.value)
+ print(io, "?>")
+ elseif nt === Comment
+ print(io, pad, "")
+ elseif nt === CData
+ print(io, pad, "")
+ elseif nt === DTD
+ print(io, pad, "')
+ elseif nt === Document
+ ch = node.children
+ if !isnothing(ch)
+ for (i, child) in enumerate(ch)
+ _write_xml(io, child, 0, indent, preserve)
+ i < length(ch) && println(io)
+ end
+ end
+ end
+end
-Base.setindex!(o::Node, val, i::Integer) = o.children[i] = Node(val)
-Base.push!(a::Node, b::Node) = push!(a.children, b)
-Base.pushfirst!(a::Node, b::Node) = pushfirst!(a.children, b)
+Base.show(io::IO, ::MIME"text/xml", node::Node) = _write_xml(io, node)
-Base.setindex!(o::Node, val, key::AbstractString) = (o.attributes[key] = string(val))
-Base.getindex(o::Node, val::AbstractString) = o.attributes[val]
-Base.haskey(o::Node, key::AbstractString) = isnothing(o.attributes) ? false : haskey(o.attributes, key)
-Base.keys(o::Node) = isnothing(o.attributes) ? () : keys(o.attributes)
+#-----------------------------------------------------------------------------# write / read
+write(node::Node; indentsize::Int=2) = (io = IOBuffer(); _write_xml(io, node, 0, indentsize); String(take!(io)))
+write(filename::AbstractString, node::Node; kw...) = open(io -> write(io, node; kw...), filename, "w")
+write(io::IO, node::Node; indentsize::Int=2) = _write_xml(io, node, 0, indentsize)
-Base.show(io::IO, o::Node) = _show_node(io, o)
+Base.read(filename::AbstractString, ::Type{Node}) = parse(read(filename, String), Node)
+Base.read(io::IO, ::Type{Node}) = parse(read(io, String), Node)
-#-----------------------------------------------------------------------------# Node Constructors
-function (T::NodeType)(args...; attr...)
- if T === Document
- !isempty(attr) && error("Document nodes do not have attributes.")
- Node(T, nothing, nothing, nothing, args)
- elseif T === DTD
- !isempty(attr) && error("DTD nodes only accept a value.")
- length(args) > 1 && error("DTD nodes only accept a value.")
- Node(T, nothing, nothing, only(args))
- elseif T === Declaration
- !isempty(args) && error("Declaration nodes only accept attributes")
- Node(T, nothing, attr)
- elseif T === ProcessingInstruction
- length(args) == 1 || error("ProcessingInstruction nodes require a tag and attributes.")
- Node(T, only(args), attr)
- elseif T === Comment
- !isempty(attr) && error("Comment nodes do not have attributes.")
- length(args) > 1 && error("Comment nodes only accept a single input.")
- Node(T, nothing, nothing, only(args))
- elseif T === CData
- !isempty(attr) && error("CData nodes do not have attributes.")
- length(args) > 1 && error("CData nodes only accept a single input.")
- Node(T, nothing, nothing, only(args))
- elseif T === Text
- !isempty(attr) && error("Text nodes do not have attributes.")
- length(args) > 1 && error("Text nodes only accept a single input.")
- Node(T, nothing, nothing, only(args))
- elseif T === Element
- tag = first(args)
- Node(T, tag, attr, nothing, args[2:end])
- else
- error("Unreachable reached while trying to create a Node via (::NodeType)(args...; kw...).")
+#-----------------------------------------------------------------------------# parse
+Base.parse(::Type{Node}, xml::AbstractString) = parse(xml, Node)
+
+function Base.parse(xml::AbstractString, ::Type{Node})
+ _parse(String(xml), String, unescape)
+end
+
+function Base.parse(xml::AbstractString, ::Type{Node{SubString{String}}})
+ _parse(String(xml), SubString{String}, identity)
+end
+
+_to(::Type{String}, s::AbstractString) = String(s)
+_to(::Type{SubString{String}}, s::SubString{String}) = s
+
+_nothingify(v::Vector) = isempty(v) ? nothing : v
+
+function _parse(xml::String, ::Type{S}, convert_text::F) where {S, F}
+ tags = S[]
+ attrs_stack = Vector{Pair{S,S}}[]
+ children_stack = Vector{Vector{Node{S}}}()
+ push!(children_stack, Node{S}[])
+
+ pending_attr_name = SubString(xml, 1, 0)
+ decl_attrs = nothing
+ pending_pi_tag = SubString(xml, 1, 0)
+ pending_pi_value = nothing
+ in_close_tag = false
+
+ for token in tokenize(xml)
+ k = token.kind
+
+ if k === TOKEN_TEXT
+ push!(last(children_stack), Node{S}(Text, nothing, nothing, convert_text(token.raw), nothing))
+
+ elseif k === TOKEN_OPEN_TAG
+ push!(tags, _to(S, tag_name(token)))
+ push!(attrs_stack, Pair{S,S}[])
+ push!(children_stack, Node{S}[])
+
+ elseif k === TOKEN_SELF_CLOSE
+ t = pop!(tags)
+ a = pop!(attrs_stack)
+ pop!(children_stack)
+ push!(last(children_stack), Node{S}(Element, t, _nothingify(a), nothing, nothing))
+
+ elseif k === TOKEN_TAG_CLOSE
+ in_close_tag && (in_close_tag = false)
+
+ elseif k === TOKEN_CLOSE_TAG
+ close_name = tag_name(token)
+ isempty(tags) && error("Closing tag $close_name> with no matching open tag.")
+ t = pop!(tags)
+ t == close_name || error("Mismatched tags: expected $t>, got $close_name>.")
+ a = pop!(attrs_stack)
+ c = pop!(children_stack)
+ push!(last(children_stack), Node{S}(Element, t, _nothingify(a), nothing, isempty(c) ? nothing : c))
+ in_close_tag = true
+
+ elseif k === TOKEN_ATTR_NAME
+ pending_attr_name = token.raw
+
+ elseif k === TOKEN_ATTR_VALUE
+ val = convert_text(attr_value(token))
+ name = _to(S, pending_attr_name)
+ if decl_attrs !== nothing
+ any(p -> first(p) == name, decl_attrs) && error("Duplicate attribute: $name")
+ push!(decl_attrs, name => val)
+ elseif !isempty(attrs_stack)
+ any(p -> first(p) == name, last(attrs_stack)) && error("Duplicate attribute: $name")
+ push!(last(attrs_stack), name => val)
+ end
+
+ elseif k === TOKEN_XML_DECL_OPEN
+ decl_attrs = Pair{S,S}[]
+
+ elseif k === TOKEN_XML_DECL_CLOSE
+ a = isempty(decl_attrs) ? nothing : decl_attrs
+ push!(last(children_stack), Node{S}(Declaration, nothing, a, nothing, nothing))
+ decl_attrs = nothing
+
+ elseif k === TOKEN_COMMENT_CONTENT
+ push!(last(children_stack), Node{S}(Comment, nothing, nothing, _to(S, token.raw), nothing))
+
+ elseif k === TOKEN_CDATA_CONTENT
+ push!(last(children_stack), Node{S}(CData, nothing, nothing, _to(S, token.raw), nothing))
+
+ elseif k === TOKEN_DOCTYPE_CONTENT
+ push!(last(children_stack), Node{S}(DTD, nothing, nothing, _to(S, lstrip(token.raw)), nothing))
+
+ elseif k === TOKEN_PI_OPEN
+ pending_pi_tag = pi_target(token)
+ pending_pi_value = nothing
+
+ elseif k === TOKEN_PI_CONTENT
+ content = strip(token.raw)
+ pending_pi_value = isempty(content) ? nothing : _to(S, content)
+
+ elseif k === TOKEN_PI_CLOSE
+ push!(last(children_stack), Node{S}(ProcessingInstruction, _to(S, pending_pi_tag), nothing, pending_pi_value, nothing))
+ end
end
+
+ !isempty(tags) && error("Unclosed tags: $(join(tags, ", "))")
+ doc_children = only(children_stack)
+ Node{S}(Document, nothing, nothing, nothing, isempty(doc_children) ? nothing : doc_children)
end
-#-----------------------------------------------------------------------------# !!! common !!!
-# Everything below here is common to all data structures
+#-----------------------------------------------------------------------------# h (HTML/XML element builder)
+"""
+ h(tag, children...; attrs...)
+ h.tag(children...; attrs...)
+Convenience constructor for `Element` nodes.
-#-----------------------------------------------------------------------------# interface fallbacks
-nodetype(o) = o.nodetype
-tag(o) = o.tag
-attributes(o) = o.attributes
-value(o) = o.value
-children(o::T) where {T} = isnothing(o.children) ? () : o.children
+ h("div", "hello"; class="main") # hello
+ h.div("hello"; class="main") # same thing
+"""
+function h(tag::Union{Symbol, AbstractString}, children...; attrs...)
+    t = String(tag)
+    a = Pair{String,String}[string(k) => string(v) for (k, v) in pairs(attrs)]  # `string`: accepts non-string values such as `id=1`
+    c = Node{String}[_to_node(x) for x in children]  # non-Node children become Text nodes
+    Node{String}(Element, t, a, nothing, c)
end
-depth(o) = missing
-parent(o) = missing
-next(o) = missing
-prev(o) = missing
+Base.getproperty(::typeof(h), tag::Symbol) = h(tag)
-is_simple(o) = nodetype(o) == Element && (isnothing(attributes(o)) || isempty(attributes(o))) &&
- length(children(o)) == 1 && nodetype(only(o)) in (Text, CData)
+function (o::Node)(args...; attrs...)
+ o.nodetype === Element || error("Only Element nodes are callable.")
+ old_children = something(o.children, ())
+ old_attrs = isnothing(o.attributes) ? () : (Symbol(k) => v for (k, v) in o.attributes)
+ h(o.tag, old_children..., args...; old_attrs..., attrs...)
+end
+
+#-----------------------------------------------------------------------------# DTD parsing
+struct ElementDecl
+ name::String
+ content::String # "EMPTY", "ANY", or content model like "(#PCDATA)" or "(a,b,c)*"
+end
-simple_value(o) = is_simple(o) ? value(only(o)) : error("`XML.simple_value` is only defined for simple nodes.")
+struct AttDecl
+ element::String
+ name::String
+ type::String # "CDATA", "ID", "(val1|val2)", "NOTATION (a|b)", etc.
+ default::String # "#REQUIRED", "#IMPLIED", "#FIXED \"val\"", or "\"val\""
+end
-Base.@deprecate_binding simplevalue simple_value
+struct EntityDecl
+ name::String
+ value::Union{Nothing, String} # replacement text (internal entities)
+ external_id::Union{Nothing, String} # "SYSTEM \"uri\"" or "PUBLIC \"pubid\" \"uri\""
+ parameter::Bool
+end
-#-----------------------------------------------------------------------------# nodes_equal
-function nodes_equal(a, b)
- out = XML.tag(a) == XML.tag(b)
- out &= XML.nodetype(a) == XML.nodetype(b)
- out &= XML.attributes(a) == XML.attributes(b)
- out &= XML.value(a) == XML.value(b)
- out &= length(XML.children(a)) == length(XML.children(b))
- out &= all(nodes_equal(ai, bi) for (ai,bi) in zip(XML.children(a), XML.children(b)))
- return out
+struct NotationDecl
+ name::String
+ external_id::String
end
-Base.:(==)(a::AbstractXMLNode, b::AbstractXMLNode) = nodes_equal(a, b)
+struct ParsedDTD
+ root::String
+ system_id::Union{Nothing, String}
+ public_id::Union{Nothing, String}
+ elements::Vector{ElementDecl}
+ attributes::Vector{AttDecl}
+ entities::Vector{EntityDecl}
+ notations::Vector{NotationDecl}
+end
-#-----------------------------------------------------------------------------# parse
-Base.parse(::Type{T}, str::AbstractString) where {T <: AbstractXMLNode} = parse(str, T)
+# DTD parsing helpers
+@inline _dtd_is_name_char(c::Char) =  # ASCII-only name characters: letters, digits, and `_ - . :`
+    c in 'a':'z' || c in 'A':'Z' || c in '0':'9' ||
+    c in ('_', '-', '.', ':')
-#-----------------------------------------------------------------------------# indexing
-Base.getindex(o::Union{Raw, AbstractXMLNode}) = o
-Base.getindex(o::Union{Raw, AbstractXMLNode}, i::Integer) = children(o)[i]
-Base.getindex(o::Union{Raw, AbstractXMLNode}, ::Colon) = children(o)
-Base.lastindex(o::Union{Raw, AbstractXMLNode}) = lastindex(children(o))
-
-Base.only(o::Union{Raw, AbstractXMLNode}) = only(children(o))
-
-Base.length(o::AbstractXMLNode) = length(children(o))
-
-#-----------------------------------------------------------------------------# printing
-function _show_node(io::IO, o)
- printstyled(io, typeof(o), ' '; color=:light_black)
- !ismissing(depth(o)) && printstyled(io, "(depth=", depth(o), ") ", color=:light_black)
- printstyled(io, nodetype(o), ; color=:light_green)
- if o.nodetype === Text
- printstyled(io, ' ', repr(value(o)))
- elseif o.nodetype === Element
- printstyled(io, " <", tag(o), color=:light_cyan)
- _print_attrs(io, o; color=:light_yellow)
- printstyled(io, '>', color=:light_cyan)
- _print_n_children(io, o)
- elseif o.nodetype === DTD
- printstyled(io, " ', color=:light_cyan)
- elseif o.nodetype === Declaration
- printstyled(io, " ", color=:light_cyan)
- elseif o.nodetype === ProcessingInstruction
- printstyled(io, " ", tag(o), color=:light_cyan)
- _print_attrs(io, o; color=:light_yellow)
- printstyled(io, "?>", color=:light_cyan)
- elseif o.nodetype === Comment
- printstyled(io, " ", color=:light_cyan)
- elseif o.nodetype === CData
- printstyled(io, " ", color=:light_cyan)
- elseif o.nodetype === Document
- _print_n_children(io, o)
- elseif o.nodetype === UNKNOWN
- printstyled(io, "Unknown", color=:light_cyan)
- _print_n_children(io, o)
- else
- error("Unreachable reached")
+function _dtd_skip_ws(s, pos)  # return the first index at or after `pos` that does not hold whitespace
+    while pos <= ncodeunits(s) && isspace(s[pos])
+        pos = nextind(s, pos)  # step by character: `isspace` also accepts multi-byte whitespace
end
+    pos
end
-function _print_attrs(io::IO, o; color=:normal)
- attr = attributes(o)
- isnothing(attr) && return nothing
- for (k,v) in attr
- # printstyled(io, ' ', k, '=', '"', v, '"'; color)
- print(io, ' ', k, '=', '"', v, '"')
+function _dtd_read_name(s, pos)
+    first_idx = stop = _dtd_skip_ws(s, pos)  # names begin after any leading whitespace
+    limit = ncodeunits(s)
+    while stop <= limit && _dtd_is_name_char(s[stop])
+        stop += 1
end
+    stop > first_idx || error("Expected name at position $stop in DTD")
+    SubString(s, first_idx, stop - 1), stop
end
-function _print_n_children(io::IO, o::Node)
- n = length(children(o))
- text = n == 0 ? "" : n == 1 ? " (1 child)" : " ($n children)"
- printstyled(io, text, color=:light_black)
-end
-_print_n_children(io::IO, o) = nothing
-
-#-----------------------------------------------------------------------------# write_xml
-write(x; kw...) = (io = IOBuffer(); write(io, x; kw...); String(take!(io)))
-
-write(filename::AbstractString, x; kw...) = open(io -> write(io, x; kw...), filename, "w")
-
-function write(io::IO, x, ctx::Vector{Bool}=[false]; indentsize::Int=2, depth::Int=1)
- indent = ' ' ^ indentsize
- nodetype = XML.nodetype(x)
- tag = XML.tag(x)
- value = XML.value(x)
- children = XML.children(x)
-
- padding = indent ^ max(0, depth - 1)
- !ctx[end] && print(io, padding)
-
- if nodetype === Text
- print(io, value)
-
- elseif nodetype === Element
- push!(ctx, ctx[end])
- update_ctx!(ctx, x)
- print(io, '<', tag)
- _print_attrs(io, x)
- print(io, isempty(children) ? '/' : "", '>')
- if !isempty(children)
- if length(children) == 1 && XML.nodetype(only(children)) === Text
- write(io, only(children), ctx; indentsize=0)
- print(io, "", tag, '>')
- else
- !ctx[end] && println(io)
- foreach(children) do child
- write(io, child, ctx; indentsize, depth=depth + 1)
- !ctx[end] && println(io)
- end
- print(io, !ctx[end] ? padding : "", "", tag, '>')
+
+# Read a quoted literal ("..." or '...') that begins at the first
+# non-whitespace position at or after `pos`.
+#
+# Returns `(value, next_pos)`: `value` is the text between the quotes as a
+# `SubString`, `next_pos` is the index just past the closing quote.  When the
+# closing quote is missing, `value` runs to the end of `s`.
+#
+# Raises an error when no quote character is found at the start position,
+# including when the input ends there.
+#
+# The scan compares raw code units instead of indexing characters.  The quote
+# bytes never occur inside a multi-byte UTF-8 sequence, whereas `s[pos]` throws
+# `StringIndexError` as soon as a byte-wise `pos` lands on a continuation byte.
+function _dtd_read_quoted(s, pos)
+    pos = _dtd_skip_ws(s, pos)
+    n = ncodeunits(s)
+    pos <= n || error("Expected quoted string at position $pos in DTD")
+    q = codeunit(s, pos)
+    (q == UInt8('"') || q == UInt8('\'')) || error("Expected quoted string at position $pos in DTD")
+    pos += 1
+    start = pos
+    while pos <= n && codeunit(s, pos) != q
+        pos += 1
+    end
+    # `prevind` rather than `pos - 1`, so the end index is a valid character
+    # boundary even when the value ends in a multi-byte character.
+    val = SubString(s, start, prevind(s, pos))
+    pos += 1
+    val, pos
+end
+
+function _dtd_read_parens(s, pos)
+ pos = _dtd_skip_ws(s, pos)
+ s[pos] == '(' || error("Expected '(' at position $pos in DTD")
+ depth = 1
+ start = pos
+ pos += 1
+ while pos <= ncodeunits(s) && depth > 0
+ c = s[pos]
+ if c == '('
+ depth += 1
+ elseif c == ')'
+ depth -= 1
+ elseif c == '"' || c == '\''
+ pos += 1
+ while pos <= ncodeunits(s) && s[pos] != c
+ pos += 1
end
end
- pop!(ctx)
+ pos += 1
+ end
+ SubString(s, start, pos - 1), pos
+end
- elseif nodetype === DTD
- print(io, "')
+# Advance from `pos` to just past the next `>` that is not inside a quoted
+# literal, and return that position.
+#
+# When no such `>` exists the returned position is past the end of `s`.
+#
+# Works on code units: `>`, `"` and `'` are single ASCII bytes that never occur
+# inside a multi-byte UTF-8 sequence, so stepping one byte at a time is safe,
+# whereas character indexing (`s[pos]`) would throw `StringIndexError` on the
+# first continuation byte.
+function _dtd_skip_to_close(s, pos)
+    n = ncodeunits(s)
+    while pos <= n
+        b = codeunit(s, pos)
+        b == UInt8('>') && break
+        if b == UInt8('"') || b == UInt8('\'')
+            # Jump to the matching closing quote (or to the end of input).
+            pos += 1
+            while pos <= n && codeunit(s, pos) != b
+                pos += 1
+            end
+        end
+        pos += 1
+    end
+    pos <= n ? pos + 1 : pos
+end
- elseif nodetype === Declaration
- print(io, "")
+# Parse the remainder of an element declaration, starting at the element name.
+#
+# The content model is either a parenthesised group (optionally followed by one
+# of `*`, `+`, `?`) or a bare name.  Returns `(ElementDecl, next_pos)` where
+# `next_pos` is whatever `_dtd_skip_to_close` reports after the declaration.
+function _dtd_parse_element(s, pos)
+    name, pos = _dtd_read_name(s, pos)
+    pos = _dtd_skip_ws(s, pos)
+    # Guard the lookahead: on truncated input fall through to `_dtd_read_name`,
+    # which reports a descriptive error instead of a raw `BoundsError`.
+    if pos <= ncodeunits(s) && s[pos] == '('
+        content, pos = _dtd_read_parens(s, pos)
+        if pos <= ncodeunits(s) && s[pos] in ('*', '+', '?')
+            content = string(content, s[pos])
+            pos += 1
+        end
+    else
+        content, pos = _dtd_read_name(s, pos)
+    end
+    pos = _dtd_skip_to_close(s, pos)
+    ElementDecl(String(name), String(content)), pos
+end
- elseif nodetype === ProcessingInstruction
- print(io, "", tag)
- _print_attrs(io, x)
- print(io, "?>")
+function _dtd_parse_attlist(s, pos)
+ element, pos = _dtd_read_name(s, pos)
+ atts = AttDecl[]
+ while true
+ pos = _dtd_skip_ws(s, pos)
+ (pos > ncodeunits(s) || s[pos] == '>') && break
+
+ name, pos = _dtd_read_name(s, pos)
+ pos = _dtd_skip_ws(s, pos)
+
+ # Attribute type
+ if s[pos] == '('
+ atype, pos = _dtd_read_parens(s, pos)
+ else
+ atype, pos = _dtd_read_name(s, pos)
+ if atype == "NOTATION"
+ pos = _dtd_skip_ws(s, pos)
+ parens, pos = _dtd_read_parens(s, pos)
+ atype = string("NOTATION ", parens)
+ end
+ end
+ pos = _dtd_skip_ws(s, pos)
- elseif nodetype === Comment
- print(io, "")
+ # Default declaration
+ if s[pos] == '#'
+ pos += 1
+ keyword, pos = _dtd_read_name(s, pos)
+ if keyword == "FIXED"
+ pos = _dtd_skip_ws(s, pos)
+ val, pos = _dtd_read_quoted(s, pos)
+ default = string("#FIXED \"", val, "\"")
+ else
+ default = string("#", keyword)
+ end
+ elseif s[pos] == '"' || s[pos] == '\''
+ val, pos = _dtd_read_quoted(s, pos)
+ default = string("\"", val, "\"")
+ else
+ error("Expected default declaration at position $pos in DTD")
+ end
+ push!(atts, AttDecl(String(element), String(name), String(atype), default))
+ end
+ pos <= ncodeunits(s) && s[pos] == '>' && (pos += 1)
+ atts, pos
+end
- elseif nodetype === CData
- print(io, "")
+function _dtd_parse_entity(s, pos)
+ pos = _dtd_skip_ws(s, pos)
+ parameter = false
+ if pos <= ncodeunits(s) && s[pos] == '%'
+ parameter = true
+ pos += 1
+ end
+ name, pos = _dtd_read_name(s, pos)
+ pos = _dtd_skip_ws(s, pos)
- elseif nodetype === Document
- foreach(children) do child
- write(io, child, ctx; indentsize)
- !ctx[end] && println(io)
+ value = nothing
+ external_id = nothing
+ if s[pos] == '"' || s[pos] == '\''
+ v, pos = _dtd_read_quoted(s, pos)
+ value = String(v)
+ else
+ keyword, pos = _dtd_read_name(s, pos)
+ pos = _dtd_skip_ws(s, pos)
+ if keyword == "SYSTEM"
+ uri, pos = _dtd_read_quoted(s, pos)
+ external_id = string("SYSTEM \"", uri, "\"")
+ elseif keyword == "PUBLIC"
+ pubid, pos = _dtd_read_quoted(s, pos)
+ pos = _dtd_skip_ws(s, pos)
+ uri, pos = _dtd_read_quoted(s, pos)
+ external_id = string("PUBLIC \"", pubid, "\" \"", uri, "\"")
+ else
+ error("Expected SYSTEM, PUBLIC, or quoted value in ENTITY declaration")
end
+ end
+ pos = _dtd_skip_to_close(s, pos)
+ EntityDecl(String(name), value, external_id, parameter), pos
+end
+function _dtd_parse_notation(s, pos)
+ name, pos = _dtd_read_name(s, pos)
+ pos = _dtd_skip_ws(s, pos)
+ keyword, pos = _dtd_read_name(s, pos)
+ pos = _dtd_skip_ws(s, pos)
+ if keyword == "SYSTEM"
+ uri, pos = _dtd_read_quoted(s, pos)
+ external_id = string("SYSTEM \"", uri, "\"")
+ elseif keyword == "PUBLIC"
+ pubid, pos = _dtd_read_quoted(s, pos)
+ pos = _dtd_skip_ws(s, pos)
+ if pos <= ncodeunits(s) && (s[pos] == '"' || s[pos] == '\'')
+ uri, pos = _dtd_read_quoted(s, pos)
+ external_id = string("PUBLIC \"", pubid, "\" \"", uri, "\"")
+ else
+ external_id = string("PUBLIC \"", pubid, "\"")
+ end
else
- error("Unreachable case reached during XML.write")
+ error("Expected SYSTEM or PUBLIC in NOTATION declaration")
+ end
+ pos = _dtd_skip_to_close(s, pos)
+ NotationDecl(String(name), external_id), pos
+end
+
+"""
+ parse_dtd(value::AbstractString) -> ParsedDTD
+ parse_dtd(node::Node) -> ParsedDTD
+
+Parse a DTD value string (from a `DTD` node) into structured declarations.
+"""
+function parse_dtd(value::AbstractString)
+ s = String(value)
+ pos = 1
+
+ root, pos = _dtd_read_name(s, pos)
+ pos = _dtd_skip_ws(s, pos)
+
+ # External ID
+ system_id = nothing
+ public_id = nothing
+ if pos <= ncodeunits(s) && _dtd_is_name_char(s[pos])
+ keyword, kpos = _dtd_read_name(s, pos)
+ if keyword == "SYSTEM"
+ pos = kpos
+ uri, pos = _dtd_read_quoted(s, pos)
+ system_id = String(uri)
+ elseif keyword == "PUBLIC"
+ pos = kpos
+ pubid, pos = _dtd_read_quoted(s, pos)
+ public_id = String(pubid)
+ pos = _dtd_skip_ws(s, pos)
+ if pos <= ncodeunits(s) && (s[pos] == '"' || s[pos] == '\'')
+ uri, pos = _dtd_read_quoted(s, pos)
+ system_id = String(uri)
+ end
+ end
+ end
+
+ elements = ElementDecl[]
+ attributes = AttDecl[]
+ entities = EntityDecl[]
+ notations = NotationDecl[]
+
+ # Internal subset
+ pos = _dtd_skip_ws(s, pos)
+ if pos <= ncodeunits(s) && s[pos] == '['
+ pos += 1
+ while pos <= ncodeunits(s)
+ pos = _dtd_skip_ws(s, pos)
+ pos > ncodeunits(s) && break
+ s[pos] == ']' && break
+
+ rest = SubString(s, pos)
+ if startswith(rest, "", s, pos + 4)
+ isnothing(i) && error("Unterminated comment in DTD")
+ pos = last(i) + 1
+ elseif startswith(rest, "")
+ i = findnext("?>", s, pos + 2)
+ isnothing(i) && error("Unterminated PI in DTD")
+ pos = last(i) + 1
+ elseif startswith(rest, "text \"\"\")
+ println(token)
+end
+```
+"""
+module XMLTokenizer
+
+
+#-----------------------------------------------------------------------# TokenKind
+@enum TokenKind::UInt8 begin
+ # Character data
+ TOKEN_TEXT # text content between markup
+
+ # Element tags
+ TOKEN_OPEN_TAG #
+ TOKEN_SELF_CLOSE # />
+ TOKEN_ATTR_NAME # attribute name
+ TOKEN_ATTR_VALUE # "value" or 'value' (with quotes in raw)
+
+ # CDATA sections
+ TOKEN_CDATA_OPEN #
+
+ # Comments
+ TOKEN_COMMENT_OPEN #
+
+ # Processing instructions
+ TOKEN_PI_OPEN #
+
+ # XML declaration ()
+ TOKEN_XML_DECL_OPEN #
+ # (reuses TOKEN_ATTR_NAME / TOKEN_ATTR_VALUE for pseudo-attributes)
+
+ # DOCTYPE
+ TOKEN_DOCTYPE_OPEN #
+end
+
+#-----------------------------------------------------------------------# Token
+# A token pairs a `TokenKind` with `raw`, the slice of the input text it covers.
+# `raw` is stored as a `SubString{S}` of the source string type `S`.
+struct Token{S <: AbstractString}
+ kind::TokenKind
+ raw::SubString{S}
+end
+
+# Print a token as `KIND: "raw text"`, with the raw text quoted and escaped.
+function Base.show(io::IO, t::Token)
+    quoted = repr(String(t.raw))
+    print(io, t.kind, ": ", quoted)
+end
+
+#-----------------------------------------------------------------------# Tokenizer mode
+@enum _Mode::UInt8 begin
+ _M_DEFAULT # normal content mode
+ _M_TAG # inside open tag, reading attributes
+ _M_TAG_VALUE # expecting quoted attribute value
+ _M_CLOSE_TAG # inside close tag, expecting >
+ _M_XML_DECL # inside Tokenizer
+
+Return a lazy iterator of `Token`s over the XML string `xml`.
+"""
+struct Tokenizer{S <: AbstractString}
+ data::S # XML text to tokenize
+ start::Int # position handed to `_init_state` when iteration starts
+end
+
+# `tokenize(xml)` returns a plain `Tokenizer` that starts at position 1.
+# `tokenize(xml, pos)` starts at `pos` and wraps the tokenizer in
+# `Iterators.Stateful`, so the two methods return different iterator types.
+tokenize(xml::AbstractString) = Tokenizer(xml, 1)
+tokenize(xml::AbstractString, pos::Int) = Iterators.Stateful(Tokenizer(xml, pos))
+
+# Compact display: `Tokenizer(<size>)`, or `Tokenizer(<start>/<size>)` when the
+# tokenizer does not start at position 1.
+function Base.show(io::IO, t::Tokenizer)
+    size_text = Base.format_bytes(ncodeunits(t.data))
+    print(io, "Tokenizer(")
+    if t.start > 1
+        print(io, t.start, "/")
+    end
+    print(io, size_text, ")")
+end
+
+# Iterator traits: the number of tokens is not known up front, and a
+# `Tokenizer{S}` yields `Token{S}` values.
+Base.IteratorSize(::Type{<:Tokenizer}) = Base.SizeUnknown()
+Base.eltype(::Type{Tokenizer{S}}) where {S} = Token{S}
+
+# Iteration protocol: delegate to `_next_token`, whose result (either `nothing`
+# or a `(token, state)` pair) is already in the shape `iterate` must return.
+function Base.iterate(t::Tokenizer, st::TokenizerState=_init_state(t.data, t.start))
+    return _next_token(t.data, st)
+end
+
+#-----------------------------------------------------------------------# Internal helpers
+# True once `pos` is past the last code unit of `data`.
+@inline _iseof(data, pos) = pos > ncodeunits(data)
+# Code unit at `pos`. The read is marked `@inbounds`, so callers are expected to
+# have ruled out end-of-input first.
+@inline _peek(data, pos) = @inbounds codeunit(data, pos)
+# True when `pos + offset` is still a code-unit position inside `data`.
+@inline _canpeek(data, pos, offset) = pos + offset <= ncodeunits(data)
+
+# True for the ASCII bytes accepted in a name: letters, digits, and the
+# punctuation `_`, `-`, `.`, `:`.  Every other byte (including all bytes
+# >= 0x80) is rejected.
+@inline function _is_name_byte(b::UInt8)
+    UInt8('a') <= b <= UInt8('z') && return true
+    UInt8('A') <= b <= UInt8('Z') && return true
+    UInt8('0') <= b <= UInt8('9') && return true
+    return b == UInt8('_') || b == UInt8('-') || b == UInt8('.') || b == UInt8(':')
+end
+
+# True for the four whitespace bytes the tokenizer recognises:
+# space, tab, line feed, carriage return.
+@inline function _is_whitespace(b::UInt8)
+    return b == 0x20 || b == 0x09 || b == 0x0a || b == 0x0d
+end
+
+# Return the first position at or after `pos` that is not a whitespace byte
+# (or the end-of-input position when only whitespace remains).
+@inline function _skip_whitespace(data, pos)
+    @inbounds while true
+        _iseof(data, pos) && break
+        _is_whitespace(_peek(data, pos)) || break
+        pos += 1
+    end
+    return pos
+end
+
+# Skip a quoted literal.  `pos` must be at the opening quote; the byte found
+# there is the delimiter.  Returns the position just past the matching closing
+# quote, or raises an error when the input ends first.
+function _skip_quoted(data, pos)
+    delim = @inbounds _peek(data, pos)
+    i = pos + 1
+    @inbounds while !_iseof(data, i) && _peek(data, i) != delim
+        i += 1
+    end
+    _iseof(data, i) && error("Unterminated quoted string")
+    return i + 1
+end
+
+# Throw an `ArgumentError` carrying `msg` and the position `pos` it refers to.
+# Marked `@noinline` so the throwing path is not inlined into the scanners.
+@noinline _err(msg, pos) = throw(ArgumentError("XML tokenizer error at position $pos: $msg"))
+
+#-----------------------------------------------------------------------# Main dispatch
+# Produce the next `(token, state)` pair, or `nothing` when input is exhausted.
+#
+# A token queued in `st.pending` is emitted first, with the position and mode
+# left unchanged.  Otherwise the reader for the current mode is invoked.
+#
+# NOTE(review): the end-of-input check runs before the mode dispatch, so
+# running out of input ends iteration whatever the mode is — confirm that
+# callers detect truncated constructs themselves.
+function _next_token(data, st::TokenizerState)
+ if _has_pending(st)
+ return (st.pending, TokenizerState(st.pos, st.mode, _no_token(data)))
+ end
+ pos = st.pos
+ _iseof(data, pos) && return nothing
+
+ s = st.mode
+ if s == _M_DEFAULT
+ # In content: '<' starts markup, anything else is character data.
+ _peek(data, pos) == UInt8('<') ? _read_markup(data, pos) : _read_text(data, pos)
+ elseif s == _M_TAG || s == _M_XML_DECL
+ _read_in_tag(data, pos, s)
+ elseif s == _M_TAG_VALUE || s == _M_XML_DECL_VALUE
+ _read_attr_value(data, pos, s)
+ elseif s == _M_CLOSE_TAG
+ _read_close_tag_end(data, pos)
+ elseif s == _M_COMMENT
+ _read_comment_body(data, pos)
+ elseif s == _M_CDATA
+ _read_cdata_body(data, pos)
+ elseif s == _M_PI
+ _read_pi_body(data, pos)
+ else # _M_DOCTYPE
+ _read_doctype_body(data, pos)
+ end
+end
+
+#-----------------------------------------------------------------------# _M_DEFAULT tokens
+# Read character data: everything from `pos` up to (not including) the next
+# '<', or to the end of input.  The tokenizer stays in `_M_DEFAULT`.
+function _read_text(data, pos)
+    first_byte = pos
+    @inbounds while !_iseof(data, pos)
+        _peek(data, pos) == UInt8('<') && break
+        pos += 1
+    end
+    text = @inbounds SubString(data, first_byte, prevind(data, pos))
+    return (Token(TOKEN_TEXT, text), TokenizerState(pos, _M_DEFAULT, _no_token(data)))
+end
+
+# Dispatch on the byte after '<': `!` (comment / CDATA / DOCTYPE), `?`
+# (processing instruction or XML declaration), `/` (close tag), or anything
+# else (open tag).  `pos` must be at the '<'.
+function _read_markup(data, pos)
+    lt = pos            # position of the '<', passed on as the token start
+    after = pos + 1
+    _iseof(data, after) && _err("unexpected end of input after '<'", lt)
+
+    marker = _peek(data, after)
+    marker == UInt8('!') && return _read_bang(data, after + 1, lt)
+    marker == UInt8('?') && return _read_pi_start(data, after + 1, lt)
+    marker == UInt8('/') && return _read_close_tag_start(data, after + 1, lt)
+    return _read_open_tag_start(data, after, lt)
+end
+
+#-----------------------------------------------------------------------# or other ') || _err("expected '>'", pos)
+ tok = Token(TOKEN_TAG_CLOSE, @inbounds SubString(data, pos, pos))
+ (tok, TokenizerState(pos + 1, _M_DEFAULT, _no_token(data)))
+end
+
+#-----------------------------------------------------------------------# Attributes (shared by _M_TAG and _M_XML_DECL)
+# One step inside an open tag (`_M_TAG`) or XML declaration (`_M_XML_DECL`).
+#
+# After skipping whitespace this either
+#   * emits the closing delimiter (`?>` for a declaration; `>` or `/>` for a
+#     tag) and returns to `_M_DEFAULT`, or
+#   * emits an attribute-name token, consumes the following `=` together with
+#     surrounding whitespace, and switches to the matching value mode.
+#
+# Raises via `_err` on end of input, on a missing name, or on a missing `=`.
+function _read_in_tag(data, pos, mode)
+ pos = _skip_whitespace(data, pos)
+ _iseof(data, pos) && _err("unterminated tag", pos)
+
+ b = _peek(data, pos)
+ is_decl = (mode == _M_XML_DECL)
+
+ # Check for end delimiters
+ if is_decl
+ if b == UInt8('?') && _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8('>')
+ tok = Token(TOKEN_XML_DECL_CLOSE, @inbounds SubString(data, pos, pos + 1))
+ return (tok, TokenizerState(pos + 2, _M_DEFAULT, _no_token(data)))
+ end
+ else
+ if b == UInt8('>')
+ tok = Token(TOKEN_TAG_CLOSE, @inbounds SubString(data, pos, pos))
+ return (tok, TokenizerState(pos + 1, _M_DEFAULT, _no_token(data)))
+ end
+ if b == UInt8('/') && _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8('>')
+ tok = Token(TOKEN_SELF_CLOSE, @inbounds SubString(data, pos, pos + 1))
+ return (tok, TokenizerState(pos + 2, _M_DEFAULT, _no_token(data)))
+ end
+ end
+
+ # Attribute name
+ # Only bytes accepted by `_is_name_byte` are consumed, so a name made of
+ # other bytes is reported as missing.
+ name_start = pos
+ @inbounds while !_iseof(data, pos) && _is_name_byte(_peek(data, pos))
+ pos += 1
+ end
+ name_end = pos - 1
+ name_start > name_end && _err("expected attribute name or tag close", pos)
+
+ # Consume '=' and surrounding whitespace (not part of any token)
+ pos = _skip_whitespace(data, pos)
+ (!_iseof(data, pos) && _peek(data, pos) == UInt8('=')) || _err("expected '=' after attribute name", pos)
+ pos += 1
+ pos = _skip_whitespace(data, pos)
+
+ next_state = is_decl ? _M_XML_DECL_VALUE : _M_TAG_VALUE
+ tok = Token(TOKEN_ATTR_NAME, @inbounds SubString(data, name_start, name_end))
+ (tok, TokenizerState(pos, next_state, _no_token(data)))
+end
+
+# Read a quoted attribute value starting at `pos`.
+#
+# The emitted token's raw text includes both quote characters.  Afterwards the
+# tokenizer returns to `_M_XML_DECL` (when called in `_M_XML_DECL_VALUE`) or to
+# `_M_TAG`.  Raises via `_err` when there is no input, no opening quote, or no
+# closing quote.
+function _read_attr_value(data, pos, mode)
+    _iseof(data, pos) && _err("expected attribute value", pos)
+
+    open_quote = pos
+    delim = _peek(data, open_quote)
+    if delim != UInt8('"') && delim != UInt8('\'')
+        _err("expected quoted attribute value", pos)
+    end
+
+    # Scan forward to the matching closing quote.
+    close_quote = open_quote + 1
+    @inbounds while !_iseof(data, close_quote) && _peek(data, close_quote) != delim
+        close_quote += 1
+    end
+    _iseof(data, close_quote) && _err("unterminated attribute value", open_quote)
+
+    tok = Token(TOKEN_ATTR_VALUE, @inbounds SubString(data, open_quote, close_quote))
+    resume = (mode == _M_XML_DECL_VALUE) ? _M_XML_DECL : _M_TAG
+    (tok, TokenizerState(close_quote + 1, resume, _no_token(data)))
+end
+
+#-----------------------------------------------------------------------# Content bodies (comment, CDATA, PI, DOCTYPE)
+# Read comment text up to the terminating `-->`.
+#
+# Emits the content token immediately and queues the `-->` close token as the
+# state's pending token.  Raises via `_err` when the terminator is never found.
+function _read_comment_body(data, pos)
+    body_start = pos
+    @inbounds while !_iseof(data, pos)
+        at_close = _peek(data, pos) == UInt8('-') &&
+                   _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8('-') &&
+                   _canpeek(data, pos, 2) && _peek(data, pos + 2) == UInt8('>')
+        if at_close
+            closer = Token(TOKEN_COMMENT_CLOSE, SubString(data, pos, pos + 2))
+            content = Token(TOKEN_COMMENT_CONTENT, SubString(data, body_start, prevind(data, pos)))
+            return (content, TokenizerState(pos + 3, _M_DEFAULT, closer))
+        end
+        pos += 1
+    end
+    _err("unterminated comment", body_start)
+end
+
+# Read CDATA text up to the terminating `]]>`.
+#
+# Emits the content token immediately and queues the `]]>` close token as the
+# state's pending token.  Raises via `_err` when the terminator is never found.
+function _read_cdata_body(data, pos)
+    body_start = pos
+    @inbounds while !_iseof(data, pos)
+        at_close = _peek(data, pos) == UInt8(']') &&
+                   _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8(']') &&
+                   _canpeek(data, pos, 2) && _peek(data, pos + 2) == UInt8('>')
+        if at_close
+            closer = Token(TOKEN_CDATA_CLOSE, SubString(data, pos, pos + 2))
+            content = Token(TOKEN_CDATA_CONTENT, SubString(data, body_start, prevind(data, pos)))
+            return (content, TokenizerState(pos + 3, _M_DEFAULT, closer))
+        end
+        pos += 1
+    end
+    _err("unterminated CDATA section", body_start)
+end
+
+# Read processing-instruction text up to the terminating `?>`.
+#
+# Emits the content token immediately and queues the `?>` close token as the
+# state's pending token.  Raises via `_err` when the terminator is never found.
+function _read_pi_body(data, pos)
+    body_start = pos
+    @inbounds while !_iseof(data, pos)
+        at_close = _peek(data, pos) == UInt8('?') &&
+                   _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8('>')
+        if at_close
+            closer = Token(TOKEN_PI_CLOSE, SubString(data, pos, pos + 1))
+            content = Token(TOKEN_PI_CONTENT, SubString(data, body_start, prevind(data, pos)))
+            return (content, TokenizerState(pos + 2, _M_DEFAULT, closer))
+        end
+        pos += 1
+    end
+    _err("unterminated processing instruction", body_start)
+end
+
+# Read the body of a DOCTYPE declaration up to the `>` that closes it.
+#
+# While scanning, the following are skipped so that a `>` inside them does not
+# end the declaration:
+#   * `<!-- ... -->` comments,
+#   * quoted literals (via `_skip_quoted`),
+#   * a bracketed internal subset (`depth` counts unmatched `[`).
+#
+# Emits the content token immediately and queues the closing `>` as the state's
+# pending token.  Raises via `_err` when the input ends first.
+function _read_doctype_body(data, pos)
+    start = pos
+    depth = 0
+    @inbounds while !_iseof(data, pos)
+        b = _peek(data, pos)
+        # `pos >= 3` keeps the look-behind at `pos - 2` inside the string; under
+        # `@inbounds` an index of 0 would be an unchecked out-of-bounds read.
+        if b == UInt8('-') && _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8('-') &&
+           pos >= 3 &&
+           codeunit(data, pos - 1) == UInt8('!') &&
+           codeunit(data, pos - 2) == UInt8('<')
+            # Inside a <!-- ... --> comment: skip to just past the closing "-->".
+            pos += 2 # skip "--"
+            while !_iseof(data, pos)
+                if _peek(data, pos) == UInt8('-') && _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8('-') &&
+                   _canpeek(data, pos, 2) && _peek(data, pos + 2) == UInt8('>')
+                    pos += 3 # skip "-->"
+                    break
+                end
+                pos += 1
+            end
+        elseif b == UInt8('"') || b == UInt8('\'')
+            pos = _skip_quoted(data, pos)
+        elseif b == UInt8('[')
+            depth += 1
+            pos += 1
+        elseif b == UInt8(']')
+            depth -= 1
+            pos += 1
+        elseif b == UInt8('>') && depth == 0
+            content_end = prevind(data, pos)
+            close_start = pos
+            pos += 1
+            pending = Token(TOKEN_DOCTYPE_CLOSE, @inbounds SubString(data, close_start, pos - 1))
+            tok = Token(TOKEN_DOCTYPE_CONTENT, @inbounds SubString(data, start, content_end))
+            return (tok, TokenizerState(pos, _M_DEFAULT, pending))
+        else
+            pos += 1
+        end
+    end
+    _err("unterminated DOCTYPE", start)
+end
+
+#-----------------------------------------------------------------------# Utility functions
+
+"""
+ tag_name(token::Token) -> SubString{String}
+
+Extract the element name from an `OPEN_TAG` or `CLOSE_TAG` token.
+"""
+function tag_name(token::Token)
+    raw = token.raw
+    # The slice ends at `lastindex(raw)`, not `ncodeunits(raw)`: when the name
+    # ends in a multi-byte character the last code unit is a continuation byte,
+    # which is not a valid SubString end index once bounds checks are active.
+    if token.kind == TOKEN_OPEN_TAG
+        @inbounds SubString(raw, 2, lastindex(raw)) # skip '<'
+    elseif token.kind == TOKEN_CLOSE_TAG
+        @inbounds SubString(raw, 3, lastindex(raw)) # skip '</'
+    else
+        throw(ArgumentError("tag_name requires OPEN_TAG or CLOSE_TAG, got $(token.kind)"))
+    end
+end
+
+"""
+ attr_value(token::Token) -> SubString{String}
+
+Strip the surrounding quotes from an `ATTR_VALUE` token.
+"""
+function attr_value(token::Token)
+ token.kind == TOKEN_ATTR_VALUE ||
+ throw(ArgumentError("attr_value requires ATTR_VALUE, got $(token.kind)"))
+ # Index 2 skips the opening quote; `prevind(..., lastindex(...))` is the last
+ # character before the closing quote, so multi-byte characters stay intact.
+ @inbounds SubString(token.raw, 2, prevind(token.raw, lastindex(token.raw)))
+end
+
+"""
+ pi_target(token::Token) -> SubString{String}
+
+Extract the target name from a `PI_OPEN` or `XML_DECL_OPEN` token.
+"""
+function pi_target(token::Token)
+    (token.kind == TOKEN_PI_OPEN || token.kind == TOKEN_XML_DECL_OPEN) ||
+        throw(ArgumentError("pi_target requires PI_OPEN or XML_DECL_OPEN, got $(token.kind)"))
+    raw = token.raw
+    # End at `lastindex(raw)` so a target ending in a multi-byte character gives
+    # a valid end index (`ncodeunits(raw)` would point at a continuation byte).
+    @inbounds SubString(raw, 3, lastindex(raw)) # skip '<?'
+end
+
+end # module XMLTokenizer
diff --git a/src/dtd.jl b/src/dtd.jl
deleted file mode 100644
index 58299f0..0000000
--- a/src/dtd.jl
+++ /dev/null
@@ -1,141 +0,0 @@
-# This is all a work in progress
-
-#-----------------------------------------------------------------------------# position_after
-function position_after(needle::Vector{UInt8}, haystack::Vector{UInt8}, i)
- x = findnext(needle, haystack, i)
- isnothing(x) ? nothing : x[end] + 1
-end
-
-position_after(needle::String, haystack::Vector{UInt8}, i) = position_after(Vector{UInt8}(needle), haystack, i)
-
-
-#-----------------------------------------------------------------------------# DeclaredElement
-struct DeclaredElement
- name::String
- content::String # "ANY", "EMPTY", or "(children...)"
- function DeclaredElement(name, content)
- content in ("ANY", "EMPTY") || (content[1] == '(' && content[end] == ')') ||
- error("DeclaredElement `content` must be 'ANY', 'EMPTY', or '(children...)'. Got $content.")
- new(name, content)
- end
-end
-Base.show(io::IO, o::DeclaredElement) = print(io, "")
-
-function get_declared_elements(data::Vector{UInt8})
- i = position_after("")
-
-
-function get_declared_attributes(data)
- i = position_after("")
-end
-
-function get_declared_entities(data)
- i = position_after(" println(io, " ", x), o.elements)
- printstyled(io, " DeclaredAttributes (", length(o.attributes), ")\n", color=:light_green)
- foreach(x -> println(io, " ", x), o.attributes)
- printstyled(io, " DeclaredEntities (", length(o.entities), ")\n", color=:light_green)
- foreach(x -> println(io, " ", x), o.entities)
-end
-
-
-function DTDBody(data::Vector{UInt8}, file = false)
- file && @goto isfile
- i = position_after(" unescape(attr_value(result[1])))
+ end
+ isempty(attrs) ? nothing : Attributes(attrs)
+end
+
+# Look up attribute `key` on an Element or Declaration node by scanning its
+# tokens; returns the unescaped value, or `default` when the node has no such
+# attribute or is of another node type.
+#
+# NOTE(review): the explicit `iterate(iter)` calls only consume tokens if
+# `_lazy_tokenizer` returns a stateful iterator — confirm against its
+# definition.
+function Base.get(n::LazyNode, key::AbstractString, default)
+ n.nodetype in (Element, Declaration) || return default
+ iter = _lazy_tokenizer(n)
+ iterate(iter) # skip OPEN_TAG or XML_DECL_OPEN
+ for tok in iter
+ # The first non-name token means the attribute list is over.
+ tok.kind === TOKEN_ATTR_NAME || return default
+ if tok.raw == key
+ result = iterate(iter)
+ result === nothing && return default
+ return unescape(attr_value(result[1]))
+ else
+ iterate(iter) # skip value
+ end
+ end
+ default
+end
+
+# `n[key]`: attribute value for `key`; throws `KeyError` when `get` reports it
+# missing (signalled by the `_MISSING_ATTR` sentinel).
+function Base.getindex(n::LazyNode, key::AbstractString)
+ val = get(n, key, _MISSING_ATTR)
+ val === _MISSING_ATTR && throw(KeyError(key))
+ val
+end
+
+# True when `get` finds attribute `key` on the node.
+function Base.haskey(n::LazyNode, key::AbstractString)
+ get(n, key, _MISSING_ATTR) !== _MISSING_ATTR
+end
+
+# Attribute names of an Element or Declaration node, in document order.
+# Returns an empty tuple `()` for other node types and a `Vector{String}`
+# otherwise.
+function Base.keys(n::LazyNode)
+ n.nodetype in (Element, Declaration) || return ()
+ iter = _lazy_tokenizer(n)
+ iterate(iter)
+ result = String[]
+ for tok in iter
+ # Stop at the first token that is not an attribute name.
+ tok.kind === TOKEN_ATTR_NAME || break
+ push!(result, String(tok.raw))
+ iterate(iter) # skip value
+ end
+ result
+end
+
+#-----------------------------------------------------------------------------# children
+# Direct children of a lazy node.
+#   * Document: every top-level node found by the tokenizer.
+#   * Element: nodes between the open tag and its close; a self-closing element
+#     yields an empty `LazyNode{S}[]`.
+#   * Any other node type: the empty tuple `()`.
+function children(n::LazyNode{S}) where {S}
+ nt = n.nodetype
+ if nt === Document
+ return _lazy_collect_children(n.data, _lazy_tokenizer(n))
+ elseif nt !== Element
+ return ()
+ end
+ iter = _lazy_tokenizer(n)
+ # Advance past the open tag (name and attributes) to its terminator.
+ for tok in iter
+ tok.kind === TOKEN_SELF_CLOSE && return LazyNode{S}[]
+ tok.kind === TOKEN_TAG_CLOSE && break
+ end
+ _lazy_collect_children(n.data, iter)
+end
+
+# Collect one `LazyNode` per sibling from `iter` until a close-tag token (the
+# parent's end) or the end of the token stream.
+#
+# For every construct that spans several tokens, the remaining tokens of that
+# construct are skipped so that only direct children are recorded.  Token kinds
+# not listed below are ignored.
+function _lazy_collect_children(data::S, iter) where {S <: AbstractString}
+ result = LazyNode{S}[]
+ for tok in iter
+ k = tok.kind
+ if k === TOKEN_TEXT
+ push!(result, LazyNode(data, tok, Text))
+ elseif k === TOKEN_OPEN_TAG
+ push!(result, LazyNode(data, tok, Element))
+ _lazy_skip_element!(iter)
+ elseif k === TOKEN_COMMENT_OPEN
+ push!(result, LazyNode(data, tok, Comment))
+ _lazy_skip_until!(iter, TOKEN_COMMENT_CLOSE)
+ elseif k === TOKEN_CDATA_OPEN
+ push!(result, LazyNode(data, tok, CData))
+ _lazy_skip_until!(iter, TOKEN_CDATA_CLOSE)
+ elseif k === TOKEN_PI_OPEN
+ push!(result, LazyNode(data, tok, ProcessingInstruction))
+ _lazy_skip_until!(iter, TOKEN_PI_CLOSE)
+ elseif k === TOKEN_XML_DECL_OPEN
+ push!(result, LazyNode(data, tok, Declaration))
+ _lazy_skip_until!(iter, TOKEN_XML_DECL_CLOSE)
+ elseif k === TOKEN_DOCTYPE_OPEN
+ push!(result, LazyNode(data, tok, DTD))
+ _lazy_skip_until!(iter, TOKEN_DOCTYPE_CLOSE)
+ elseif k === TOKEN_CLOSE_TAG
+ break
+ end
+ end
+ result
+end
+
+# Consume the rest of an element whose open-tag token has already been read.
+#
+# `depth` starts at 1 for that element: each further open tag increments it,
+# each self-close or close tag decrements it, and the function returns once it
+# reaches 0.  After a close tag, one more token is consumed, since the close
+# tag's terminator is emitted as a separate token.
+function _lazy_skip_element!(iter)
+ depth = 1
+ for tok in iter
+ k = tok.kind
+ if k === TOKEN_OPEN_TAG
+ depth += 1
+ elseif k === TOKEN_SELF_CLOSE
+ depth -= 1
+ depth == 0 && return
+ elseif k === TOKEN_CLOSE_TAG
+ depth -= 1
+ if depth == 0
+ iterate(iter) # consume trailing TAG_CLOSE
+ return
+ end
+ end
+ end
+end
+
+# Consume tokens from `iter` up to and including the first one of kind
+# `target` (or to the end of the stream when none is found).
+function _lazy_skip_until!(iter, target::TokenKind)
+    for tok in iter
+        if tok.kind === target
+            break
+        end
+    end
+    return
+end
+
+#-----------------------------------------------------------------------------# is_simple / simple_value
+# A node is "simple" when it is an Element without attributes whose only child
+# is a Text or CData node.
+function is_simple(n::LazyNode)
+    if n.nodetype !== Element
+        return false
+    end
+    attrs = attributes(n)
+    if attrs !== nothing && !isempty(attrs)
+        return false
+    end
+    kids = children(n)
+    return length(kids) == 1 && (kids[1].nodetype === Text || kids[1].nodetype === CData)
+end
+
+# Value of the single Text/CData child of a simple node; raises an error for
+# any node that is not an attribute-free Element with exactly one such child.
+function simple_value(n::LazyNode)
+    if n.nodetype !== Element
+        error("`simple_value` is only defined for simple nodes.")
+    end
+    attrs = attributes(n)
+    if attrs !== nothing && !isempty(attrs)
+        error("`simple_value` is only defined for simple nodes.")
+    end
+    kids = children(n)
+    single_leaf = length(kids) == 1 && (kids[1].nodetype === Text || kids[1].nodetype === CData)
+    single_leaf || error("`simple_value` is only defined for simple nodes.")
+    return value(kids[1])
+end
+
+#-----------------------------------------------------------------------------# indexing
+# Integer and colon indexing, `lastindex`, `only` and `length` all act on the
+# node's children; every call recomputes `children(n)`.
+Base.getindex(n::LazyNode, i::Integer) = children(n)[i]
+Base.getindex(n::LazyNode, ::Colon) = children(n)
+Base.lastindex(n::LazyNode) = lastindex(children(n))
+Base.only(n::LazyNode) = only(children(n))
+Base.length(n::LazyNode) = length(children(n))
+
+#-----------------------------------------------------------------------------# parse / read
+# Both argument orders are accepted; the input is converted to a `String` and
+# wrapped in a Document-level `LazyNode`.
+Base.parse(::Type{LazyNode}, xml::AbstractString) = parse(xml, LazyNode)
+Base.parse(xml::AbstractString, ::Type{LazyNode}) = LazyNode(String(xml), Document)
+
+# Reading from a file name or an `IO` loads the whole content as a `String`
+# before handing it to `parse`.
+Base.read(filename::AbstractString, ::Type{LazyNode}) = parse(read(filename, String), LazyNode)
+Base.read(io::IO, ::Type{LazyNode}) = parse(read(io, String), LazyNode)
+
+#-----------------------------------------------------------------------------# show
+function Base.show(io::IO, n::LazyNode)
+ nt = n.nodetype
+ print(io, "Lazy ", nt)
+ if nt === Text
+ print(io, ' ', repr(value(n)))
+ elseif nt === Element
+ print(io, " <", tag(n))
+ attrs = attributes(n)
+ if !isnothing(attrs)
+ for (k, v) in attrs
+ print(io, ' ', k, '=', '"', v, '"')
+ end
+ end
+ print(io, '>')
+ elseif nt === DTD
+ print(io, " ')
+ elseif nt === Declaration
+ print(io, " ")
+ elseif nt === ProcessingInstruction
+ print(io, " ", tag(n))
+ v = value(n)
+ !isnothing(v) && print(io, ' ', v)
+ print(io, "?>")
+ elseif nt === Comment
+ print(io, " ")
+ elseif nt === CData
+ print(io, " ")
+ elseif nt === Document
+ n_ch = length(children(n))
+ n_ch > 0 && print(io, n_ch == 1 ? " (1 child)" : " ($n_ch children)")
+ end
+end
diff --git a/src/raw.jl b/src/raw.jl
deleted file mode 100644
index 29d0a10..0000000
--- a/src/raw.jl
+++ /dev/null
@@ -1,568 +0,0 @@
-#-----------------------------------------------------------------------------# RawType
-"""
- RawType:
- - RawText # text
- - RawComment #
- - RawCData #
- - RawDeclaration #
- - RawProcessingInstruction #
- - RawDTD #
- - RawElementOpen #
- - RawElementClose #
- - RawElementSelfClosed #
- - RawDocument # Something to initialize with (not really used)
-"""
-@enum(RawType, RawDocument, RawText, RawComment, RawCData, RawProcessingInstruction,
- RawDeclaration, RawDTD, RawElementOpen, RawElementClose, RawElementSelfClosed)
-
-@inline nodetype(x::RawType) =
- x === RawElementOpen ? Element :
- x === RawElementClose ? Element :
- x === RawElementSelfClosed ? Element :
- x === RawText ? Text :
- x === RawComment ? Comment :
- x === RawCData ? CData :
- x === RawDeclaration ? Declaration :
- x === RawDTD ? DTD :
- x === RawProcessingInstruction ? ProcessingInstruction :
- x === RawDocument ? Document :
- nothing
-
-#-----------------------------------------------------------------------------# Raw
-"""
- Raw(filename::String)
-
-Create an iterator over raw chunks of data in an XML file. Each chunk of data represents one of:
-
- - RawDocument # Only used to initialize the iterator state.
- - RawText # text
- - RawComment #
- - RawCData #
- - RawDeclaration #
- - RawProcessingInstruction #
- - RawDTD #
- - RawElementOpen #
- - RawElementClose #
- - RawElementSelfClosed #
-
-Useful functions:
-
- - view(o::Raw) --> view of the Vector{UInt8} chunk.
- - String(o::Raw) --> String of the chunk.
- - next(o::Raw) --> Raw of the next chunk (or `nothing`).
- - prev(o::Raw) --> Raw of the previous chunk (or `nothing`).
- - tag(o::Raw) --> String of the tag name (or `nothing`).
- - attributes(o::Raw) --> OrderedDict{String, String} of the attributes (or `nothing`).
- - value(o::Raw) --> String of the value (or `nothing`).
- - children(o::Raw) --> Vector{Raw} of the children (or `nothing`).
- - parent(o::Raw) --> Raw of the parent (or `nothing`)
- - depth(o::Raw) --> Int of the depth of the node in the XML DOM.
-"""
-struct Raw
- type::RawType
- depth::Int
- pos::Int
- len::Int
- data::Vector{UInt8}
- ctx::Vector{Bool} # Context for xml:space (Vector to support inheritance of context)
- has_xml_space::Bool # Whether data contains `xml:space` attribute at least once
-end
-function Raw(data::Vector{UInt8})#, ctx::Vector{Bool}=Bool[false])
- needle = Vector{UInt8}("xml:space")
- has_xml_space = findfirst(needle, data) !== nothing
- return Raw(RawDocument, 0, 0, 0, data, [false], has_xml_space)
-end
-function Raw(data::Vector{UInt8}, has_xml_space::Bool, ctx::Vector{Bool}=Bool[false])
- return Raw(RawDocument, 0, 0, 0, data, ctx, has_xml_space)
-end
-
-const _RAW_INDEX = WeakKeyDict{Vector{UInt8}, Any}()
-
-struct _TokRec
- type::RawType
- depth::Int
- pos::Int
- len::Int
- ctx::Vector{Bool}
-end
-
-mutable struct _Index
- recs::Vector{_TokRec}
- last_raw::Raw
- built_end::Int
-end
-
-Base.read(filename::String, ::Type{Raw}) = isfile(filename) ?
- Raw(Mmap.mmap(filename)) :
- error("File \"$filename\" does not exist.")
-
-Base.read(io::IO, ::Type{Raw}) = Raw(read(io))
-
-Base.parse(x::AbstractString, ::Type{Raw}) = Raw(Vector{UInt8}(x))
-
-# Mostly for debugging
-Base.peek(o::Raw, n::Int) = String(view(o.data[o.pos+o.len+1:min(end, o.pos + o.len + n + 1)]))
-
-function Base.show(io::IO, o::Raw)
- print(io, o.type, ':', o.depth, " (pos=", o.pos, ", len=", o.len, ")")
- o.len > 0 && printstyled(io, ": ", String(o); color=:light_green)
-end
-function Base.:(==)(a::Raw, b::Raw)
- a.type == b.type && a.depth == b.depth && a.pos == b.pos && a.len == b.len && a.data === b.data && a.ctx == b.ctx && a.has_xml_space == b.has_xml_space
-end
-
-Base.view(o::Raw) = view(o.data, o.pos:o.pos+o.len)
-Base.String(o::Raw) = String(view(o))
-
-Base.IteratorSize(::Type{Raw}) = Base.SizeUnknown()
-Base.eltype(::Type{Raw}) = Raw
-
-function Base.iterate(o::Raw, state=o)
- n = next(state)
- return isnothing(n) ? nothing : (n, n)
-end
-
-is_node(o::Raw) = o.type !== RawElementClose
-xml_nodes(o::Raw) = Iterators.Filter(is_node, o)
-
-#-----------------------------------------------------------------------------# get_name
-is_name_start_char(x::UInt8) = x in UInt8('A'):UInt8('Z') || x in UInt8('a'):UInt8('z') || x == UInt8('_')
-is_name_char(x::UInt8) = is_name_start_char(x) || x in UInt8('0'):UInt8('9') || x == UInt8('-') || x == UInt8('.') || x == UInt8(':')
-
-name_start(data, i) = findnext(is_name_start_char, data, i)
-name_stop(data, i) = findnext(!is_name_char, data, i) - 1
-
-function get_name(data, i)
- i = name_start(data, i)
- j = name_stop(data, i)
- @views String(data[i:j]), j + 1
-end
-
-#-----------------------------------------------------------------------------# get_attributes
-# starting at position i, return attributes up until the next '>' or '?' (DTD)
-function get_attributes(data, i, j)
- i = name_start(data, i)
- (isnothing(j) || isnothing(i) || i > j) && return nothing
- out = OrderedDict{String,String}()
- while !isnothing(i) && i < j
- key, i = get_name(data, i)
- # get quotechar the value is wrapped in (either ' or ")
- i = findnext(x -> x === UInt8('"') || x === UInt8('''), data, i + 1)
- quotechar = data[i]
- i2 = findnext(==(quotechar), data, i + 1)
- @views value = String(data[i+1:i2-1])
- out[key] = value
- i = name_start(data, i2)
- end
- return out
-end
-
-# ----------------------------------------------------------------------------# Utilities supporting prev
-function _get_or_init_index(o::Raw)
- idx = get(_RAW_INDEX, o.data, nothing)
- if idx === nothing
- start = Raw(o.data) # fresh RawDocument
- _RAW_INDEX[o.data] = _Index(_TokRec[], start, 0)
- idx = _RAW_INDEX[o.data]
- end
- return idx
-end
-function _ensure_index_upto!(o::Raw, target_pos::Int)
- idx = _get_or_init_index(o)
- r = idx.last_raw
- while true
- n = next(r)
- if n === nothing
- idx.built_end = typemax(Int)
- idx.last_raw = r
- return idx
- end
- push!(idx.recs, _TokRec(n.type, n.depth, n.pos, n.len, copy(n.ctx)))
- endpos = n.pos + n.len
- idx.built_end = endpos
- idx.last_raw = n
- r = n
- if endpos >= target_pos
- return idx
- end
- end
-end
-function _find_prev_token(recs::Vector{_TokRec}, p::Int)
- lo, hi = 1, length(recs)
- ans = 0
- while lo <= hi
- mid = (lo + hi) >>> 1
- endpos = recs[mid].pos + recs[mid].len
- if endpos < p + 1
- ans = mid
- lo = mid + 1
- else
- hi = mid - 1
- end
- end
- return ans == 0 ? nothing : recs[ans]
-end
-
-#-----------------------------------------------------------------------------# update xml:space context
-# check attributes for xml:space and update ctx if necessary
-function get_ctx(o)
- att = attributes(o)
- if !isnothing(att) && haskey(att, "xml:space")
- if att["xml:space"] == "preserve"
- return true
- elseif att["xml:space"] == "default"
- return false
- else
- error("Invalid value for xml:space attribute: $(att["xml:space"]). Must be 'preserve' or 'default'.")
- end
- end
- return nothing
-end
-function update_ctx!(ctx, o)
- new_ctx = get_ctx(o)
- if new_ctx !== nothing
- ctx[end] = new_ctx
- end
- return nothing
-end
-
-#-----------------------------------------------------------------------------# interface
-"""
- nodetype(node) --> XML.NodeType
-
-Return the `XML.NodeType` of the node.
-"""
-nodetype(o::Raw) = nodetype(o.type)
-
-"""
- tag(node) --> String or Nothing
-
-Return the tag name of `Element` and `PROCESSING_INSTRUCTION` nodes.
-"""
-function tag(o::Raw)
- o.type ∉ [RawElementOpen, RawElementClose, RawElementSelfClosed, RawProcessingInstruction] && return nothing
- return get_name(o.data, o.pos + 1)[1]
-end
-
-"""
- attributes(node) --> OrderedDict{String, String} or Nothing
-
-Return the attributes of `Element`, `Declaration`, or `ProcessingInstruction` nodes.
-"""
-function attributes(o::Raw)
- if o.type === RawElementOpen || o.type === RawElementSelfClosed || o.type === RawProcessingInstruction
- i = o.pos
- i = name_start(o.data, i)
- i = name_stop(o.data, i)
- get_attributes(o.data, i + 1, o.pos + o.len)
- elseif o.type === RawDeclaration
- get_attributes(o.data, o.pos + 6, o.pos + o.len)
- else
- nothing
- end
-end
-
-"""
- value(node) --> String or Nothing
-
-Return the value of `Text`, `CData`, `Comment`, or `DTD` nodes.
-"""
-function value(o::Raw)
- if o.type === RawText
- String(o)
- elseif o.type === RawCData
- String(view(o.data, o.pos+length(" Vector{typeof(node)}
-
-Return the children the node. Will only be nonempty for `Element` and `Document` nodes.
-"""
-function children(o::Raw)
- if o.type === RawElementOpen || o.type === RawDocument
- depth = o.depth
- out = Raw[]
- for item in xml_nodes(o)
- if item.depth == depth + 1
- push!(out, item)
- end
- item.depth == depth && break
- o.type === RawDocument && item.depth == 2 && break # break if we've seen the doc root
- end
- out
- else
- Raw[]
- end
-end
-
-"""
- depth(node) --> Int
-
-Return the depth of the node. Will be `0` for `Document` nodes. Not defined for `XML.Node`.
-"""
-function depth(o::Raw)
- o.depth
-end
-
-"""
- parent(node) --> typeof(node), Nothing
-
-Return the parent of the node. Will be `nothing` for `Document` nodes. Not defined for `XML.Node`.
-"""
-function parent(o::Raw)
- depth = o.depth
- depth === 0 && return nothing
- p = prev(o)
- while p.depth >= depth
- p = prev(p)
- end
- return p
-end
-
-#-----------------------------------------------------------------------------# next Raw
-# isspace(x::UInt8) = Base.isspace(Char(x))
-
-# XML whitespace per XML 1.0/1.1 production S:
-# S ::= (#x20 | #x9 | #xD | #xA)+
-@inline xml_isspace(b::UInt8)::Bool = (b == 0x20) | (b == 0x09) | (b == 0x0A) | (b == 0x0D)
-
-"""
- next(node) --> typeof(node) or Nothing
-
-Return the next node in the document during depth-first traversal. Depth-first is the order you
-would visit nodes by reading top-down through an XML file. Not defined for `XML.Node`.
-"""
-function next(o::Raw)
- if o.has_xml_space # using xml:space context at least once in data
- return next_xml_space(o)
- else # not using xml:space context at all (same as v0.3.5)
- return next_no_xml_space(o)
- end
-end
-
-function next_xml_space(o::Raw)
- i = o.pos + o.len + 1
- depth = o.depth
- data = o.data
- type = o.type
- has_xml_space = o.has_xml_space
- ctx = copy(o.ctx)
- last_type = type
- k = findnext(!xml_isspace, data, i)
- if isnothing(k)
- return nothing
- end
- if last_type === RawElementOpen || last_type === RawDocument
- depth += 1
- push!(ctx, ctx[end]) # inherit the xml:space context from parent
- last_type === RawElementOpen && update_ctx!(ctx, o) # check attributes for xml:space and update if necessary
- end
- i = ctx[end] ? i : k
- b = i > 1 ? Char(o.data[i-1]) : Char('<')
- c = Char(o.data[i])
- d = Char(o.data[k+1])
- if c !== '<' || ctx[end] && c === '<' && b === ' ' && last_type === RawElementOpen && d === '/'
- type = RawText
- j = findnext(==(UInt8('<')), data, i) - 1
- j = ctx[end] ? j : findprev(!xml_isspace, data, j) # preserving whitespace if needed
- if last_type === RawElementClose || last_type === RawElementSelfClosed|| last_type === RawDocument
- # Maybe drop pure-whitespace inter-element text nodes?
- # (e.g. whitespace between a closing and an opening tag which would otherwise make an orphan text node)
- #if all(xml_isspace, @view data[i:j]) && depth > 1
- # return next(Raw(type, depth, j, 0, data, ctx, has_xml_space))
- #end
- end
- else
- i = k
- j = k + 1
- if c === '<'
- c2 = Char(o.data[i+1])
- if c2 === '!'
- c3 = Char(o.data[i+2])
- if c3 === '-'
- type = RawComment
- j = findnext(Vector{UInt8}("-->"), data, i)[end]
- elseif c3 === '['
- type = RawCData
- j = findnext(Vector{UInt8}("]]>"), data, i)[end]
- elseif c3 === 'D' || c3 == 'd'
- type = RawDTD
- j = findnext(==(UInt8('>')), data, i)
- while sum(==(UInt8('>')), @view data[k:j]) != sum(==(UInt8('<')), @view data[i:j])
- j = findnext(==(UInt8('>')), data, j + 1)
- end
- end
- elseif c2 === '?'
- if get_name(data, i + 2)[1] == "xml"
- type = RawDeclaration
- else
- type = RawProcessingInstruction
- end
- j = findnext(Vector{UInt8}("?>"), data, i)[end]
- elseif c2 === '/'
- type = RawElementClose
- depth -= 1
- pop!(ctx) # revert to parent xml:space context
- j = findnext(==(UInt8('>')), data, i)
- else
- j = findnext(==(UInt8('>')), data, i)
- if data[j-1] === UInt8('/')
- type = RawElementSelfClosed
- else
- type = RawElementOpen
- end
- end
- end
- end
- return Raw(type, depth, i, j - i, data, ctx, has_xml_space)
-end
-
-function next_no_xml_space(o::Raw) # same as v0.3.5
- i = o.pos + o.len + 1
- depth = o.depth
- data = o.data
- type = o.type
- has_xml_space = o.has_xml_space
- ctx = [false]
- i = findnext(!xml_isspace, data, i)
- if isnothing(i)
- return nothing
- end
- if type === RawElementOpen || type === RawDocument
- depth += 1
- end
- c = Char(o.data[i])
- d = Char(o.data[i+1])
- if c !== '<'
- type = RawText
- j = findnext(==(UInt8('<')), data, i) - 1
- j = findprev(!xml_isspace, data, j) # "rstrip"
- elseif c === '<'
- c2 = Char(o.data[i+1])
- if c2 === '!'
- c3 = Char(o.data[i+2])
- if c3 === '-'
- type = RawComment
- j = findnext(Vector{UInt8}("-->"), data, i)[end]
- elseif c3 === '['
- type = RawCData
- j = findnext(Vector{UInt8}("]]>"), data, i)[end]
- elseif c3 === 'D' || c3 == 'd'
- type = RawDTD
- j = findnext(==(UInt8('>')), data, i)
- while sum(==(UInt8('>')), @view data[i:j]) != sum(==(UInt8('<')), @view data[i:j])
- j = findnext(==(UInt8('>')), data, j + 1)
- end
- end
- elseif c2 === '?'
- if get_name(data, i + 2)[1] == "xml"
- type = RawDeclaration
- else
- type = RawProcessingInstruction
- end
- j = findnext(Vector{UInt8}("?>"), data, i)[end]
- elseif c2 === '/'
- type = RawElementClose
- depth -= 1
- j = findnext(==(UInt8('>')), data, i)
- else
- j = findnext(==(UInt8('>')), data, i)
- if data[j-1] === UInt8('/')
- type = RawElementSelfClosed
- else
- type = RawElementOpen
- end
- end
- end
- return Raw(type, depth, i, j - i, data, ctx, has_xml_space)
-end
-
-#-----------------------------------------------------------------------------# prev Raw
-"""
- prev(node) --> typeof(node), Nothing, or Missing (only for XML.Node)
-
-Return the previous node in the document during depth-first traversal. Not defined for `XML.Node`.
-"""
-function prev(o::Raw)
- if o.has_xml_space # using xml:space context at least once in data
- return prev_xml_space(o)
- else # not using xml:space context at all (same as v0.3.5)
- return prev_no_xml_space(o)
- end
-end
-
-function prev_xml_space(o::Raw)
- o.type === RawDocument && return nothing
-
- idx = _ensure_index_upto!(o, o.pos - 1)
- rec = _find_prev_token(idx.recs, o.pos - 1)
- if rec === nothing
- return Raw(o.data, o.has_xml_space, copy(o.ctx))
- end
- return Raw(rec.type, rec.depth, rec.pos, rec.len, o.data, copy(rec.ctx), o.has_xml_space)
-end
-function prev_no_xml_space(o::Raw) # same as v0.3.5
- depth = o.depth
- data = o.data
- type = o.type
- has_xml_space = o.has_xml_space
- ctx = has_xml_space ? copy(o.ctx) : [false]
- type === RawDocument && return nothing
- j = o.pos - 1
- j = findprev(!xml_isspace, data, j)
- if isnothing(j)
- return Raw(data, has_xml_space, ctx) # RawDocument
- end
- c = Char(o.data[j])
- next_type = type
- if c !== '>' # text
- type = RawText
- i = findprev(==(UInt8('>')), data, j) + 1
- i = findnext(!xml_isspace, data, i) # "lstrip"
- elseif c === '>'
- c2 = Char(o.data[j-1])
- if c2 === '-'
- type = RawComment
- i = findprev(Vector{UInt8}("<--"), data, j)[1]
- elseif c2 === ']'
- type = RawCData
- i = findprev(Vector{UInt8}(".")
- end
- end
- else
- error("Unreachable reached in XML.prev")
- end
- if type !== RawElementOpen && next_type === RawElementClose
- depth += 1
- elseif type === RawElementOpen && next_type !== RawElementClose
- depth -= 1
- end
- return Raw(type, depth, i, j - i, data, ctx, has_xml_space)
-end
-
diff --git a/src/xpath.jl b/src/xpath.jl
new file mode 100644
index 0000000..e56b38d
--- /dev/null
+++ b/src/xpath.jl
@@ -0,0 +1,303 @@
+#-----------------------------------------------------------------------------# XPath
+# A subset of XPath 1.0 for querying XML.Node trees.
+#
+# Supported syntax:
+# / root (absolute path)
+# tag child element by name
+# * any child element
+# // descendant-or-self (recursive)
+# . current node
+# .. parent node
+# [n], [last()] positional predicates (1-based index / last node)
+# [@attr] has-attribute predicate
+# [@attr='v'] attribute-value predicate
+# text() text node children
+# node() all node children
+# @attr attribute value (returned as Text nodes holding the value)
+
+#-----------------------------------------------------------------------------# Token types
+
+@enum XPathTokenKind::UInt8 begin
+ XPATH_ROOT # "/" — leading: marks an absolute path; between steps: plain separator
+ XPATH_DESCENDANT # "//" — expand the current nodes to descendant-or-self
+ XPATH_NAME # tag name — child elements with that tag (token value = the name)
+ XPATH_WILDCARD # "*" — any child element
+ XPATH_DOT # "." — current node
+ XPATH_DOTDOT # ".." — parent node
+ XPATH_TEXT_FN # text() — Text children
+ XPATH_NODE_FN # node() — all children, whatever their node type
+ XPATH_PREDICATE # [...] — token value = the text between the brackets
+ XPATH_ATTRIBUTE # @attr (in result position) — token value = the attribute name
+end
+
+struct XPathToken  # one lexical unit produced by _xpath_tokenize
+ kind::XPathTokenKind  # which syntactic element this token is
+ value::String  # its text: the literal ("/", "//", "text()", ...) or the payload (tag name, predicate body, attribute name)
+end
+
+#-----------------------------------------------------------------------------# Tokenizer
+
+function _xpath_tokenize(expr::AbstractString)
+    # Split an XPath expression into XPathTokens; errors on malformed or unsupported syntax.
+    tokens = XPathToken[]
+    s = String(expr)
+    i, n = 1, ncodeunits(s)
+
+    while i <= n
+        c = s[i]  # `i` is always a character boundary: it only moves past ASCII or via nextind
+
+        if c == '/'
+            if i < n && s[i+1] == '/'
+                push!(tokens, XPathToken(XPATH_DESCENDANT, "//"))
+                i += 2
+            else
+                push!(tokens, XPathToken(XPATH_ROOT, "/"))
+                i += 1
+            end
+
+        elseif c == '.'
+            if i < n && s[i+1] == '.'
+                push!(tokens, XPathToken(XPATH_DOTDOT, ".."))
+                i += 2
+            else
+                push!(tokens, XPathToken(XPATH_DOT, "."))
+                i += 1
+            end
+
+        elseif c == '*'
+            push!(tokens, XPathToken(XPATH_WILDCARD, "*"))
+            i += 1
+
+        elseif c == '['
+            j = findnext(']', s, i + 1)
+            isnothing(j) && error("Unterminated predicate in XPath: $(repr(s))")
+            push!(tokens, XPathToken(XPATH_PREDICATE, SubString(s, i + 1, prevind(s, j))))  # prevind: body may end in a multi-byte char
+            i = j + 1
+
+        elseif c == '@'
+            j = i + 1  # '@' is a single byte, so i + 1 is the next character boundary
+            while j <= n && (isletter(s[j]) || s[j] == '-' || s[j] == '_' || s[j] == ':' || isdigit(s[j]))
+                j = nextind(s, j)  # step by character, not by byte (names may be non-ASCII)
+            end
+            j == i + 1 && error("Empty attribute name after @ in XPath: $(repr(s))")
+            push!(tokens, XPathToken(XPATH_ATTRIBUTE, SubString(s, i + 1, prevind(s, j))))
+            i = j
+
+        elseif isletter(c) || c == '_'
+            j = nextind(s, i)  # `c` itself may be a multi-byte letter
+            while j <= n && (isletter(s[j]) || s[j] == '-' || s[j] == '_' || s[j] == ':' || isdigit(s[j]) || s[j] == '.')
+                j = nextind(s, j)
+            end
+            name = SubString(s, i, prevind(s, j))
+            # Check for function calls: text(), node()
+            if j <= n && s[j] == '('
+                j2 = findnext(')', s, j + 1)
+                isnothing(j2) && error("Unterminated function call in XPath: $(repr(s))")
+                if name == "text"
+                    push!(tokens, XPathToken(XPATH_TEXT_FN, "text()"))
+                elseif name == "node"
+                    push!(tokens, XPathToken(XPATH_NODE_FN, "node()"))
+                else
+                    error("Unknown XPath function: $name()")
+                end
+                i = j2 + 1
+            else
+                push!(tokens, XPathToken(XPATH_NAME, String(name)))
+                i = j
+            end
+
+        elseif isspace(c)
+            i = nextind(s, i)  # isspace also accepts multi-byte whitespace such as U+00A0
+
+        else
+            error("Unexpected character '$(c)' in XPath: $(repr(s))")
+        end
+    end
+    tokens
+end
+
+#-----------------------------------------------------------------------------# Predicate evaluation
+
+const _RE_ATTR_PRED = r"^@([A-Za-z_:][\w.\-:]*)$"
+# Capture 2 holds a single-quoted value, capture 3 a double-quoted one; the quotes must match.
+const _RE_ATTR_VAL_PRED = r"^@([A-Za-z_:][\w.\-:]*)\s*=\s*(?:'([^']*)'|\"([^\"]*)\")$"
+
+# Filter `nodes` by the text of one `[...]` predicate. `root` is accepted but not used here.
+function _eval_predicate(predicate::AbstractString, nodes::Vector{Node{S}}, root::Node{S}) where S
+    s = strip(predicate)
+
+    # Positional: [n] (1-based; an out-of-range position selects nothing)
+    pos = tryparse(Int, s)
+    isnothing(pos) || return (1 <= pos <= length(nodes) ? [nodes[pos]] : Node{S}[])
+
+    # last()
+    if s == "last()"
+        isempty(nodes) && return Node{S}[]
+        return [nodes[end]]
+    end
+
+    # [@attr] — has attribute
+    m = match(_RE_ATTR_PRED, s)
+    if !isnothing(m)
+        attr_name = m.captures[1]
+        return filter(n -> n.nodetype === Element && haskey(n, attr_name), nodes)
+    end
+
+    # [@attr='value'] or [@attr="value"]
+    # The value may contain the other quote character, e.g. [@t="it's"].
+    m = match(_RE_ATTR_VAL_PRED, s)
+    if !isnothing(m)
+        attr_name = m.captures[1]
+        attr_val = something(m.captures[2], m.captures[3])  # whichever quote style matched
+        return filter(n -> n.nodetype === Element && get(n, attr_name, nothing) == attr_val, nodes)
+    end
+
+    error("Unsupported XPath predicate: [$predicate]")
+end
+
+#-----------------------------------------------------------------------------# Step evaluation
+
+function _xpath_step(nodes::Vector{Node{S}}, token::XPathToken, root::Node{S}) where S
+    # Apply one location step to every node in `nodes`; matches are collected in input order.
+    result = Node{S}[]
+    k = token.kind
+    if k === XPATH_NAME
+        for n in nodes
+            for c in children(n)
+                c.nodetype === Element && c.tag == token.value && push!(result, c)
+            end
+        end
+
+    elseif k === XPATH_WILDCARD
+        for n in nodes
+            for c in children(n)
+                c.nodetype === Element && push!(result, c)
+            end
+        end
+
+    elseif k === XPATH_DOT
+        append!(result, nodes)
+
+    elseif k === XPATH_DOTDOT
+        for n in nodes
+            n === root && continue  # never climb above the node the query started from
+            p = _find_parent(n, root)
+            isnothing(p) || any(q -> q === p, result) || push!(result, p)  # siblings share a parent: emit it once
+        end
+
+    elseif k === XPATH_TEXT_FN
+        for n in nodes
+            for c in children(n)
+                c.nodetype === Text && push!(result, c)
+            end
+        end
+
+    elseif k === XPATH_NODE_FN
+        for n in nodes
+            append!(result, children(n))
+        end
+
+    elseif k === XPATH_DESCENDANT
+        # Handled by caller — collects all descendants before next step
+        error("XPATH_DESCENDANT should be handled by the evaluator, not _xpath_step")
+    end
+
+    result
+end
+
+function _descendants!(out::Vector{Node{S}}, node::Node{S}) where S
+    # Pre-order walk: a child is appended to `out` before anything from its own subtree.
+    foreach(children(node)) do kid
+        _descendants!(push!(out, kid), kid)
+    end
+end
+
+function _descendants(nodes::Vector{Node{S}}) where S
+    # descendant-or-self: emit each input node itself, then its whole subtree.
+    acc = Node{S}[]
+    foreach(nodes) do start
+        _descendants!(push!(acc, start), start)
+    end
+    acc
+end
+
+#-----------------------------------------------------------------------------# Main evaluator
+
+"""
+    xpath(node::Node, expr::AbstractString) -> Vector{Node}
+
+Evaluate an XPath expression against a `Node` tree and return matching nodes.
+
+Supports a practical subset of XPath 1.0:
+- Absolute (`/root/child`) and relative (`child/sub`) paths
+- Recursive descent (`//tag`)
+- Wildcards (`*`), self (`.`), parent (`..`)
+- Positional predicates (`[1]`, `[last()]`)
+- Attribute predicates (`[@attr]`, `[@attr='value']`)
+- `text()` and `node()` functions
+- Attribute selection (`@attr`) — returns `Text` nodes containing attribute values
+
+Evaluation always starts at `node`; a leading `/` does not look for a different root.
+Predicates filter the whole current node list rather than each parent's children, and
+`..` never steps above `node`.
+
+# Arguments
+- `node`: context node the expression is evaluated against (a `Document` or any other `Node`).
+- `expr`: the XPath expression; an empty or all-whitespace expression yields an empty result.
+
+# Returns
+A vector with the same `Node{S}` element type as `node` (empty when nothing matches).
+
+# Examples
+```julia
+doc = parse("<root><a x='1'/><a x='2'/><b/></root>", Node)
+xpath(doc, "/root/a")        # both <a> elements
+xpath(doc, "/root/a[1]")     # first <a>
+xpath(doc, "//a[@x='2']")    # the <a x='2'/> element
+xpath(doc, "/root/b/@x")     # attribute value as Text node (empty here)
+```
+"""
+function xpath(node::Node{S}, expr::AbstractString) where S
+    tokens = _xpath_tokenize(expr)
+    isempty(tokens) && return Node{S}[]
+
+    # `..` steps are resolved against the node the query started from.
+    root = node
+    # A leading `/` only marks the path as absolute: the context is `node` either way.
+    current = Node{S}[node]
+    i = tokens[1].kind === XPATH_ROOT ? 2 : 1
+
+    # Consume tokens left to right; each one maps the current node list to a new one.
+    while i <= length(tokens)
+        tok = tokens[i]
+        i += 1  # every branch below consumes exactly one token
+
+        if tok.kind === XPATH_PREDICATE
+            current = _eval_predicate(tok.value, current, root)
+
+        elseif tok.kind === XPATH_DESCENDANT
+            # Expand to descendant-or-self; the following step (if any) then selects
+            # among the children of every collected node.
+            current = _descendants(current)
+
+        elseif tok.kind === XPATH_ROOT
+            # `/` between steps is only a separator.
+
+        elseif tok.kind === XPATH_ATTRIBUTE
+            # `@attr` in result position: wrap each attribute value in a Text node.
+            # Only elements are consulted, matching the guard used by attribute predicates.
+            result = Node{S}[]
+            for n in current
+                n.nodetype === Element || continue
+                v = get(n, tok.value, nothing)
+                isnothing(v) || push!(result, Node{S}(Text, nothing, nothing, v, nothing))
+            end
+            current = result
+
+        else
+            current = _xpath_step(current, tok, root)
+        end
+    end
+
+    current
+end
diff --git a/test/Project.toml b/test/Project.toml
index d4883bd..c1703f7 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,4 +1,5 @@
[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
+Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/test/data/complex_dtd.xml b/test/data/complex_dtd.xml
new file mode 100644
index 0000000..cb69747
--- /dev/null
+++ b/test/data/complex_dtd.xml
@@ -0,0 +1,105 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ notation NOTATION (jpeg | png) #IMPLIED>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+]]>
+]>
diff --git a/test/data/preserve.xml b/test/data/preserve.xml
new file mode 100644
index 0000000..e77add1
--- /dev/null
+++ b/test/data/preserve.xml
@@ -0,0 +1,5 @@
+
+
+ This node has preserved space
+ with default children.
+
diff --git a/test/runtests.jl b/test/runtests.jl
index 89978eb..49632a0 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,646 +1,3208 @@
using XML
-using XML: Document, Element, Declaration, Comment, CData, DTD, ProcessingInstruction, Text, escape, unescape, OrderedDict, h
-using Downloads: download
+using XML: Document, Element, Declaration, Comment, CData, DTD, ProcessingInstruction, Text
+using XML: escape, unescape, h, parse_dtd
+using XML: ParsedDTD, ElementDecl, AttDecl, EntityDecl, NotationDecl
using Test
-import AbstractTrees
-
-AbstractTrees.children(x::Node) = children(x)
-
-#-----------------------------------------------------------------------------# files
-xml_xsd = joinpath("data", "xml.xsd")
-kml_xsd = joinpath("data", "kml.xsd")
-books_xml = joinpath("data", "books.xml")
-example_kml = joinpath("data", "example.kml")
-simple_dtd = joinpath("data", "simple_dtd.xml")
-
-all_files = [xml_xsd, kml_xsd, books_xml, example_kml, simple_dtd]
-
-#-----------------------------------------------------------------------------# h
-@testset "h function" begin
- @test h.tag == XML.Element("tag")
- @test h.tag(id="id") == XML.Element("tag"; id="id")
- @test h.tag(1, 2, a="a", b="b") == XML.Element("tag", 1, 2; a="a", b="b")
-end
-
-#-----------------------------------------------------------------------------# escaping/unescaping
-@testset "escaping/unescaping" begin
- s = "This > string < has & some \" special ' characters"
- @test escape(s) == "This > string < has & some " special ' characters"
- @test escape(escape(s)) == escape(s)
- @test s == unescape(escape(s))
- @test s == unescape(unescape(escape(s)))
-
- n = Element("tag", Text(s))
- @test XML.simple_value(n) == s
-
- XML.escape!(n)
- @test XML.simple_value(n) == escape(s)
-
- XML.unescape!(n)
- @test XML.simple_value(n) == s
-end
-
-#-----------------------------------------------------------------------------# DTD
-# @testset "DTDBody and friends" begin
-# s = read(simple_dtd, String)
-# data = read(simple_dtd)
-
-# dtd = XML.DTDBody(data)
-# dtd2 = parse(s, XML.DTDBody)
-
-# @test length(dtd.elements) == length(dtd2.elements) == 0
-# @test length(dtd.attributes) == length(dtd2.attributes) == 0
-# @test length(dtd.entities) == length(dtd2.entities) == 3
-
-# o = read("data/tv.dtd", XML.DTDBody)
-# end
-
-#-----------------------------------------------------------------------------# Raw
-@testset "Raw tag/attributes/value" begin
- examples = [
- (xml = "",
- nodetype = DTD,
- tag=nothing,
- attributes=nothing,
- value="html"),
- (xml = "",
- nodetype = Declaration,
- tag=nothing,
- attributes=Dict("version" => "1.0", "key" => "value"),
- value=nothing),
- (xml = " ",
- nodetype = Element,
- tag="tag",
- attributes=Dict("_id" => "1", "x" => "abc"),
- value=nothing),
- (xml = "",
- nodetype = Comment,
- tag=nothing,
- attributes=nothing,
- value=" comment "),
- (xml = "",
- nodetype = CData,
- tag=nothing,
- attributes=nothing,
- value="cdata test"),
- ]
- for x in examples
- # @info "Testing: $(x.xml)"
- data = XML.next(XML.parse(x.xml, XML.Raw))
- @test XML.nodetype(data) == x.nodetype
- @test XML.tag(data) == x.tag
- @test XML.attributes(data) == x.attributes
- @test XML.value(data) == x.value
- end
-end
-
-@testset "Raw with books.xml" begin
- data = read(books_xml, XML.Raw)
- doc = collect(data)
- @test length(doc) > countlines(books_xml)
- # Check that the first 5 lines are correct
- first_5_lines = [
- XML.RawDeclaration => """""",
- XML.RawElementOpen => "",
- XML.RawElementOpen => "",
- XML.RawElementOpen => "",
- XML.RawText => "Gambardella, Matthew"
- ]
- for (i, (typ, str)) in enumerate(first_5_lines)
- dt = doc[i]
- @test dt.type == typ
- @test String(dt) == str
- end
- # Check that the last line is correct
- @test doc[end].type == XML.RawElementClose
- @test String(doc[end]) == " "
-
- @testset "next and prev" begin
- @test XML.prev(doc[1]) == data # can't use === here because prev returns a copy of ctx
- @test prev(data) === nothing
- @test XML.next(doc[end]) === nothing
-
- n = length(doc)
- next_res = [doc[1]]
- foreach(_ -> push!(next_res, XML.next(next_res[end])), 1:n-1)
-
- prev_res = [doc[end]]
- foreach(_ -> pushfirst!(prev_res, XML.prev(prev_res[1])), 1:n-1)
-
- idx = findall(next_res .!= prev_res)
-
- for (a,b) in zip(next_res, prev_res)
- @test a == b
- end
-
- lzxml = """ hello hello preserve """
- lz = XML.parse(XML.LazyNode, lzxml)
- n=XML.next(lz)
- n=XML.next(n)
- text_content = XML.write(n)
- @test text_content == " "
- n=XML.next(n)
- text_content = XML.write(n)
- @test text_content == "hello "
- n=XML.next(n)
- text_content = XML.write(n)
- @test text_content == "hello"
- n=XML.next(n)
- text_content = XML.write(n)
- @test text_content == " hello preserve "
- n=XML.prev(n)
- text_content = XML.write(n)
- @test text_content == "hello"
- n=XML.next(n)
- text_content = XML.write(n)
- @test text_content == " hello preserve "
- n=XML.next(n)
- text_content = XML.write(n)
- @test text_content == " hello "
- n=XML.next(n)
- text_content = XML.write(n)
- @test text_content == " preserve "
- n=XML.next(n)
- text_content = XML.write(n)
- @test text_content == " preserve "
- n=XML.next(n)
- text_content = XML.write(n)
- @test text_content == ""
- n=XML.next(n)
- text_content = XML.write(n)
- @test text_content == " "
- n=XML.prev(n)
- text_content = XML.write(n)
- @test text_content == ""
- n=XML.prev(n)
- text_content = XML.write(n)
- @test text_content == " preserve "
- n=XML.prev(n)
- text_content = XML.write(n)
- @test text_content == " preserve "
- n=XML.prev(n)
- text_content = XML.write(n)
- @test text_content == " hello "
- n=XML.prev(n)
- text_content = XML.write(n)
- @test text_content == " hello preserve "
- n=XML.next(n)
- text_content = XML.write(n)
- @test text_content == " hello "
- n=XML.prev(n)
- text_content = XML.write(n)
- @test text_content == " hello preserve "
- n=XML.prev(n)
- text_content = XML.write(n)
- @test text_content == "hello"
- n=XML.prev(n)
- text_content = XML.write(n)
- @test text_content == "hello "
- n=XML.prev(n)
- text_content = XML.write(n)
- @test text_content == " "
- n=XML.prev(n)
- text_content = XML.write(n)
- @test text_content == "\n \n hello \n hello preserve \n \n \n "
- end
-
- @testset "depth and parent" begin
- @test XML.depth(data) == 0
- @test isnothing(XML.parent(data))
- @test XML.depth(doc[1]) == 1
- @test XML.parent(doc[1]) == data
- @test XML.depth(doc[2]) == 1
- @test XML.depth(doc[3]) == 2
- @test XML.parent(doc[3]) == doc[2]
- @test XML.depth(doc[end]) == 1
- @test XML.parent(doc[end]) == data
- end
-
- @testset "tag/attributes/value" begin
- x = doc[1] #
- @test XML.tag(x) === nothing
- @test XML.attributes(x) == Dict("version" => "1.0")
- @test XML.value(x) === nothing
-
- x = XML.next(x) #
- @test XML.tag(x) == "catalog"
- @test XML.attributes(x) === nothing
- @test XML.value(x) === nothing
-
- x = XML.next(x) #
- @test XML.tag(x) == "book"
- @test XML.attributes(x) == Dict("id" => "bk101")
- @test XML.value(x) === nothing
-
- x = XML.next(x) #
- @test XML.tag(x) == "author"
- @test XML.attributes(x) === nothing
- @test XML.value(x) === nothing
-
- x = XML.next(x) # Gambardella, Matthew
- @test XML.tag(x) === nothing
- @test XML.attributes(x) === nothing
- @test XML.value(x) == "Gambardella, Matthew"
- end
-end
-
-#-----------------------------------------------------------------------------# Preserve whitespace
-@testset "xml:space" begin
- @testset "Basic xml:space functionality" begin
-
- # Test 1: xml:space="preserve" should preserve entirely empty whitespace
- xml1 = """ """
- doc1 = parse(XML.Node, xml1)
- text_content = XML.value(doc1[1][1][1])
- @test text_content == " "
-
- # Test 2: xml:space="preserve" should preserve leading and trailing whitespace
- xml2 = """ leading and trailing spaces """
- doc2 = parse(XML.Node, xml2)
- text_content = XML.value(doc2[1][1][1])
- @test text_content == " leading and trailing spaces "
-
- # Test 3: Entirely empty tags with and without xml:space="preserve" become self-closing
- xml3 = """ """
- doc3 = XML.parse(XML.Node, xml3)
- text_content = XML.write(doc3[1][1])
- @test text_content == " " # without xml:space="preserve", empty text becomes self-closing
- text_content = XML.value(doc3[1][2][1])
- @test text_content == " " # with xml:space, whitespace is preserved
- text_content = XML.write(doc3[1][3])
- @test text_content == "" # with xml:space="preserve", empty text becomes self-closing
-
- # Test 4: Without xml:space, whitespace should be normalized
- xml4 = """ gets normalized """
- doc4 = XML.parse(XML.Node, xml4)
- text_content = XML.value(doc4[1][1][1])
- @test text_content == "gets normalized"
-
- # Test 5: xml:space="default" should normalize even with preserve_xml_space=true
- xml5 = """ gets normalized """
- doc5 = XML.parse(XML.Node, xml5)
- text_content = XML.value(doc5[1][1][1])
- @test text_content == "gets normalized"
- end
-
- @testset "xml:space inheritance" begin
- # Test 6: Children inherit parent's xml:space="preserve"
- xml6 = """
- parent text
- child text
-
- """
- doc6 = XML.parse(XML.Node, xml6)
- # Both parent and child should preserve whitespace
- @test contains(XML.value(doc6[1][2][1]), "parent text \n")
- @test XML.value(doc6[1][2][2][1]) == " child text "
-
- # Test 7: xml:space="default" overrides parent's "preserve"
- xml7 = """
- normalized despite parent
- """
- doc7 = XML.parse(XML.Node, xml7)
- @test XML.value(doc7[1][2][1]) == "normalized despite parent"
- end
-
- @testset "Nesting scenarios" begin
- # Test 8: Multiple levels of xml:space changes
- xml8 = """
- preserved
- normalized
- preserved again
-
-
- """
- doc8 = XML.parse(XML.Node, xml8)
-
- # level1 should preserve (inherits from root)
- level1_text = XML.value(doc8[1][2][1])
- @test level1_text == " preserved \n "
-
- # level2 should normalize (explicit xml:space="default")
- level2_text = XML.value(doc8[1][2][2][1])
- @test level2_text == "normalized"
-
- # level3 should preserve (explicit xml:space="preserve")
- level3_text = XML.value(doc8[1][2][2][2][1])
- @test level3_text == " preserved again "
-
- # Test 9: repeated multiple levels of xml:space changes
- xml9 = """
- preserved
- normalized
- preserved again
-
-
- preserved b
- normalized b
- preserved again b
-
-
- """
- doc9 = XML.parse(XML.Node, xml9)
-
- # level1b should preserve (inherits from root)
- level1b_text = XML.value(doc9[1][4][1])
- @test level1b_text == " preserved b \n "
-
- # level2 should normalize (explicit xml:space="default")
- level2b_text = XML.value(doc9[1][4][2][1])
- @test level2b_text == "normalized b"
-
- # level3 should preserve (explicit xml:space="preserve")
- level3b_text = XML.value(doc9[1][4][2][2][1])
- @test level3b_text == " preserved again b "
-
- # Test 10: futher repeated multiple levels of xml:space changes
- xml10 = """
- normalized
- normalized b
- preserved
-
-
- normalized c
- preserved b
- normalized again b
- preserved c
-
-
-
- normalized d
- """
- doc10 = XML.parse(XML.Node, xml10)
-
- # level1 should normalize (as root)
- level1_text = XML.value(doc10[end][1][1])
- @test level1_text == "normalized"
-
- # level2 should normalize (as root and level1)
- level2_text = XML.value(doc10[end][1][2][1])
- @test level2_text == "normalized b"
-
- # level3 should preserve (explicit xml:space="preserve")
- level3_text = XML.value(doc10[end][1][2][2][1])
- @test level3_text == " preserved "
-
- # level1b should normalize (as root)
- level1b_text = XML.value(doc10[end][2][1])
- @test level1b_text == "normalized c"
-
- # level2b should preserve (explicit xml:space="preserve")
- level2b_text = XML.value(doc10[end][2][2][1])
- @test level2b_text == " preserved b \n "
-
- # level3 should normalize (explicit xml:space="default")
- level3b_text = XML.value(doc10[end][2][2][2][1])
- @test level3b_text == "normalized again b"
-
- # level3c should preserve (inherited from level2b)
- level3c_text = XML.value(doc10[end][2][2][4][1])
- @test level3c_text == " preserved c \n "
-
- # level1c should normalize (as root)
- level1c_text = XML.value(doc10[end][3][1])
- @test level1c_text == "normalized d"
- end
- @testset "inter-element gap semantics" begin
- # Default parent: gap between siblings should be dropped
- s1 = """ x
- y """
- d1 = XML.parse(XML.Node, s1)
- @test length(d1[1]) == 2
- @test XML.value(d1[1][1][1]) == "x"
- @test XML.value(d1[1][2][1]) == "y"
-
- # Preserve parent, default child ends: gap after default child dropped
- s2 = """
- keep
- norm
- after default gap
- """
- d2 = XML.parse(XML.Node, s2)
- @test length(d2[1]) == 7
- @test XML.value(d2[1][1]) == "\n "
- @test XML.value(d2[1][2][1]) == " keep "
- @test XML.value(d2[1][3]) == "\n "
- @test XML.value(d2[1][4][1]) == "norm"
- @test XML.value(d2[1][5]) == "\n "
- @test XML.value(d2[1][6][1]) == " after default gap "
- @test XML.value(d2[1][7]) == "\n"
- end
- @testset "XML whitespace vs Unicode whitespace" begin
+
+#==============================================================================#
+# ESCAPE / UNESCAPE #
+#==============================================================================#
+@testset "escape / unescape" begin  # the five predefined XML entities (XML 1.0, section 4.6)
+    @testset "all five predefined entities" begin
+        @test escape("&") == "&amp;"
+        @test escape("<") == "&lt;"
+        @test escape(">") == "&gt;"
+        @test escape("'") == "&apos;"
+        @test escape("\"") == "&quot;"
+    end
+
+    @testset "unescape reverses escape" begin
+        @test unescape("&amp;") == "&"
+        @test unescape("&lt;") == "<"
+        @test unescape("&gt;") == ">"
+        @test unescape("&apos;") == "'"
+        @test unescape("&quot;") == "\""
+    end
+
+    @testset "roundtrip on mixed strings" begin
+        s = "This > string < has & some \" special ' characters"
+        @test unescape(escape(s)) == s
+    end
+
+    @testset "idempotent unescape" begin  # text without entity references must come back unchanged
+        s = "plain text with no entities"
+        @test unescape(s) == s
+    end
+
+    @testset "multiple entities in one string" begin
+        @test escape("a < b & c > d") == "a &lt; b &amp; c &gt; d"
+        @test unescape("a &lt; b &amp; c &gt; d") == "a < b & c > d"
+    end
+
+    @testset "empty string" begin
+        @test escape("") == ""
+        @test unescape("") == ""
+    end
+end
+
+#==============================================================================#
+# XML 1.0 SPEC SECTION 2.1: Well-Formed XML Documents #
+#==============================================================================#
+@testset "Spec 2.1: Well-Formed XML Documents" begin
+    # The spec's simplest example:
+    #   <?xml version="1.0"?>
+    #   <greeting>Hello, world!</greeting>
+    xml = """<?xml version="1.0"?><greeting>Hello, world!</greeting>"""
+    doc = parse(xml, Node)
+    @test nodetype(doc) == Document
+    @test length(doc) == 2  # Declaration + Element
+    @test nodetype(doc[1]) == Declaration
+    @test nodetype(doc[2]) == Element
+    @test tag(doc[2]) == "greeting"
+    @test simple_value(doc[2]) == "Hello, world!"
+end
+
+#==============================================================================#
+# XML 1.0 SPEC SECTION 2.4: Character Data and Markup #
+#==============================================================================#
+@testset "Spec 2.4: Character Data and Markup" begin
+ @testset "text content between tags" begin
+ doc = parse("Hello ", Node)
+ @test simple_value(doc[1]) == "Hello"
+ end
+
+ @testset "entity references in text are unescaped" begin
+ doc = parse("& < > ' " ", Node)
+ @test simple_value(doc[1]) == "& < > ' \""
+ end
+
+ @testset "mixed text and child elements" begin
+ doc = parse("Hello world !
", Node)
+ root = doc[1]
+ @test length(root) == 3
+ @test nodetype(root[1]) == Text
+ @test value(root[1]) == "Hello "
+ @test nodetype(root[2]) == Element
+ @test tag(root[2]) == "b"
+ @test simple_value(root[2]) == "world"
+ @test nodetype(root[3]) == Text
+ @test value(root[3]) == "!"
+ end
+
+ @testset "empty element has no text" begin
+ doc = parse(" ", Node)
+ @test length(children(doc[1])) == 0
+ end
+end
+
+#==============================================================================#
+# XML 1.0 SPEC SECTION 2.5: Comments #
+#==============================================================================#
+@testset "Spec 2.5: Comments" begin
+ @testset "basic comment (spec example)" begin
+ # Spec example: <!-- declarations for <head> & <body> -->
+ doc = parse(" ", Node)
+ c = doc[1][1]
+ @test nodetype(c) == Comment
+ @test value(c) == " declarations for & "
+ end
+
+ @testset "empty comment" begin
+ doc = parse(" ", Node)
+ c = doc[1][1]
+ @test nodetype(c) == Comment
+ @test value(c) == ""
+ end
+
+ @testset "comment before root element" begin
+ doc = parse(" ", Node)
+ @test nodetype(doc[1]) == Comment
+ @test value(doc[1]) == " before "
+ @test nodetype(doc[2]) == Element
+ end
+
+ @testset "comment after root element" begin
+ doc = parse(" ", Node)
+ @test nodetype(doc[1]) == Element
+ @test nodetype(doc[2]) == Comment
+ end
+
+ @testset "comment with markup-like content preserved verbatim" begin
+ doc = parse(" ", Node)
+ @test value(doc[1][1]) == " not a tag "
+ end
+
+ @testset "multiple comments" begin
+ doc = parse(" ", Node)
+ @test length(doc[1]) == 2
+ @test value(doc[1][1]) == " A "
+ @test value(doc[1][2]) == " B "
+ end
+end
+
+#==============================================================================#
+# XML 1.0 SPEC SECTION 2.6: Processing Instructions #
+#==============================================================================#
+@testset "Spec 2.6: Processing Instructions" begin
+ @testset "xml-stylesheet PI (spec example)" begin
+ doc = parse(""" """, Node)
+ pi = doc[1]
+ @test nodetype(pi) == ProcessingInstruction
+ @test tag(pi) == "xml-stylesheet"
+ @test contains(value(pi), "type=\"text/xsl\"")
+ end
+
+ @testset "PI with no content" begin
+ doc = parse(" ", Node)
+ pi = doc[1]
+ @test nodetype(pi) == ProcessingInstruction
+ @test tag(pi) == "target"
+ @test value(pi) === nothing
+ end
+
+ @testset "PI inside element" begin
+ doc = parse(" ", Node)
+ pi = doc[1][1]
+ @test nodetype(pi) == ProcessingInstruction
+ @test tag(pi) == "mypi"
+ @test value(pi) == "some data"
+ end
+
+ @testset "PI after root element" begin
+ doc = parse(" ", Node)
+ @test nodetype(doc[2]) == ProcessingInstruction
+ @test tag(doc[2]) == "post-process"
+ end
+end
+
+#==============================================================================#
+# XML 1.0 SPEC SECTION 2.7: CDATA Sections #
+#==============================================================================#
+@testset "Spec 2.7: CDATA Sections" begin
+ @testset "CDATA preserves markup characters" begin
+ # Spec example
+ doc = parse("Hello, world!]]> ", Node)
+ cd = doc[1][1]
+ @test nodetype(cd) == CData
+ @test value(cd) == "Hello, world! "
+ end
+
+ @testset "empty CDATA" begin
+ doc = parse(" ", Node)
+ cd = doc[1][1]
+ @test nodetype(cd) == CData
+ @test value(cd) == ""
+ end
+
+ @testset "CDATA with ampersands and less-thans" begin
+ doc = parse(" d]]> ", Node)
+ @test value(doc[1][1]) == "a < b && c > d"
+ end
+
+ @testset "CDATA with special characters" begin
+ doc = parse(" ", Node)
+ @test value(doc[1][1]) == "line1\nline2\ttab"
+ end
+
+ @testset "CDATA mixed with text" begin
+ doc = parse("beforeafter ", Node)
+ @test length(doc[1]) == 3
+ @test nodetype(doc[1][1]) == Text
+ @test value(doc[1][1]) == "before"
+ @test nodetype(doc[1][2]) == CData
+ @test value(doc[1][2]) == "inside"
+ @test nodetype(doc[1][3]) == Text
+ @test value(doc[1][3]) == "after"
+ end
+end
+
+#==============================================================================#
+# XML 1.0 SPEC SECTION 2.8: Prolog and Document Type Declaration #
+#==============================================================================#
+@testset "Spec 2.8: Prolog and Document Type Declaration" begin
+ @testset "XML declaration - version only" begin
+ doc = parse(""" """, Node)
+ decl = doc[1]
+ @test nodetype(decl) == Declaration
+ @test decl["version"] == "1.0"
+ end
+
+ @testset "XML declaration - version and encoding" begin
+ doc = parse(""" """, Node)
+ decl = doc[1]
+ @test decl["version"] == "1.0"
+ @test decl["encoding"] == "UTF-8"
+ end
+
+ @testset "XML declaration - all three pseudo-attributes" begin
+ doc = parse(""" """, Node)
+ decl = doc[1]
+ @test decl["version"] == "1.0"
+ @test decl["encoding"] == "UTF-8"
+ @test decl["standalone"] == "yes"
+ end
+
+ @testset "XML declaration with single quotes" begin
+ doc = parse(" ", Node)
+ @test doc[1]["version"] == "1.0"
+ end
+
+ @testset "no XML declaration" begin
+ doc = parse(" ", Node)
+ @test length(doc) == 1
+ @test nodetype(doc[1]) == Element
+ end
+
+ @testset "DOCTYPE - SYSTEM" begin
+ # Spec example
+ doc = parse(""" """, Node)
+ dtd = doc[1]
+ @test nodetype(dtd) == DTD
+ @test contains(value(dtd), "greeting")
+ @test contains(value(dtd), "SYSTEM")
+ @test contains(value(dtd), "hello.dtd")
+ end
+
+ @testset "DOCTYPE - with internal subset" begin
+ xml = """
+]>Hello, world! """
+ doc = parse(xml, Node)
+ dtd = doc[1]
+ @test nodetype(dtd) == DTD
+ @test contains(value(dtd), "greeting")
+ @test contains(value(dtd), "
+
+
+]> """
+ doc = parse(xml, Node)
+ @test nodetype(doc[1]) == DTD
+ @test contains(value(doc[1]), "ENTITY")
+ end
+
+ @testset "full prolog: declaration + DOCTYPE" begin
+ xml = """ """
+ doc = parse(xml, Node)
+ @test nodetype(doc[1]) == Declaration
+ @test nodetype(doc[2]) == DTD
+ @test nodetype(doc[3]) == Element
+ end
+end
+
+#==============================================================================#
+# XML 1.0 SPEC SECTION 2.9: Standalone Document Declaration #
+#==============================================================================#
+@testset "Spec 2.9: Standalone Document Declaration" begin
+ doc = parse(""" """, Node)
+ @test doc[1]["standalone"] == "yes"
+
+ doc2 = parse(""" """, Node)
+ @test doc2[1]["standalone"] == "no"
+end
+
+#==============================================================================#
+# XML 1.0 SPEC SECTION 2.10: White Space Handling #
+#==============================================================================#
+@testset "Spec 2.10: White Space Handling" begin
+ @testset "parser preserves all text content verbatim" begin
+ doc = parse(" hello ", Node)
+ @test simple_value(doc[1]) == " hello "
+ end
+
+ @testset "parser preserves whitespace-only text" begin
+ doc = parse(" ", Node)
+ @test simple_value(doc[1]) == " "
+ end
+
+ @testset "parser preserves inter-element whitespace as Text nodes" begin
+ xml = "x \n y "
+ doc = parse(xml, Node)
+ @test length(doc[1]) == 3
+ @test value(doc[1][1][1]) == "x"
+ @test nodetype(doc[1][2]) == Text
+ @test value(doc[1][2]) == "\n "
+ @test value(doc[1][3][1]) == "y"
+ end
+
+ @testset "xml:space attribute is preserved during parsing" begin
+ doc = parse(""" text """, Node)
+ @test doc[1]["xml:space"] == "preserve"
+ @test value(doc[1][1][1]) == " text "
+ end
+
+ @testset "xml:space='preserve' affects write formatting" begin
+ # When xml:space="preserve", writer doesn't add indentation
+ el = Element("s", XML.Text(" pre "), Element("t"), XML.Text(" post "); var"xml:space"="preserve")
+ @test XML.write(el) == " pre post "
+ end
+
+ @testset "write formats with indentation by default" begin
+ el = Element("root", Element("a"), Element("b"))
+ s = XML.write(el)
+ @test contains(s, " ") # indented
+ @test contains(s, " ") # indented
+ end
+
+ @testset "Unicode non-breaking space is NOT XML whitespace" begin
nbsp = "\u00A0"
- s = """
- x\t\n
- $(nbsp) y $(nbsp)
- $(nbsp) z $(nbsp)
- """
- d = XML.parse(XML.Node, s)
- @test XML.value(d[1][1][1]) == "x"
- @test XML.value(d[1][2][1]) == "$(nbsp) y $(nbsp)"
- @test XML.value(d[1][3][1]) == "$(nbsp) z $(nbsp)"
- end
-
- @testset "CDATA/Comment/PI boundaries" begin
- s = """
- pre post
- pre post
-
- """
- d = XML.parse(XML.Node, s)
- @test XML.value(d[1][1][1]) == "pre"
- @test nodetype(d[1][1][2]) == XML.CData
- @test XML.value(d[1][1][3]) == "post"
- @test XML.value(d[1][2][1]) == " pre "
- @test nodetype(d[1][2][2]) == XML.Comment
- @test XML.value(d[1][2][3]) == " post "
- @test nodetype(d[1][3]) == XML.ProcessingInstruction
- end
-
- @testset "nested toggles and sibling sequences" begin
- s = """
- a
- b
- c
-
- d
- e
-
- """
- d = XML.parse(XML.Node, s)
- @test XML.value(d[1][2][1]) == " a \n "
- @test XML.value(d[1][2][2][1]) == "b"
- @test XML.value(d[1][2][2][2][1]) == " c "
- @test d[1][2][4].tag == "y2"
- @test XML.value(d[1][2][4][1]) == "d"
- @test d[1][2][6].tag == "w"
- @test XML.value(d[1][2][6][1]) == " e "
- end
-
- @testset "root/document boundaries" begin
- s = "\n \n a \n \t "
- d = XML.parse(XML.Node, s)
- @test length(d) == 1
- @test XML.value(d[1][1]) == "a"
- end
-
- @testset "entities expanding to whitespace" begin
- chr1="\u0020"
- chr2="\u000A"
- chr3="\u00A0"
-
- s = """
- $(chr1) a $(chr2)
- $(chr1) b $(chr2)
- $(chr3)c$(chr3)
- """
- d = XML.parse(XML.Node, s)
- @test XML.value(d[1][1][1]) == "a"
- @test XML.value(d[1][2][1]) == " b \n"
- @test XML.value(d[1][3][1]) == "$(chr3)c$(chr3)"
- end
-
- @testset "invalid values and placement" begin
- s_bad = """ t """
- @test_throws ErrorException XML.parse(XML.Node, s_bad)
-
- s_pi = """ t """
- d = XML.parse(XML.Node, s_pi)
- @test XML.value(d[end][1]) == "t"
-
- s_dup = """ t """
-# @test_throws ErrorException XML.parse(XML.Node, s_dup)
- end
-
- @testset "prev()/next() symmetry" begin
- xml = """
- a b c
- d e f
- i
- """
- r = XML.parse(XML.LazyNode, xml).raw
- toks=XML.Raw[]
- while true
- n = XML.next(r)
- n === nothing && break
- push!(toks, n)
- r=n
- end
- back = XML.Raw[]
- r = toks[end]
- while true
- p = XML.prev(r)
- p === nothing && break
- push!(back, p)
- r = p
- end
- @test reverse(back)[2:end] == toks[1:end-1]
- end
-
- @testset "write/read roundtrip extremes" begin
- xml = """
-
-
- r
- pre post
- """
- n = XML.parse(XML.Node, xml)
- io = IOBuffer(); XML.write(io, n)
- n2 = XML.parse(XML.Node, String(take!(io)))
- @test n == n2
- @test XML.write(n2[1][1]) == "
"
- @test XML.write(n2[1][2]) == " "
- @test XML.value(n2[1][3][1]) == "r"
- @test XML.write(n2[1][4]) == " pre post "
- end
-
- @testset "self-closing/empty/whitespace-only children" begin
- s = """
-
-
-
-
- x y
- """
- d = XML.parse(XML.Node, s)
- @test XML.write(d[1][1]) == ""
- @test XML.write(d[1][2]) == ""
- @test XML.value(d[1][3][1]) == " "
- @test XML.value(d[1][5][1]) == "x"
- @test XML.value(d[1][5][3]) == "y"
- end
-
- @testset "allocation guard: small xml:space doc" begin
- xml = " x y "
- f() = XML.parse(XML.Node, xml)
- a = @allocated f()
- @test a < 500_000 # tune for CI
- end
-
-end
-
-#-----------------------------------------------------------------------------# roundtrip
-@testset "read/write/read roundtrip" begin
- for path in all_files
- node = read(path, Node)
- temp = tempname() * ".xml"
- XML.write(temp, node)
- node2 = read(temp, Node)
- @test node == node2
+ xml = "$(nbsp) y $(nbsp) "
+ doc = parse(xml, Node)
+ @test simple_value(doc[1]) == "$(nbsp) y $(nbsp)"
+ end
+end
+
+#==============================================================================#
+# XML 1.0 SPEC SECTION 3.1: Start-Tags, End-Tags, Empty-Element Tags #
+#==============================================================================#
+@testset "Spec 3.1: Start-Tags, End-Tags, Empty-Element Tags" begin
+ @testset "element with attributes (spec example)" begin
+ # <termdef id="dt-dog" term="dog">
+ doc = parse("""A dog. """, Node)
+ el = doc[1]
+ @test tag(el) == "termdef"
+ @test el["id"] == "dt-dog"
+ @test el["term"] == "dog"
+ @test value(el[1]) == "A dog."
+ end
+
+ @testset "self-closing tag (spec example)" begin
+ # <IMG align="left" src="http://www.w3.org/Icons/WWW/w3c_home" />
+ doc = parse(""" """, Node)
+ el = doc[1]
+ @test tag(el) == "IMG"
+ @test el["align"] == "left"
+ @test el["src"] == "http://www.w3.org/Icons/WWW/w3c_home"
+ @test length(children(el)) == 0
+ end
+
+ @testset "simple self-closing tag" begin
+ doc = parse(" ", Node)
+ @test tag(doc[1]) == "br"
+ @test length(children(doc[1])) == 0
+ end
+
+ @testset "self-closing tag with space before />" begin
+ doc = parse(" ", Node)
+ @test tag(doc[1]) == "br"
+ end
+
+ @testset "empty element with start and end tag" begin
+ doc = parse(" ", Node)
+ el = doc[1]
+ @test tag(el) == "empty"
+ @test isnothing(el.children)
+ end
+
+ @testset "nested elements" begin
+ doc = parse(" ", Node)
+ @test tag(doc[1]) == "a"
+ @test tag(doc[1][1]) == "b"
+ @test tag(doc[1][1][1]) == "c"
+ end
+
+ @testset "sibling elements" begin
+ doc = parse(" ", Node)
+ @test length(doc[1]) == 3
+ @test tag(doc[1][1]) == "a"
+ @test tag(doc[1][2]) == "b"
+ @test tag(doc[1][3]) == "c"
+ end
+
+ @testset "attributes with single quotes" begin
+ doc = parse(" ", Node)
+ @test doc[1]["a"] == "val"
+ end
+
+ @testset "attributes with double quotes" begin
+ doc = parse(""" """, Node)
+ @test doc[1]["a"] == "val"
+ end
+
+ @testset "mixed quote styles in attributes" begin
+ doc = parse(""" """, Node)
+ @test doc[1]["a"] == "1"
+ @test doc[1]["b"] == "2"
+ end
+
+ @testset "attribute with > in value" begin
+ doc = parse(""" """, Node)
+ @test doc[1]["a"] == "1>2"
+ end
+
+ @testset "attribute with entity reference" begin
+ doc = parse(""" """, Node)
+ @test doc[1]["a"] == "a&b"
+ end
+
+ @testset "multiple attributes accessible via attributes()" begin
+ doc = parse(""" """, Node)
+ attrs = attributes(doc[1])
+ @test attrs isa Attributes
+ @test attrs["first"] == "1"
+ @test attrs["second"] == "2"
+ @test attrs["third"] == "3"
+ end
- #For debugging:
- for (a,b) in zip(AbstractTrees.Leaves(node), AbstractTrees.Leaves(node2))
- if a != b
- @info path
- @info a
- @info b
- error()
- end
- end
+ @testset "whitespace around = in attributes" begin
+ doc = parse(""" """, Node)
+ @test doc[1]["a"] == "1"
end
end
-#-----------------------------------------------------------------------------# Node writing
-@testset "Node writing" begin
- doc = Document(
- DTD("root_tag"),
- Declaration(version=1.0),
- Comment("comment"),
- ProcessingInstruction("xml-stylesheet", href="mystyle.css", type="text/css"),
- Element("root_tag", CData("cdata"), Text("text"))
- )
- @test map(nodetype, children(doc)) == [DTD,Declaration,Comment,ProcessingInstruction,Element]
- @test length(children(doc[end])) == 2
- @test nodetype(doc[end][1]) == XML.CData
- @test nodetype(doc[end][2]) == XML.Text
- @test value(doc[end][1]) == "cdata"
- @test value(doc[end][2]) == "text"
-
- #set/get index for attributes
- o = doc[end]
- @test isempty(keys(o))
- o["id"] = 1
- @test o["id"] == "1"
- @test keys(o) == keys(Dict("id" => "1"))
-end
-
-#-----------------------------------------------------------------------------# Issues
-@testset "Issues" begin
- # https://github.com/JuliaComputing/XML.jl/issues/12: DTD content was cut short
- s = """
-
-
-
- ]>
- """
-
- doc = parse(Node, s)
- @test value(only(doc)) == s[11:end-2] # note [...]
-
- # https://github.com/JuliaComputing/XML.jl/issues/14 (Sorted Attributes)
- kw = NamedTuple(OrderedDict(Symbol(k) => Int(k) for k in 'a':'z'))
- xyz = XML.Element("point"; kw...)
- @test collect(keys(attributes(xyz))) == string.(collect('a':'z'))
+#==============================================================================#
+# XML 1.0 SPEC SECTION 4.1: Character and Entity References #
+#==============================================================================#
+@testset "Spec 4.1: Character and Entity References" begin
+ @testset "predefined entity references in text" begin
+ doc = parse("< ", Node)
+ @test simple_value(doc[1]) == "<"
+
+ doc = parse("> ", Node)
+ @test simple_value(doc[1]) == ">"
+
+ doc = parse("& ", Node)
+ @test simple_value(doc[1]) == "&"
+
+ doc = parse("' ", Node)
+ @test simple_value(doc[1]) == "'"
+
+ doc = parse("" ", Node)
+ @test simple_value(doc[1]) == "\""
+ end
+
+ @testset "predefined entities in attribute values" begin
+ doc = parse(""" """, Node)
+ @test doc[1]["a"] == "<>&'\""
+ end
+
+ @testset "multiple entity references in one text node" begin
+ doc = parse("<tag> & "value" ", Node)
+ @test simple_value(doc[1]) == " & \"value\""
+ end
+end
+
+#==============================================================================#
+# NAMESPACES (Colon in Tag and Attribute Names) #
+#==============================================================================#
+@testset "Namespaces" begin
+ @testset "namespaced element" begin
+ doc = parse(""" """, Node)
+ @test tag(doc[1]) == "ns:root"
+ @test doc[1]["xmlns:ns"] == "http://example.com"
+ @test tag(doc[1][1]) == "ns:child"
+ end
+
+ @testset "default namespace" begin
+ doc = parse(""" """, Node)
+ @test doc[1]["xmlns"] == "http://example.com"
+ end
+
+ @testset "multiple namespace prefixes" begin
+ xml = """ """
+ doc = parse(xml, Node)
+ @test tag(doc[1][1]) == "a:x"
+ @test tag(doc[1][2]) == "b:y"
+ end
+end
+
+#==============================================================================#
+# NODE CONSTRUCTORS #
+#==============================================================================#
+@testset "Node Constructors" begin
+ @testset "Text" begin
+ t = Text("hello")
+ @test nodetype(t) == Text
+ @test value(t) == "hello"
+ @test tag(t) === nothing
+ @test attributes(t) === nothing
+ end
+
+ @testset "Comment" begin
+ c = Comment(" a comment ")
+ @test nodetype(c) == Comment
+ @test value(c) == " a comment "
+ end
+
+ @testset "CData" begin
+ cd = CData("raw ")
+ @test nodetype(cd) == CData
+ @test value(cd) == "raw "
+ end
+
+ @testset "DTD" begin
+ d = DTD("html")
+ @test nodetype(d) == DTD
+ @test value(d) == "html"
+ end
+
+ @testset "Declaration" begin
+ decl = Declaration(; version="1.0", encoding="UTF-8")
+ @test nodetype(decl) == Declaration
+ @test decl["version"] == "1.0"
+ @test decl["encoding"] == "UTF-8"
+ end
+
+ @testset "Declaration with no attributes" begin
+ decl = Declaration()
+ @test nodetype(decl) == Declaration
+ @test attributes(decl) === nothing
+ end
+
+ @testset "ProcessingInstruction with content" begin
+ pi = ProcessingInstruction("target", "data here")
+ @test nodetype(pi) == ProcessingInstruction
+ @test tag(pi) == "target"
+ @test value(pi) == "data here"
+ end
+
+ @testset "ProcessingInstruction without content" begin
+ pi = ProcessingInstruction("target")
+ @test nodetype(pi) == ProcessingInstruction
+ @test tag(pi) == "target"
+ @test value(pi) === nothing
+ end
+
+ @testset "Element with tag only" begin
+ el = Element("div")
+ @test nodetype(el) == Element
+ @test tag(el) == "div"
+ @test length(children(el)) == 0
+ end
+
+ @testset "Element with children" begin
+ el = Element("div", Text("hello"), Element("span"))
+ @test length(el) == 2
+ @test nodetype(el[1]) == Text
+ @test nodetype(el[2]) == Element
+ end
+
+ @testset "Element with attributes" begin
+ el = Element("div"; class="main", id="content")
+ @test el["class"] == "main"
+ @test el["id"] == "content"
+ end
+
+ @testset "Element with children and attributes" begin
+ el = Element("a", "click here"; href="http://example.com")
+ @test tag(el) == "a"
+ @test el["href"] == "http://example.com"
+ @test value(el[1]) == "click here"
+ end
+
+ @testset "Element auto-converts non-Node children to Text" begin
+ el = Element("p", 42)
+ @test nodetype(el[1]) == Text
+ @test value(el[1]) == "42"
+ end
+
+ @testset "Document" begin
+ doc = Document(
+ Declaration(; version="1.0"),
+ Element("root")
+ )
+ @test nodetype(doc) == Document
+ @test length(doc) == 2
+ @test nodetype(doc[1]) == Declaration
+ @test nodetype(doc[2]) == Element
+ end
+
+ @testset "Document with all node types" begin
+ doc = Document(
+ Declaration(; version="1.0"),
+ DTD("root"),
+ Comment("comment"),
+ ProcessingInstruction("pi", "data"),
+ Element("root", CData("cdata"), Text("text"))
+ )
+ @test map(nodetype, children(doc)) == [Declaration, DTD, Comment, ProcessingInstruction, Element]
+ @test length(doc[end]) == 2
+ @test nodetype(doc[end][1]) == CData
+ @test value(doc[end][1]) == "cdata"
+ @test nodetype(doc[end][2]) == Text
+ @test value(doc[end][2]) == "text"
+ end
+
+ @testset "invalid constructions" begin
+ @test_throws Exception Text("a", "b") # too many args
+ @test_throws Exception Comment("a"; x="1") # no attrs
+ @test_throws Exception CData("a"; x="1") # no attrs
+ @test_throws Exception DTD("a"; x="1") # no attrs
+ @test_throws Exception Element() # need tag
+ @test_throws Exception Declaration("bad") # no positional args
+ @test_throws Exception Document(; x="1") # no attrs
+ @test_throws Exception ProcessingInstruction() # need target
+ @test_throws Exception ProcessingInstruction("a", "b", "c") # too many args
+ end
+end
+
+#==============================================================================#
+# h CONSTRUCTOR #
+#==============================================================================#
+@testset "h constructor" begin
+ @testset "h(tag)" begin
+ el = h("div")
+ @test nodetype(el) == Element
+ @test tag(el) == "div"
+ end
+
+ @testset "h(tag, children...)" begin
+ el = h("div", "hello")
+ @test simple_value(el) == "hello"
+ end
+
+ @testset "h(tag; attrs...)" begin
+ el = h("div"; class="main")
+ @test el["class"] == "main"
+ end
+
+ @testset "h(tag, children...; attrs...)" begin
+ el = h("div", "hello"; class="main")
+ @test el["class"] == "main"
+ @test value(el[1]) == "hello"
+ end
+
+ @testset "h.tag syntax" begin
+ el = h.div("hello"; class="main")
+ @test tag(el) == "div"
+ @test el["class"] == "main"
+ @test value(el[1]) == "hello"
+ end
+
+ @testset "h.tag with no args" begin
+ el = h.br()
+ @test tag(el) == "br"
+ @test length(children(el)) == 0
+ end
+
+ @testset "h.tag with only attrs" begin
+ el = h.img(; src="image.png")
+ @test tag(el) == "img"
+ @test el["src"] == "image.png"
+ end
+
+ @testset "nested h constructors" begin
+ el = h.div(
+ h.h1("Title"),
+ h.p("Paragraph")
+ )
+ @test tag(el) == "div"
+ @test length(el) == 2
+ @test tag(el[1]) == "h1"
+ @test tag(el[2]) == "p"
+ end
+
+ @testset "h with symbol tag" begin
+ el = h(:div)
+ @test tag(el) == "div"
+ end
+end
+
+#==============================================================================#
+# NODE INTERFACE #
+#==============================================================================#
+@testset "Node Interface" begin
+ doc = parse("""text """, Node)
+
+ @testset "nodetype" begin
+ @test nodetype(doc) == Document
+ @test nodetype(doc[1]) == Declaration
+ @test nodetype(doc[2]) == Element
+ end
+
+ @testset "tag" begin
+ @test tag(doc) === nothing
+ @test tag(doc[2]) == "root"
+ @test tag(doc[2][1]) == "child"
+ end
+
+ @testset "attributes" begin
+ @test attributes(doc) === nothing
+ @test attributes(doc[2])["attr"] == "val"
+ end
+
+ @testset "value" begin
+ @test value(doc) === nothing
+ @test value(doc[2][1][1]) == "text"
+ end
+
+ @testset "children" begin
+ @test length(children(doc)) == 2
+ @test length(children(doc[2])) == 1
+ end
+
+ @testset "is_simple" begin
+ @test is_simple(doc[2][1]) == true
+ @test is_simple(doc[2]) == false
+ end
+
+ @testset "simple_value" begin
+ @test simple_value(doc[2][1]) == "text"
+ @test_throws ErrorException simple_value(doc[2])
+ end
+
+ @testset "simple_value for CData child" begin
+ el = Element("x", CData("data"))
+ @test is_simple(el)
+ @test simple_value(el) == "data"
+ end
+end
+
+#==============================================================================#
+# NODE INDEXING #
+#==============================================================================#
+@testset "Node Indexing" begin  # Covers child indexing (integer, colon, end, only, length) and attribute lookup by string key.
+ doc = parse(" ", Node)  # NOTE(review): literal looks stripped of its XML markup; the assertions below expect a root element with three children tagged a, b and c - confirm against the original file
+ root = doc[1]  # first child of the parsed document is used as the root element
+
+ @testset "integer indexing" begin
+ @test tag(root[1]) == "a"
+ @test tag(root[2]) == "b"
+ @test tag(root[3]) == "c"
+ end
+
+ @testset "colon indexing" begin
+ all = root[:]  # NOTE(review): this local name appears to shadow Base.all inside the testset; consider renaming
+ @test length(all) == 3
+ end
+
+ @testset "lastindex" begin
+ @test tag(root[end]) == "c"
+ end
+
+ @testset "only" begin
+ single = parse(" ", Node)  # NOTE(review): literal looks stripped; the assertion expects a root whose single child is tagged only
+ @test tag(only(single[1])) == "only"
+ end
+
+ @testset "length" begin
+ @test length(root) == 3
+ end
+
+ @testset "attribute indexing" begin
+ el = parse(""" """, Node)[1]  # NOTE(review): literal looks stripped; the assertions expect attributes a = 1 and b = 2
+ @test el["a"] == "1"
+ @test el["b"] == "2"
+ @test_throws KeyError el["nonexistent"]  # a missing attribute key is expected to raise KeyError
+ end
+
+ @testset "haskey" begin
+ el = parse(""" """, Node)[1]  # NOTE(review): literal looks stripped; the assertions expect attribute a present and b absent
+ @test haskey(el, "a") == true
+ @test haskey(el, "b") == false
+ end
+
+ @testset "keys" begin
+ el = parse(""" """, Node)[1]  # NOTE(review): literal looks stripped; the assertion expects attributes a and b, in that order
+ @test collect(keys(el)) == ["a", "b"]
+ end
+
+ @testset "keys on element with no attributes" begin
+ el = parse(" ", Node)[1]  # NOTE(review): literal looks stripped; presumably an element without attributes
+ @test isempty(keys(el))
+ end
+end
+
+#==============================================================================#
+# NODE MUTATION #
+#==============================================================================#
+@testset "Node Mutation" begin  # Covers in-place edits: replacing children, setting attributes, push! and pushfirst!.
+ @testset "setindex! child" begin
+ el = Element("root", Element("old"))
+ el[1] = Element("new")  # integer index replaces the child at that position
+ @test tag(el[1]) == "new"
+ end
+
+ @testset "setindex! child with auto-conversion" begin
+ el = Element("root", Text("old"))
+ el[1] = "new text"  # a plain string is assigned; the next line expects it to be readable through value()
+ @test value(el[1]) == "new text"
+ end
+
+ @testset "setindex! attribute" begin
+ el = Element("root"; a="1")
+ el["a"] = "2"  # string index overwrites the existing attribute
+ @test el["a"] == "2"
+ end
+
+ @testset "setindex! new attribute" begin
+ el = Element("root"; a="1")
+ el["b"] = "2"  # string index with a key that was not set at construction
+ @test el["b"] == "2"
+ end
+
+ @testset "push! child" begin
+ el = Element("root")
+ push!(el, Element("child"))
+ @test length(el) == 1
+ @test tag(el[1]) == "child"
+ end
+
+ @testset "push! with auto-conversion" begin
+ el = Element("root")
+ push!(el, "text")
+ @test nodetype(el[1]) == Text  # the pushed string is expected to be stored as a Text node
+ @test value(el[1]) == "text"
+ end
+
+ @testset "pushfirst! child" begin
+ el = Element("root", Element("second"))
+ pushfirst!(el, Element("first"))
+ @test tag(el[1]) == "first"
+ @test tag(el[2]) == "second"  # the pre-existing child is expected to move to position 2
+ end
+
+ @testset "push! on non-container node errors" begin
+ t = Text("hello")
+ @test_throws ErrorException push!(t, "more")  # push! onto a Text node is expected to raise ErrorException
+ end
+end
+
+#==============================================================================#
+# NODE EQUALITY #
+#==============================================================================#
+@testset "Node Equality" begin  # Covers == and != between nodes built by constructors and by parse.
+ @testset "identical elements are equal" begin
+ a = Element("div", Text("hello"); class="main")
+ b = Element("div", Text("hello"); class="main")
+ @test a == b  # two separately constructed nodes with the same content are expected to compare equal
+ end
+
+ @testset "different tag names are not equal" begin
+ @test Element("a") != Element("b")
+ end
+
+ @testset "different attributes are not equal" begin
+ @test Element("a"; x="1") != Element("a"; x="2")
+ end
+
+ @testset "different children are not equal" begin
+ @test Element("a", Text("x")) != Element("a", Text("y"))
+ end
+
+ @testset "different node types are not equal" begin
+ @test Text("x") != Comment("x")  # same payload, different node type
+ end
+
+ @testset "empty attributes vs nothing" begin  # NOTE(review): both operands below are built by the identical call, so this does not visibly contrast empty attributes with nothing - confirm intent
+ a = Element("a")
+ b = Element("a")
+ @test a == b
+ end
+
+ @testset "parse equality" begin
+ xml = "text "  # NOTE(review): literal looks stripped of its XML markup - confirm against the original file
+ @test parse(xml, Node) == parse(xml, Node)  # parsing the same string twice is expected to give equal trees
+ end
+end
+
+#==============================================================================#
+# XML WRITING #
+#==============================================================================#
+@testset "XML Writing" begin  # Checks XML.write output per node type. NOTE(review): many expected-output literals below look truncated or split across lines (markup apparently stripped) - restore them from the original file before relying on these assertions
+ @testset "write Text" begin
+ el = Element("p", "hello & goodbye")
+ @test XML.write(el) == "hello & goodbye
")
+ end
+
+ @testset "write Element with attributes" begin
+ el = Element("div"; class="main", id="content")
+ s = XML.write(el)
+ @test contains(s, "
")
+ end
+
+ @testset "write self-closing element" begin
+ @test XML.write(Element("br")) == " "
+ end
+
+ @testset "write element with single text child (inline)" begin
+ @test XML.write(Element("p", "hello")) == "hello
")
+ end
+
+ @testset "write element with multiple children (indented)" begin
+ el = Element("div", Element("a"), Element("b"))
+ s = XML.write(el)
+ @test contains(s, "")
+ @test contains(s, "
")
+ @test contains(s, "
")
+ @test contains(s, "
")
+ end
+
+ @testset "write Comment" begin
+ el = Element("root", Comment(" comment "))
+ @test contains(XML.write(el), "")
+ end
+
+ @testset "write CData" begin
+ el = Element("root", CData("raw "))
+ @test contains(XML.write(el), "]]>")
+ end
+
+ @testset "write ProcessingInstruction with content" begin
+ pi = ProcessingInstruction("target", "data")  # NOTE(review): this local name appears to shadow Base.pi inside the testset; consider renaming
+ @test XML.write(pi) == ""
+ end
+
+ @testset "write ProcessingInstruction without content" begin
+ pi = ProcessingInstruction("target")
+ @test XML.write(pi) == ""
+ end
+
+ @testset "write Declaration" begin
+ decl = Declaration(; version="1.0", encoding="UTF-8")
+ s = XML.write(decl)
+ @test contains(s, "")
+ end
+
+ @testset "write DTD" begin
+ dtd = DTD("html")
+ @test XML.write(dtd) == ""
+ end
+
+ @testset "write Document" begin
+ doc = Document(Declaration(; version="1.0"), Element("root"))
+ s = XML.write(doc)
+ @test startswith(s, "")
+ end
+
+ @testset "write escapes special characters in text" begin
+ el = Element("p", "a < b & c > d")
+ @test XML.write(el) == "a < b & c > d
")
+ end
+
+ @testset "write escapes special characters in attribute values" begin
+ el = Element("x"; a="a\"b")
+ @test contains(XML.write(el), "a=\"a"b\"")
+ end
+
+ @testset "indentsize parameter" begin
+ el = Element("root", Element("child"))
+ s2 = XML.write(el; indentsize=2)
+ s4 = XML.write(el; indentsize=4)
+ @test contains(s2, " ")
+ @test contains(s4, " ")
+ end
+
+ @testset "write xml:space='preserve' respects whitespace" begin
+ el = Element("root", Element("p", Text(" hello "); var"xml:space"="preserve"))
+ s = XML.write(el)
+ @test contains(s, "> hello
")
+ end
+end
+
+#==============================================================================#
+# WRITE TO FILE / READ FROM FILE #
+#==============================================================================#
+@testset "File I/O" begin
+ @testset "write and read back" begin
+ doc = Document(
+ Declaration(; version="1.0"),
+ Element("root", Element("child", "text"))
+ )
+ temp = tempname() * ".xml"
+ XML.write(temp, doc)
+ content = read(temp, String)
+ @test contains(content, "")
+ @test contains(content, "