diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 5e073ac..b52ab78 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -26,7 +26,7 @@ jobs: - os: macOS-latest arch: x86 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: julia-actions/setup-julia@v2 with: version: ${{ matrix.version }} @@ -41,9 +41,13 @@ jobs: ${{ runner.os }}-test-${{ env.cache-name }}- ${{ runner.os }}-test- ${{ runner.os }}- + - uses: actions/cache@v4 + with: + path: test/data/w3c + key: w3c-xmlconf-v20130923 - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v1 + - uses: codecov/codecov-action@v5 with: - file: lcov.info + files: lcov.info diff --git a/.gitignore b/.gitignore index b000475..929dfc2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ *Manifest.toml -*generated_xsd.jl -*.xml *.gz +*.tar *.DS_Store +*.claude +test/data/w3c/ +benchmarks/data/ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..3f00ed1 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,157 @@ +# Changelog + +All notable changes to XML.jl will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added +- New streaming tokenizer (`XMLTokenizer` module) for fine-grained XML token iteration. +- XPath support via `xpath(node, path)`. +- `test/test_libxml2_testcases.jl`: 243 test cases borrowed from the [libxml2](https://github.com/GNOME/libxml2) test suite covering CDATA, comments, processing instructions, attributes, namespaces, DTD internal subsets, entity references, whitespace handling, Unicode, error cases, and real-world document patterns. 
+ +### Fixed +- **Tokenizer: multi-byte UTF-8 in attribute values** — Parsing attribute values containing multi-byte UTF-8 characters (e.g., `<a name="é"/>`) could produce a `StringIndexError` because `attr_value()` used byte arithmetic (`ncodeunits - 1`) instead of `prevind` to strip quotes. The same issue existed in `_read_attr_value!`. +- **Tokenizer: quotes inside DTD comments** — A `"` or `'` character inside a `<!-- ... -->` comment within a DTD internal subset caused the tokenizer to misinterpret it as a quoted string delimiter, leading to an "Unterminated quoted string" error. The DOCTYPE body parser now correctly skips comment content. + +## [0.3.8] + +### Fixed +- `XML.write` now respects `xml:space="preserve"` and suppresses indentation for elements with this attribute ([#49]). + +## [0.3.7] + +### Fixed +- Resolved remaining issues from [#45] and fixed [#46] (whitespace preservation edge cases) ([#47]). + +## [0.3.6] + +### Added +- `XML.write` respects `xml:space="preserve"` on elements, suppressing automatic indentation ([#45]). + +### Fixed +- `String` type ambiguity on Julia nightly resolved ([#38]). + +## [0.3.5] + +### Fixed +- `depth` and `parent` functions corrected to work properly with the DOM tree API ([#37]). +- `escape` updated to no longer be idempotent — every `&` is now escaped, matching spec behavior ([#32], addressing [#31]). +- `pushfirst!` support added for `Node` children ([#29]). + +## [0.3.4] + +### Fixed +- Fixed [#26]. +- CI updated to use `julia-actions/cache@v4` and `lts` Julia version. + +## [0.3.3] + +### Added +- `h` constructor for concise element creation (e.g., `h.div("hello"; class="main")`). + +### Fixed +- Path definition error in README example ([#20]). + +## [0.3.2] + +### Fixed +- Minor typos. + +## [0.3.1] + +### Added +- Julia 1.6 compatibility ([#16]). + +### Changed +- Smarter escaping logic. + +## [0.3.0] + +### Changed +- Attribute internal representation changed from `Dict` to `OrderedDict` (later reverted to `Vector{Pair}`). 
+ +## [0.2.3] + +### Fixed +- Parse method fix. + +## [0.2.2] + +### Added +- DTD parsing via `parse_dtd`. +- `is_simple` and `simple_value` exports. +- `setindex!` methods for modifying attributes. +- `unescape` function. + +### Fixed +- DOCTYPE parsing made case-insensitive. + +## [0.2.1] + +### Fixed +- Write output fixes. + +## [0.2.0] + +### Changed +- Major rewrite: introduced `NodeType` enum, `Node{S}` parametric struct, callable `NodeType` constructors, and `XML.write`. +- Processing instruction support. +- Benchmarks added. + +## [0.1.3] + +### Changed +- Improved print output for `AbstractXMLNode`. + +## [0.1.2] + +### Added +- AbstractTrees 0.4 compatibility ([#5]). + +## [0.1.1] + +### Added +- `Node` implementation with `print_tree`. +- Color output in REPL display. +- Stopped stripping whitespace from text nodes. + +## [0.1.0] + +- Initial release. + +[Unreleased]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.8...HEAD +[0.3.8]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.7...v0.3.8 +[0.3.7]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.6...v0.3.7 +[0.3.6]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.5...v0.3.6 +[0.3.5]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.4...v0.3.5 +[0.3.4]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.3...v0.3.4 +[0.3.3]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.2...v0.3.3 +[0.3.2]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.1...v0.3.2 +[0.3.1]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.0...v0.3.1 +[0.3.0]: https://github.com/JuliaComputing/XML.jl/compare/v0.2.3...v0.3.0 +[0.2.3]: https://github.com/JuliaComputing/XML.jl/compare/v0.2.2...v0.2.3 +[0.2.2]: https://github.com/JuliaComputing/XML.jl/compare/v0.2.1...v0.2.2 +[0.2.1]: https://github.com/JuliaComputing/XML.jl/compare/v0.2.0...v0.2.1 +[0.2.0]: https://github.com/JuliaComputing/XML.jl/compare/v0.1.3...v0.2.0 +[0.1.3]: 
https://github.com/JuliaComputing/XML.jl/compare/v0.1.2...v0.1.3 +[0.1.2]: https://github.com/JuliaComputing/XML.jl/compare/v0.1.1...v0.1.2 +[0.1.1]: https://github.com/JuliaComputing/XML.jl/compare/v0.1.0...v0.1.1 +[0.1.0]: https://github.com/JuliaComputing/XML.jl/releases/tag/v0.1.0 + +[#5]: https://github.com/JuliaComputing/XML.jl/pull/5 +[#16]: https://github.com/JuliaComputing/XML.jl/pull/16 +[#20]: https://github.com/JuliaComputing/XML.jl/pull/20 +[#26]: https://github.com/JuliaComputing/XML.jl/issues/26 +[#29]: https://github.com/JuliaComputing/XML.jl/pull/29 +[#31]: https://github.com/JuliaComputing/XML.jl/issues/31 +[#32]: https://github.com/JuliaComputing/XML.jl/pull/32 +[#37]: https://github.com/JuliaComputing/XML.jl/pull/37 +[#38]: https://github.com/JuliaComputing/XML.jl/pull/38 +[#43]: https://github.com/JuliaComputing/XML.jl/issues/43 +[#45]: https://github.com/JuliaComputing/XML.jl/pull/45 +[#46]: https://github.com/JuliaComputing/XML.jl/issues/46 +[#47]: https://github.com/JuliaComputing/XML.jl/pull/47 +[#49]: https://github.com/JuliaComputing/XML.jl/pull/49 diff --git a/Project.toml b/Project.toml index 49b96c0..65925ad 100644 --- a/Project.toml +++ b/Project.toml @@ -1,12 +1,7 @@ name = "XML" uuid = "72c71f33-b9b6-44de-8c94-c961784809e2" +version = "0.4.0" authors = ["Josh Day and contributors"] -version = "0.3.8" - -[deps] -Mmap = "a63ad114-7e13-5084-954f-fe012c677804" -OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" [compat] -OrderedCollections = "1.4, 1.5" -julia = "1.6" +julia = "1.9" diff --git a/README.md b/README.md index ddb1156..e50a591 100644 --- a/README.md +++ b/README.md @@ -4,39 +4,8 @@

Read and write XML in pure Julia.

-

- -# Introduction - -This package offers fast data structures for reading and writing XML files with a consistent interface: -
-### `Node`/`LazyNode` Interface: - -``` -nodetype(node) → XML.NodeType (an enum type) -tag(node) → String or Nothing -attributes(node) → OrderedDict{String, String} or Nothing -value(node) → String or Nothing -children(node) → Vector{typeof(node)} -is_simple(node) → Bool (whether node is simple .e.g. item) -simple_value(node) → e.g. "item" from item) -``` - -
- -### Extended Interface for `LazyNode` - -``` -depth(node) → Int -next(node) → typeof(node) -prev(node) → typeof(node) -parent(node) → typeof(node) -``` - -

- # Quickstart ```julia @@ -58,79 +27,76 @@ doc[end][2] # Second child of root # Node Element (6 children) ``` -

- -# Data Structures that Represent XML Nodes +
-## Preliminary: `NodeType` +# `Node` Interface -- Each item in an XML DOM is classified by its `NodeType`. -- Every `XML.jl` struct defines a `nodetype(x)` method that returns its `NodeType`. +Every node in the XML DOM is represented by `Node`, a single type parametrized on its string storage. -| NodeType | XML Representation | `Node` Constructor | -|----------|--------------------|------------------| -| `Document` | An entire document | `Document(children...)` -| `DTD` | `<!DOCTYPE ...>` | `DTD(...) ` -| `Declaration` | `<?xml attributes... ?>` | `Declaration(; attrs...)` -| `ProcessingInstruction` | `<?tag attributes... ?>` | `ProcessingInstruction(tag; attrs...)` -| `Comment` | `<!-- text -->` | `Comment(text)` -| `CData` | `<![CDATA[text]]>` | `CData(text)` -| `Element` | `<tag attributes... > children... </tag>` | `Element(tag, children...; attrs...)` -| `Text` | the `text` part of `<tag>text</tag>` | `Text(text)` +``` +nodetype(node) -> XML.NodeType (an enum) +tag(node) -> String or Nothing +attributes(node) -> Dict{String, String} or Nothing +value(node) -> String or Nothing +children(node) -> Vector{Node} +is_simple(node) -> Bool (e.g. <tag>text</tag>) +simple_value(node) -> e.g. "text" from <tag>text</tag> +```
-## `Node`: Probably What You're Looking For - -- `read`-ing a `Node` loads the entire XML DOM in memory. -- See the table above for convenience constructors. -- `Node`s have some additional methods that aid in construction/mutation: - -```julia -# Add a child: -push!(parent::Node, child::Node) +## `NodeType` -# Replace a child: -parent[2] = child +Each item in an XML DOM is classified by its `NodeType`: -# Add/change an attribute: -node["key"] = value +| NodeType | XML Representation | Constructor | +|----------|--------------------|-------------| +| `Document` | An entire document | `Document(children...)` | +| `DTD` | `<!DOCTYPE ...>` | `DTD(...)` | +| `Declaration` | `<?xml attributes... ?>` | `Declaration(; attrs...)` | +| `ProcessingInstruction` | `<?tag attributes... ?>` | `ProcessingInstruction(tag; attrs...)` | +| `Comment` | `<!-- text -->` | `Comment(text)` | +| `CData` | `<![CDATA[text]]>` | `CData(text)` | +| `Element` | `<tag attributes... > children... </tag>` | `Element(tag, children...; attrs...)` | +| `Text` | the `text` part of `<tag>text</tag>` | `Text(text)` | -node["key"] -``` +
-- `Node` is an immutable type. However, you can easily create a copy with one or more field values changed by using the `Node(::Node, children...; attrs...)` constructor where `children` are appended to the source node's children and `attrs` are appended to the node's attributes. +## Mutation ```julia -node = XML.Element("tag", "child") -# Node Element (1 child) +push!(parent, child) # Add a child +parent[2] = child # Replace a child +node["key"] = "value" # Add/change an attribute +node["key"] # Get an attribute +``` -simple_value(node) -# "child" +
-node2 = Node(node, "added"; id="my-id") -# Node Element (2 children) +## Tree Navigation -node2.children -# 2-element Vector{Node}: -# Node Text "child" -# Node Text "added" +```julia +depth(child, root) # Depth of child relative to root +parent(child, root) # Parent of child within root's tree +siblings(child, root) # Siblings of child within root's tree ``` -### Writing `Element` `Node`s with `XML.h` +
+ +## Writing Elements with `XML.h` Similar to [Cobweb.jl](https://github.com/JuliaComputing/Cobweb.jl#-creating-nodes-with-cobwebh), `XML.h` enables you to write elements with a simpler syntax: ```julia using XML: h -julia> node = h.parent( - h.child("first child content", id="id1"), - h.child("second child content", id="id2") - ) +node = h.parent( + h.child("first child content", id="id1"), + h.child("second child content", id="id2") +) # Node Element (2 children) -julia> print(XML.write(node)) +print(XML.write(node)) # <parent> # <child id="id1">first child content</child> # <child id="id2">second child content</child> @@ -139,111 +105,193 @@ julia> print(XML.write(node))
-## `XML.LazyNode`: For Fast Iteration through an XML File +# Reading -A lazy data structure that just keeps track of the position in the raw data (`Vector{UInt8}`) to read from. +```julia +# From a file: +read(filename, Node) -- You can iterate over a `LazyNode` to "read" through an XML file: +# From a string: +parse(str, Node) +``` + +
+ +# Writing ```julia -doc = read(filename, LazyNode) - -foreach(println, doc) -# LazyNode Declaration -# LazyNode Element -# LazyNode Element -# LazyNode Element -# LazyNode Text "Gambardella, Matthew" -# LazyNode Element -# ⋮ +XML.write(filename::String, node) # write to file +XML.write(io::IO, node) # write to stream +XML.write(node) # return String ``` -<br><br> +`XML.write` respects `xml:space="preserve"` on elements, suppressing automatic indentation. -# Reading +<br> -```julia -# Reading from file: -read(filename, Node) -read(filename, LazyNode) +# XPath -# Parsing from string: -parse(Node, str) -parse(LazyNode, str) +Query nodes using a subset of XPath 1.0 via `xpath(node, path)`: +```julia +doc = parse(""" +<root> + <a id="1"><b>hello</b></a> + <a id="2"><b>world</b></a> +</root> +""", Node) + +root = doc[end] + +xpath(root, "//b") # All <b> descendants +xpath(root, "a[@id='2']/b") # <b> inside <a id="2"> +xpath(root, "a[1]") # First <a> child +xpath(root, "//b/text()") # Text nodes inside all <b>s ``` -<br><br> +### Supported syntax + +| Expression | Description | +|------------|-------------| +| `/` | Root / path separator | +| `tag` | Child element by name | +| `*` | Any child element | +| `//` | Descendant-or-self (recursive) | +| `.` | Current node | +| `..` | Parent node | +| `[n]` | Positional predicate (1-based) | +| `[@attr]` | Has-attribute predicate | +| `[@attr='v']` | Attribute-value predicate | +| `text()` | Text node children | +| `node()` | All node children | +| `@attr` | Attribute value (returns strings) | -# Writing +<br> + +# Streaming Tokenizer + +For large files or when you need fine-grained control, `XML.XMLTokenizer` provides a streaming tokenizer that yields tokens without building a DOM: ```julia -XML.write(filename::String, node) # write to file +using XML.XMLTokenizer + +for token in tokenize("<root><child attr=\"val\">text</child></root>") + println(token.kind, " => ", repr(String(token.raw))) +end +# TOKEN_OPEN_TAG => "<root" 
+# TOKEN_TAG_CLOSE => ">" +# TOKEN_OPEN_TAG => "<child" +# TOKEN_ATTR_NAME => "attr" +# TOKEN_ATTR_VALUE => "\"val\"" +# TOKEN_TAG_CLOSE => ">" +# TOKEN_TEXT => "text" +# TOKEN_CLOSE_TAG => "</child" +# TOKEN_TAG_CLOSE => ">" +# TOKEN_CLOSE_TAG => "</root" +# TOKEN_TAG_CLOSE => ">" +``` -XML.write(io::IO, node) # write to stream +<br> + +# `LazyNode` + +For read-only access without building a full DOM tree, use `LazyNode`. It stores only a reference to the source string and re-tokenizes on demand, using significantly less memory: -XML.write(node) # String +```julia +doc = parse(xml_string, LazyNode) +doc = read("file.xml", LazyNode) ``` +`LazyNode` supports the same read-only interface as `Node`: `nodetype`, `tag`, `attributes`, `value`, `children`, `is_simple`, `simple_value`, plus integer and string indexing. -<br><br> +### Memory-mapped files -# Performance +For very large files, combine `LazyNode` with memory mapping to avoid reading the entire file into heap memory: -- XML.jl performs comparatively to [EzXML.jl](https://github.com/JuliaIO/EzXML.jl), which wraps the C library [libxml2](https://gitlab.gnome.org/GNOME/libxml2/-/wikis/home). -- See the `benchmarks/suite.jl` for the code to produce these results. 
-- The following output was generated in a Julia session with the following `versioninfo`: +```julia +using XML, Mmap, StringViews -``` -julia> versioninfo() -Julia Version 1.9.4 -Commit 8e5136fa297 (2023-11-14 08:46 UTC) -Build Info: - Official https://julialang.org/ release -Platform Info: - OS: macOS (arm64-apple-darwin22.4.0) - CPU: 10 × Apple M1 Pro - WORD_SIZE: 64 - LIBM: libopenlibm - LLVM: libLLVM-14.0.6 (ORCJIT, apple-m1) - Threads: 8 on 8 virtual cores +doc = open("very_large.xml") do io + sv = StringView(Mmap.mmap(io)) + parse(sv, LazyNode) +end ``` +<br> -### Reading an XML File +# Benchmarks -``` - XML.LazyNode 0.009583 - XML.Node ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 1071.32 - EzXML.readxml ■■■■■■■■■ 284.346 - XMLDict.xml_dict ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 1231.47 -``` +Benchmark source: [benchmarks.jl](benchmarks/benchmarks.jl). Test data: `books.xml` (small, ~4 KB) and a generated XMark auction XML (medium, ~14 MB). -### Writing an XML File -``` - Write: XML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 289.638 - Write: EzXML ■■■■■■■■■■■■■ 93.4631 -``` -### Lazily Iterating over Each Node -``` - LazyNode ■■■■■■■■■ 51.752 - EzXML.StreamReader ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 226.271 ``` + Parse (small) — median time (ms) -### Collecting All Names/Tags in an XML File -``` - XML.LazyNode ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 210.482 - EzXML.StreamReader ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 276.238 - EzXML.readxml ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 263.269 -``` + XML.jl ■■■■■■■ 0.041 + XML.jl (SS) ■■■■■■ 0.034 + EzXML ■■■■■ 0.030 + LightXML ■■■■■■ 0.033 + XMLDict ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 0.232 + + + Parse (medium) — median time (ms) + + XML.jl ■■■■■■■■■■■■ 194.2 + XML.jl (SS) ■■■■■■■■■■ 172.8 + EzXML ■■■■■■ 105.8 + LightXML ■■■■■■ 105.0 + XMLDict ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 687.7 + + + Write (small) — median time (ms) + + XML.jl ■■■■■■■■ 0.021 + EzXML ■■■■ 0.012 + LightXML 
■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 0.110 + + + Write (medium) — median time (ms) + + XML.jl ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 93.2 + EzXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 84.6 + LightXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■ 60.4 -<br> -<br> -# Possible Gotchas + Read file — median time (ms) -- XML.jl doesn't automatically escape special characters (`<`, `>`, `&`, `"`, and `'` ) for you. However, we provide utility functions for doing the conversions back and forth: - - `XML.escape(::String)` and `XML.unescape(::String)` - - `XML.escape!(::Node)` and `XML.unescape!(::Node)`. + XML.jl ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 214.1 + EzXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■ 143.1 + LightXML ■■■■■■■■■■■■■■■■■■■■■■■ 121.9 + + + Collect tags (small) — median time (ms) + + XML.jl ■■■■■■ 0.000698 + EzXML ■■■■■■■■■■■■■■■■■■■■■■■ 0.00255 + LightXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 0.00430 + + + Collect tags (medium) — median time (ms) + + XML.jl ■■■■■■■■■■■■■■■■■■■ 12.6 + EzXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 20.5 + LightXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 27.6 +``` + +```julia +versioninfo() +# Julia Version 1.12.5 +# Commit 5fe89b8ddc1 (2026-02-09 16:05 UTC) +# Build Info: +# Official https://julialang.org release +# Platform Info: +# OS: macOS (arm64-apple-darwin24.0.0) +# CPU: 10 × Apple M1 Pro +# WORD_SIZE: 64 +# LLVM: libLLVM-18.1.7 (ORCJIT, apple-m1) +# GC: Built with stock GC +# Threads: 8 default, 1 interactive, 8 GC (on 8 virtual cores) +# Environment: +# JULIA_NUM_THREADS = auto +``` diff --git a/benchmarks/Project.toml b/benchmarks/Project.toml index ed90996..043988c 100644 --- a/benchmarks/Project.toml +++ b/benchmarks/Project.toml @@ -2,7 +2,8 @@ BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615" -OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" +LightXML = 
"9c8b4983-aa76-5018-a973-4c85ecc9e179" UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228" XML = "72c71f33-b9b6-44de-8c94-c961784809e2" XMLDict = "228000da-037f-5747-90a9-8195ccbf91a5" diff --git a/benchmarks/XMarkGenerator.jl b/benchmarks/XMarkGenerator.jl new file mode 100644 index 0000000..7f780a0 --- /dev/null +++ b/benchmarks/XMarkGenerator.jl @@ -0,0 +1,377 @@ +""" + XMarkGenerator + +XMark-inspired XML benchmark data generator. Produces well-formed XML documents modeling an +internet auction site, following the XMark benchmark DTD structure. + + include("xml_generator.jl") + using .XMarkGenerator + + xml = generate_xmark(1.0) # return String (~14 MB) + generate_xmark("out.xml", 5.0) # write to file (~68 MB) + generate_xmark(stdout, 0.1; seed=123) # write to IO (~1.4 MB) +""" +module XMarkGenerator + +using Random + +export generate_xmark + +#-----------------------------------------------------------------# Word lists +const WORDS = [ + "about", "above", "across", "after", "again", "against", "along", "already", "also", + "always", "among", "another", "answer", "around", "asked", "away", "back", "because", + "become", "been", "before", "began", "behind", "being", "below", "between", "body", + "book", "both", "brought", "build", "built", "business", "came", "cannot", "carry", + "cause", "certain", "change", "children", "city", "close", "come", "complete", "could", + "country", "course", "cover", "current", "dark", "days", "deep", "development", + "different", "direction", "does", "done", "door", "down", "draw", "during", "each", + "early", "earth", "east", "education", "effort", "eight", "either", "else", "end", + "enough", "even", "every", "example", "experience", "face", "fact", "family", "feel", + "field", "find", "first", "five", "follow", "food", "force", "form", "found", "four", + "from", "full", "gave", "general", "give", "going", "gone", "good", "government", + "great", "green", "ground", "group", "grow", "half", "hand", "happen", "hard", "have", + 
"head", "help", "here", "high", "himself", "hold", "home", "hope", "house", "however", + "hundred", "idea", "important", "inch", "include", "increase", "island", "just", "keep", + "kind", "knew", "know", "land", "large", "last", "later", "learn", "left", "less", + "letter", "life", "light", "like", "line", "list", "little", "live", "long", "look", + "lost", "made", "main", "make", "many", "mark", "matter", "mean", "might", "mind", + "miss", "money", "morning", "most", "mother", "move", "much", "music", "must", "name", + "near", "need", "never", "next", "night", "nothing", "notice", "number", "often", + "once", "only", "open", "order", "other", "over", "page", "paper", "part", "past", + "pattern", "people", "perhaps", "period", "person", "picture", "place", "plan", "plant", + "play", "point", "position", "possible", "power", "present", "problem", "produce", + "product", "program", "public", "pull", "purpose", "question", "quite", "reach", "read", + "real", "receive", "record", "remember", "rest", "result", "right", "river", "room", + "round", "rule", "same", "school", "second", "seem", "sentence", "service", "seven", + "several", "shall", "short", "should", "show", "side", "since", "sing", "size", "small", + "social", "some", "song", "soon", "south", "space", "stand", "start", "state", "still", + "stood", "story", "strong", "study", "such", "sure", "system", "table", "take", "tell", + "test", "their", "them", "then", "there", "these", "thing", "think", "those", "thought", + "three", "through", "time", "together", "took", "toward", "travel", "tree", "true", + "turn", "under", "unit", "until", "upon", "usually", "value", "very", "voice", "walk", + "want", "watch", "water", "well", "went", "were", "west", "what", "where", "which", + "while", "white", "whole", "will", "with", "without", "woman", "word", "work", "world", + "would", "write", "year", "young", +] +const FIRST_NAMES = ["James", "John", "Robert", "Michael", "William", "David", "Richard", + "Joseph", "Thomas", 
"Charles", "Mary", "Patricia", "Jennifer", "Linda", "Barbara", + "Elizabeth", "Susan", "Jessica", "Sarah", "Karen"] +const LAST_NAMES = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", + "Davis", "Rodriguez", "Martinez", "Wilson", "Anderson", "Taylor", "Thomas", "Hernandez", + "Moore", "Martin", "Jackson", "Thompson", "White"] +const COUNTRIES = ["United States", "Germany", "France", "Japan", "Australia", "Brazil", + "Canada", "India", "China", "Mexico", "Argentina", "Spain", "Italy", "United Kingdom", + "Netherlands", "Sweden", "Norway", "Finland", "Denmark", "Belgium"] +const CITIES = ["New York", "London", "Paris", "Tokyo", "Sydney", "Berlin", "Rome", + "Madrid", "Amsterdam", "Toronto", "Moscow", "Beijing", "Seoul", "Mumbai", "Cairo", + "Dublin", "Prague", "Vienna", "Warsaw", "Budapest"] +const STREETS = ["Main", "Oak", "Elm", "Maple", "Pine", "Cedar", "Birch", "Walnut", + "Cherry", "Ash", "Spruce", "Willow", "Poplar", "Laurel", "Juniper"] +const EDUCATIONS = ["High School", "College", "Graduate", "Associate", "Master", "Doctorate"] +const GENDERS = ["male", "female"] +const PAYMENTS = ["Creditcard", "Money order", "Personal check", "Cash"] +const SHIPPING = ["Will ship only within country", "Will ship internationally", + "Buyer pays fixed shipping costs", "Free shipping", "See description for shipping"] +const REGIONS = ["africa", "asia", "australia", "europe", "namerica", "samerica"] + +#-----------------------------------------------------------------# Random data helpers +rand_word(rng) = rand(rng, WORDS) +rand_date(rng) = string(rand(rng, 1999:2025), "/", lpad(rand(rng, 1:12), 2, '0'), "/", lpad(rand(rng, 1:28), 2, '0')) +rand_time(rng) = string(lpad(rand(rng, 0:23), 2, '0'), ":", lpad(rand(rng, 0:59), 2, '0'), ":", lpad(rand(rng, 0:59), 2, '0')) +rand_price(rng) = string(rand(rng, 1:9999), ".", lpad(rand(rng, 0:99), 2, '0')) +rand_phone(rng) = string("+", rand(rng, 1:99), " (", rand(rng, 100:999), ") ", rand(rng, 1000000:9999999)) 
+rand_zip(rng) = string(lpad(rand(rng, 0:99999), 5, '0')) +rand_cc(rng) = join(rand(rng, 1000:9999, 4), " ") +rand_email(rng) = string(lowercase(rand(rng, FIRST_NAMES)), rand(rng, 1:999), "@", lowercase(rand(rng, LAST_NAMES)), ".com") + +#-----------------------------------------------------------------# XML writing helpers +function xml_escape_char(io::IO, c::Char) + if c == '&'; print(io, "&amp;") + elseif c == '<'; print(io, "&lt;") + elseif c == '>'; print(io, "&gt;") + elseif c == '"'; print(io, "&quot;") + else; print(io, c) + end +end + +function write_escaped(io::IO, s::AbstractString) + for c in s + xml_escape_char(io, c) + end +end + +function write_text_content(rng, io; min_words=10, max_words=50) + n = rand(rng, min_words:max_words) + for i in 1:n + i > 1 && print(io, ' ') + w = rand_word(rng) + r = rand(rng) + if r < 0.03 + print(io, "<bold>", w, "</bold>") + elseif r < 0.06 + print(io, "<emph>", w, "</emph>") + elseif r < 0.08 + print(io, "<keyword>", w, "</keyword>") + else + print(io, w) + end + end +end + +function write_description(rng, io, indent) + println(io, indent, "<description>") + if rand(rng) < 0.7 + print(io, indent, " <text>") + write_text_content(rng, io; min_words=15, max_words=80) + println(io, "</text>") + else + println(io, indent, " <parlist>") + for _ in 1:rand(rng, 2:6) + print(io, indent, " <listitem><text>") + write_text_content(rng, io; min_words=8, max_words=40) + println(io, "</text></listitem>") + end + println(io, indent, " </parlist>") + end + println(io, indent, "</description>") +end + +function write_annotation(rng, io, indent, n_people) + println(io, indent, "<annotation>") + println(io, indent, " <author person=\"", string("person",rand(rng, 1:n_people)), "\"/>") + write_description(rng, io, string(indent, " ")) + println(io, indent, " <happiness>", rand(rng, 1:10), "</happiness>") + println(io, indent, "</annotation>") +end + +#-----------------------------------------------------------------# Section writers +function 
write_item(rng, io, id, n_categories) + featured = rand(rng) < 0.1 ? " featured=\"yes\"" : "" + println(io, " <item id=\"", string("item",id), "\"", featured, ">") + println(io, " <location>", rand(rng, CITIES), "</location>") + println(io, " <quantity>", rand(rng, 1:50), "</quantity>") + println(io, " <name>", rand_word(rng), " ", rand_word(rng), " ", rand_word(rng), "</name>") + println(io, " <payment>", rand(rng, PAYMENTS), "</payment>") + write_description(rng, io, " ") + println(io, " <shipping>", rand(rng, SHIPPING), "</shipping>") + for _ in 1:rand(rng, 1:3) + println(io, " <incategory category=\"", string("category",rand(rng, 1:n_categories)), "\"/>") + end + println(io, " <mailbox>") + for _ in 1:rand(rng, 0:5) + println(io, " <mail>") + println(io, " <from>", rand_email(rng), "</from>") + println(io, " <to>", rand_email(rng), "</to>") + println(io, " <date>", rand_date(rng), "</date>") + print(io, " <text>") + write_text_content(rng, io; min_words=10, max_words=60) + println(io, "</text>") + println(io, " </mail>") + end + println(io, " </mailbox>") + println(io, " </item>") +end + +function write_categories(rng, io, n) + println(io, " <categories>") + for i in 1:n + println(io, " <category id=\"", string("category",i), "\">") + println(io, " <name>", rand_word(rng), " ", rand_word(rng), "</name>") + write_description(rng, io, " ") + println(io, " </category>") + end + println(io, " </categories>") +end + +function write_catgraph(rng, io, n_edges, n_categories) + println(io, " <catgraph>") + for _ in 1:n_edges + from = string("category",rand(rng, 1:n_categories)) + to = string("category",rand(rng, 1:n_categories)) + println(io, " <edge from=\"", from, "\" to=\"", to, "\"/>") + end + println(io, " </catgraph>") +end + +function write_people(rng, io, n, n_categories, n_open) + println(io, " <people>") + for i in 1:n + println(io, " <person id=\"", string("person",i), "\">") + println(io, " <name>", rand(rng, FIRST_NAMES), " ", rand(rng, LAST_NAMES), 
"</name>") + println(io, " <emailaddress>", rand_email(rng), "</emailaddress>") + if rand(rng) < 0.8 + println(io, " <phone>", rand_phone(rng), "</phone>") + end + if rand(rng) < 0.7 + println(io, " <address>") + println(io, " <street>", rand(rng, 1:9999), " ", rand(rng, STREETS), " St</street>") + println(io, " <city>", rand(rng, CITIES), "</city>") + println(io, " <country>", rand(rng, COUNTRIES), "</country>") + if rand(rng) < 0.5 + println(io, " <province>", rand_word(rng), "</province>") + end + println(io, " <zipcode>", rand_zip(rng), "</zipcode>") + println(io, " </address>") + end + if rand(rng) < 0.5 + println(io, " <homepage>http://www.", lowercase(rand(rng, LAST_NAMES)), ".com/~", + lowercase(rand(rng, FIRST_NAMES)), "</homepage>") + end + if rand(rng) < 0.6 + println(io, " <creditcard>", rand_cc(rng), "</creditcard>") + end + if rand(rng) < 0.7 + income = rand(rng) < 0.8 ? string(" income=\"", rand(rng, 10000.0:0.01:250000.0), "\"") : "" + println(io, " <profile", income, ">") + for _ in 1:rand(rng, 0:4) + println(io, " <interest category=\"", string("category",rand(rng, 1:n_categories)), "\"/>") + end + if rand(rng) < 0.8 + println(io, " <education>", rand(rng, EDUCATIONS), "</education>") + end + if rand(rng) < 0.7 + println(io, " <gender>", rand(rng, GENDERS), "</gender>") + end + println(io, " <business>", rand_word(rng), "</business>") + if rand(rng) < 0.8 + println(io, " <age>", rand(rng, 18:85), "</age>") + end + println(io, " </profile>") + end + if n_open > 0 && rand(rng) < 0.3 + println(io, " <watches>") + for _ in 1:rand(rng, 1:5) + println(io, " <watch open_auction=\"", string("open_auction",rand(rng, 1:n_open)), "\"/>") + end + println(io, " </watches>") + end + println(io, " </person>") + end + println(io, " </people>") +end + +function write_open_auctions(rng, io, n, n_items, n_people) + println(io, " <open_auctions>") + for i in 1:n + println(io, " <open_auction id=\"", string("open_auction",i), "\">") + println(io, " <initial>", 
rand_price(rng), "</initial>") + if rand(rng) < 0.5 + println(io, " <reserve>", rand_price(rng), "</reserve>") + end + for _ in 1:rand(rng, 0:12) + println(io, " <bidder>") + println(io, " <date>", rand_date(rng), "</date>") + println(io, " <time>", rand_time(rng), "</time>") + println(io, " <personref person=\"", string("person",rand(rng, 1:n_people)), "\"/>") + println(io, " <increase>", rand_price(rng), "</increase>") + println(io, " </bidder>") + end + println(io, " <current>", rand_price(rng), "</current>") + if rand(rng) < 0.3 + println(io, " <privacy>", rand(rng, ["Yes", "No"]), "</privacy>") + end + println(io, " <itemref item=\"", string("item",rand(rng, 1:n_items)), "\"/>") + println(io, " <seller person=\"", string("person",rand(rng, 1:n_people)), "\"/>") + write_annotation(rng, io, " ", n_people) + println(io, " <quantity>", rand(rng, 1:10), "</quantity>") + println(io, " <type>", rand(rng, ["Regular", "Featured"]), "</type>") + println(io, " <interval>") + println(io, " <start>", rand_date(rng), "</start>") + println(io, " <end>", rand_date(rng), "</end>") + println(io, " </interval>") + println(io, " </open_auction>") + end + println(io, " </open_auctions>") +end + +function write_closed_auctions(rng, io, n, n_open, n_items, n_people) + println(io, " <closed_auctions>") + for i in 1:n + println(io, " <closed_auction>") + println(io, " <seller person=\"", string("person",rand(rng, 1:n_people)), "\"/>") + println(io, " <buyer person=\"", string("person",rand(rng, 1:n_people)), "\"/>") + # Use item IDs that don't overlap with open auctions + item_id = n_open + i + item_id = item_id <= n_items ? 
item_id : rand(rng, 1:n_items) + println(io, " <itemref item=\"", string("item",item_id), "\"/>") + println(io, " <price>", rand_price(rng), "</price>") + println(io, " <date>", rand_date(rng), "</date>") + println(io, " <quantity>", rand(rng, 1:10), "</quantity>") + println(io, " <type>", rand(rng, ["Regular", "Featured"]), "</type>") + if rand(rng) < 0.7 + write_annotation(rng, io, " ", n_people) + end + println(io, " </closed_auction>") + end + println(io, " </closed_auctions>") +end + +#-----------------------------------------------------------------# Main entry points +""" + generate_xmark([io_or_filename], factor; seed=42) + +Generate an XMark-style auction XML document. `factor` scales all entity counts linearly. + +Approximate output sizes (may vary slightly): +- `factor=0.1` → ~1.4 MB +- `factor=1.0` → ~14 MB +- `factor=2.0` → ~27 MB +- `factor=5.0` → ~68 MB +""" +function generate_xmark(io::IO, factor::Real; seed::Int=42) + factor > 0 || throw(ArgumentError("factor must be positive, got $factor")) + rng = Xoshiro(seed) + + n_per_region = max(1, round(Int, 500 * factor)) + n_people = max(1, round(Int, 5000 * factor)) + n_categories = max(1, round(Int, 200 * factor)) + n_open = max(1, round(Int, 2000 * factor)) + n_closed = max(1, round(Int, 1500 * factor)) + n_edges = max(1, round(Int, 1000 * factor)) + n_items = n_per_region * 6 + + # Clamp auctions to available items + n_open = min(n_open, n_items) + n_closed = min(n_closed, max(1, n_items - n_open)) + + println(io, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>") + println(io, "<site>") + + # Regions with items + println(io, " <regions>") + item_id = 0 + for region in REGIONS + println(io, " <", region, ">") + for _ in 1:n_per_region + item_id += 1 + write_item(rng, io, item_id, n_categories) + end + println(io, " </", region, ">") + end + println(io, " </regions>") + + write_categories(rng, io, n_categories) + write_catgraph(rng, io, n_edges, n_categories) + write_people(rng, io, n_people, n_categories, 
n_open) + write_open_auctions(rng, io, n_open, n_items, n_people) + write_closed_auctions(rng, io, n_closed, n_open, n_items, n_people) + + println(io, "</site>") + nothing +end + +function generate_xmark(filename::AbstractString, factor::Real; seed::Int=42) + open(filename, "w") do io + generate_xmark(io, factor; seed) + end + filename +end + +function generate_xmark(factor::Real; seed::Int=42) + io = IOBuffer() + generate_xmark(io, factor; seed) + String(take!(io)) +end + +end # module diff --git a/benchmarks/benchmarks.jl b/benchmarks/benchmarks.jl new file mode 100644 index 0000000..16096e8 --- /dev/null +++ b/benchmarks/benchmarks.jl @@ -0,0 +1,161 @@ +using XML +using XML: Element, nodetype, tag, children +using EzXML: EzXML +using XMLDict: XMLDict +using LightXML: LightXML +using BenchmarkTools +using DataFrames +using InteractiveUtils + +include("XMarkGenerator.jl") +using .XMarkGenerator + +BenchmarkTools.DEFAULT_PARAMETERS.seconds = 10 +BenchmarkTools.DEFAULT_PARAMETERS.samples = 20000 + +#-----------------------------------------------------------------------------# Test data +# Small file (~120 lines) +small_file = joinpath(@__DIR__, "..", "test", "data", "books.xml") +small_xml = read(small_file, String) + +# Medium file (generated XMark auction XML, ~14 MB) +medium_file = joinpath(@__DIR__, "data", "xmark.xml") +if !isfile(medium_file) + mkpath(dirname(medium_file)) + @info "Generating XMark benchmark XML..." + generate_xmark(medium_file, 1.0) +end +medium_xml = read(medium_file, String) + +df = DataFrame(kind=String[], name=String[], bench=BenchmarkTools.Trial[]) + +macro add_benchmark(kind, name, expr...) + esc(:(let + @info string($kind, " - ", $name) + bench = @benchmark $(expr...) 
+ push!(df, (; kind=$kind, name=$name, bench)) + end)) +end + +const SSNode = Node{SubString{String}} + +#-----------------------------------------------------------------------------# Parse (small) +@add_benchmark "Parse (small)" "XML.jl" parse($small_xml, Node) +@add_benchmark "Parse (small)" "XML.jl (SS)" parse($small_xml, SSNode) +@add_benchmark "Parse (small)" "EzXML" EzXML.parsexml($small_xml) +@add_benchmark "Parse (small)" "LightXML" LightXML.parse_string($small_xml) +@add_benchmark "Parse (small)" "XMLDict" XMLDict.xml_dict($small_xml) + +#-----------------------------------------------------------------------------# Parse (medium) +@add_benchmark "Parse (medium)" "XML.jl" parse($medium_xml, Node) +@add_benchmark "Parse (medium)" "XML.jl (SS)" parse($medium_xml, SSNode) +@add_benchmark "Parse (medium)" "EzXML" EzXML.parsexml($medium_xml) +@add_benchmark "Parse (medium)" "LightXML" LightXML.parse_string($medium_xml) +@add_benchmark "Parse (medium)" "XMLDict" XMLDict.xml_dict($medium_xml) + +#-----------------------------------------------------------------------------# Write (small) +@add_benchmark "Write (small)" "XML.jl" XML.write(o) setup=(o = parse(small_xml, Node)) +@add_benchmark "Write (small)" "EzXML" sprint(print, o) setup=(o = EzXML.parsexml(small_xml)) +@add_benchmark "Write (small)" "LightXML" LightXML.save_file(o, f) setup=(o = LightXML.parse_string(small_xml); f = tempname()) teardown=(LightXML.free(o); rm(f, force=true)) + +#-----------------------------------------------------------------------------# Write (medium) +@add_benchmark "Write (medium)" "XML.jl" XML.write(o) setup=(o = parse(medium_xml, Node)) +@add_benchmark "Write (medium)" "EzXML" sprint(print, o) setup=(o = EzXML.parsexml(medium_xml)) +@add_benchmark "Write (medium)" "LightXML" LightXML.save_file(o, f) setup=(o = LightXML.parse_string(medium_xml); f = tempname()) teardown=(LightXML.free(o); rm(f, force=true)) + 
+#-----------------------------------------------------------------------------# Read from file +@add_benchmark "Read file" "XML.jl" read($medium_file, Node) +@add_benchmark "Read file" "EzXML" EzXML.readxml($medium_file) +@add_benchmark "Read file" "LightXML" LightXML.parse_file($medium_file) + +#-----------------------------------------------------------------------------# Collect element tags +function xml_collect_tags(node) + out = String[] + _xml_collect_tags!(out, node) + out +end +function _xml_collect_tags!(out, node) + for c in children(node) + if nodetype(c) === Element + push!(out, tag(c)) + _xml_collect_tags!(out, c) + end + end +end + +function ezxml_collect_tags(node::EzXML.Node) + out = String[] + _ezxml_collect_tags!(out, node) + out +end +function _ezxml_collect_tags!(out, node::EzXML.Node) + for child in EzXML.eachelement(node) + push!(out, child.name) + _ezxml_collect_tags!(out, child) + end +end + +function lightxml_collect_tags(root::LightXML.XMLElement) + out = String[] + _lightxml_collect_tags!(out, root) + out +end +function _lightxml_collect_tags!(out, el::LightXML.XMLElement) + for child in LightXML.child_elements(el) + push!(out, LightXML.name(child)) + _lightxml_collect_tags!(out, child) + end +end + +@add_benchmark "Collect tags (small)" "XML.jl" xml_collect_tags(o) setup=(o = parse(small_xml, Node)) +@add_benchmark "Collect tags (small)" "EzXML" ezxml_collect_tags(o.root) setup=(o = EzXML.parsexml(small_xml)) +@add_benchmark "Collect tags (small)" "LightXML" lightxml_collect_tags(LightXML.root(o)) setup=(o = LightXML.parse_string(small_xml)) teardown=(LightXML.free(o)) + +@add_benchmark "Collect tags (medium)" "XML.jl" xml_collect_tags(o) setup=(o = parse(medium_xml, Node)) +@add_benchmark "Collect tags (medium)" "EzXML" ezxml_collect_tags(o.root) setup=(o = EzXML.parsexml(medium_xml)) +@add_benchmark "Collect tags (medium)" "LightXML" lightxml_collect_tags(LightXML.root(o)) setup=(o = LightXML.parse_string(medium_xml)) 
teardown=(LightXML.free(o)) + +#-----------------------------------------------------------------------------# Write benchmarks_results.md +_fmt_ms(t) = string(round(t, sigdigits=3), " ms") + +function _compare_indicator(xml_ms, other_ms) + ratio = xml_ms / other_ms + pct = abs(round((ratio - 1) * 100, digits=1)) + ratio > 1.05 ? "(XML.jl $(pct)% slower)" : ratio < 0.95 ? "(XML.jl $(pct)% faster)" : "(~same)" +end + +outfile = joinpath(@__DIR__, "benchmarks_results.md") +open(outfile, "w") do io + println(io, "# XML.jl Benchmarks\n") + println(io, "```") + for kind in unique(df.kind) + g = groupby(df, :kind) + haskey(g, (;kind)) || continue + sub = g[(;kind)] + println(io, kind) + # Find XML.jl baseline (first row starting with "XML.jl") + xml_row = findfirst(r -> startswith(r.name, "XML.jl") && !contains(r.name, "(SS)"), eachrow(sub)) + xml_ms = isnothing(xml_row) ? nothing : median(sub[xml_row, :bench]).time / 1e6 + for row in eachrow(sub) + ms = median(row.bench).time / 1e6 + indicator = "" + if !isnothing(xml_ms) && !startswith(row.name, "XML.jl") + indicator = " " * _compare_indicator(xml_ms, ms) + end + println(io, "\t", rpad(row.name, 16), lpad(_fmt_ms(ms), 12), indicator) + end + println(io) + end + println(io, "```") + + println(io, "\n```julia") + println(io, "versioninfo()") + buf = IOBuffer() + InteractiveUtils.versioninfo(buf) + for line in eachline(IOBuffer(take!(buf))) + println(io, "# ", line) + end + println(io, "```") +end + +println("Results written to $outfile") diff --git a/benchmarks/benchmarks_results.md b/benchmarks/benchmarks_results.md new file mode 100644 index 0000000..d03ae50 --- /dev/null +++ b/benchmarks/benchmarks_results.md @@ -0,0 +1,60 @@ +# XML.jl Benchmarks + +``` +Parse (small) + XML.jl 0.0339 ms + XML.jl (SS) 0.0301 ms + EzXML 0.0254 ms (33.5% slower) + LightXML 0.0282 ms (20.1% slower) + XMLDict 0.204 ms (83.4% faster) + +Parse (medium) + XML.jl 170.0 ms + XML.jl (SS) 154.0 ms + EzXML 91.0 ms (87.0% slower) + LightXML 92.8 ms 
(83.4% slower) + XMLDict 623.0 ms (72.7% faster) + +Write (small) + XML.jl 0.0179 ms + EzXML 0.0107 ms (68.0% slower) + LightXML 0.0926 ms (80.6% faster) + +Write (medium) + XML.jl 81.2 ms + EzXML 73.2 ms (11.0% slower) + LightXML 55.1 ms (47.5% slower) + +Read file + XML.jl 180.0 ms + EzXML 129.0 ms (39.9% slower) + LightXML 104.0 ms (73.4% slower) + +Collect tags (small) + XML.jl 0.000597 ms + EzXML 0.00219 ms (72.7% faster) + LightXML 0.00371 ms (83.9% faster) + +Collect tags (medium) + XML.jl 12.2 ms + EzXML 28.2 ms (56.9% faster) + LightXML 25.7 ms (52.6% faster) + +``` + +```julia +versioninfo() +# Julia Version 1.12.5 +# Commit 5fe89b8ddc1 (2026-02-09 16:05 UTC) +# Build Info: +# Official https://julialang.org release +# Platform Info: +# OS: macOS (arm64-apple-darwin24.0.0) +# CPU: 10 × Apple M1 Pro +# WORD_SIZE: 64 +# LLVM: libLLVM-18.1.7 (ORCJIT, apple-m1) +# GC: Built with stock GC +# Threads: 8 default, 1 interactive, 8 GC (on 8 virtual cores) +# Environment: +# JULIA_NUM_THREADS = auto +``` diff --git a/benchmarks/compare.jl b/benchmarks/compare.jl new file mode 100644 index 0000000..7dca16e --- /dev/null +++ b/benchmarks/compare.jl @@ -0,0 +1,224 @@ +#= Compare current dev XML.jl against the last released version. + +Usage: + julia benchmarks/compare.jl [tag] + +`tag` defaults to the latest git tag (e.g. v0.3.8). + +This script: +1. Runs benchmarks using the current (dev) code +2. Checks out the release tag into a temp worktree +3. Runs the same benchmarks against that version +4. 
Prints a side-by-side comparison +=# + +using BenchmarkTools, Serialization, InteractiveUtils + +BenchmarkTools.DEFAULT_PARAMETERS.seconds = 5 +BenchmarkTools.DEFAULT_PARAMETERS.samples = 10000 + +const ROOT = dirname(@__DIR__) + +const RELEASE_TAG = if length(ARGS) >= 1 + ARGS[1] +else + tags = readlines(`git -C $ROOT tag --sort=version:refname`) + filter!(t -> startswith(t, "v"), tags) + last(tags) +end + +const SMALL_FILE = joinpath(ROOT, "test", "data", "books.xml") +const SMALL_XML = read(SMALL_FILE, String) + +# Generate medium file if needed +include(joinpath(ROOT, "benchmarks", "XMarkGenerator.jl")) +using .XMarkGenerator +const MEDIUM_FILE = joinpath(ROOT, "benchmarks", "data", "xmark.xml") +if !isfile(MEDIUM_FILE) + mkpath(dirname(MEDIUM_FILE)) + @info "Generating XMark benchmark XML..." + generate_xmark(MEDIUM_FILE, 1.0) +end +const MEDIUM_XML = read(MEDIUM_FILE, String) + +#-----------------------------------------------------------------------------# Helpers +function _collect_tags!(out, node) + for c in XML.children(node) + if XML.nodetype(c) === XML.Element + push!(out, XML.tag(c)) + _collect_tags!(out, c) + end + end +end + +function bench_collect_tags(node) + out = String[] + _collect_tags!(out, node) + out +end + +#-----------------------------------------------------------------------------# Run dev benchmarks +println("="^60) +println(" XML.jl Benchmark Comparison") +println(" Current (dev) vs $RELEASE_TAG") +println("="^60) +println() + +print("Running dev benchmarks...") +flush(stdout) + +using XML + +dev_results = Dict{String, BenchmarkTools.Trial}() + +const SSNode = Node{SubString{String}} + +dev_small = parse(SMALL_XML, Node) +dev_small_ss = parse(SMALL_XML, SSNode) +dev_medium = parse(MEDIUM_XML, Node) +dev_medium_ss = parse(MEDIUM_XML, SSNode) + +dev_results["Parse (small), String"] = @benchmark parse($SMALL_XML, Node) +dev_results["Parse (small), SubString"] = @benchmark parse($SMALL_XML, SSNode) +dev_results["Parse (medium), String"] = 
@benchmark parse($MEDIUM_XML, Node) +dev_results["Parse (medium), SubString"] = @benchmark parse($MEDIUM_XML, SSNode) +dev_results["Write (small)"] = @benchmark XML.write($dev_small) +dev_results["Write (medium)"] = @benchmark XML.write($dev_medium) +dev_results["Read file (medium), String"] = @benchmark read($MEDIUM_FILE, Node) +dev_results["Read file (medium), SubString"] = @benchmark parse(read($MEDIUM_FILE, String), SSNode) +dev_results["Collect tags (small), String"] = @benchmark bench_collect_tags($dev_small) +dev_results["Collect tags (small), SubString"] = @benchmark bench_collect_tags($dev_small_ss) +dev_results["Collect tags (medium), String"] = @benchmark bench_collect_tags($dev_medium) +dev_results["Collect tags (medium), SubString"] = @benchmark bench_collect_tags($dev_medium_ss) + +println(" done") + +#-----------------------------------------------------------------------------# Run release benchmarks via temp worktree + separate process +print("Setting up $RELEASE_TAG worktree...") +flush(stdout) + +worktree_dir = mktempdir() +run(pipeline(`git -C $ROOT worktree add $worktree_dir $RELEASE_TAG`, stdout=devnull, stderr=devnull)) +println(" done") + +release_results_file = joinpath(worktree_dir, "_results.jls") + +release_script = joinpath(worktree_dir, "_bench.jl") +write(release_script, """ +using Pkg +Pkg.activate(; temp=true) +Pkg.develop(path=$(repr(worktree_dir))) +Pkg.add("BenchmarkTools") +Pkg.add("Serialization") + +using BenchmarkTools, Serialization, XML + +BenchmarkTools.DEFAULT_PARAMETERS.seconds = 5 +BenchmarkTools.DEFAULT_PARAMETERS.samples = 10000 + +small_xml = read($(repr(SMALL_FILE)), String) +medium_xml = read($(repr(MEDIUM_FILE)), String) +results = Dict{String, BenchmarkTools.Trial}() + +results["Parse (small)"] = @benchmark parse(\$small_xml, Node) + +try + SSNode = Node{SubString{String}} + results["Parse (small, SS)"] = @benchmark parse(\$small_xml, SSNode) + results["Parse (medium, SS)"] = @benchmark parse(\$medium_xml, 
SSNode) +catch +end + +results["Parse (medium)"] = @benchmark parse(\$medium_xml, Node) + +small_node = parse(small_xml, Node) +medium_node = parse(medium_xml, Node) +results["Write (small)"] = @benchmark XML.write(\$small_node) +results["Write (medium)"] = @benchmark XML.write(\$medium_node) +results["Read file (medium)"] = @benchmark read($(repr(MEDIUM_FILE)), Node) + +function _collect_tags!(out, node) + for c in XML.children(node) + if XML.nodetype(c) === XML.Element + push!(out, XML.tag(c)) + _collect_tags!(out, c) + end + end +end +function bench_collect_tags(node) + out = String[] + _collect_tags!(out, node) + out +end +results["Collect tags (small)"] = @benchmark bench_collect_tags(\$small_node) +results["Collect tags (medium)"] = @benchmark bench_collect_tags(\$medium_node) + +serialize($(repr(release_results_file)), results) +""") + +print("Running $RELEASE_TAG benchmarks...") +flush(stdout) +run(pipeline(`julia $release_script`, stdout=devnull, stderr=devnull)) +release_results = deserialize(release_results_file) +println(" done") + +# Cleanup worktree +run(pipeline(`git -C $ROOT worktree remove --force $worktree_dir`, stdout=devnull, stderr=devnull)) + +#-----------------------------------------------------------------------------# Write compare_results.md +_fmt_ms(t) = string(round(t, sigdigits=3), " ms") + +function _compare_indicator(dev_ms, rel_ms) + change = (dev_ms / rel_ms - 1) * 100 + pct = abs(round(change, digits=1)) + change < -5 ? "($(pct)% faster)" : change > 5 ? 
"($(pct)% slower)" : "(~same)" +end + +groups = [ + ("Parse (small)", "Parse (small)", ["Parse (small), String", "Parse (small), SubString"]), + ("Parse (medium)", "Parse (medium)", ["Parse (medium), String", "Parse (medium), SubString"]), + ("Write (small)", "Write (small)", ["Write (small)"]), + ("Write (medium)", "Write (medium)", ["Write (medium)"]), + ("Read file (medium)", "Read file (medium)", ["Read file (medium), String", "Read file (medium), SubString"]), + ("Collect tags (small)", "Collect tags (small)", ["Collect tags (small), String", "Collect tags (small), SubString"]), + ("Collect tags (medium)","Collect tags (medium)", ["Collect tags (medium), String", "Collect tags (medium), SubString"]), +] + +outfile = joinpath(@__DIR__, "compare_results.md") +open(outfile, "w") do io + println(io, "# XML.jl Benchmark Comparison: dev vs $RELEASE_TAG\n") + println(io, "```") + for (title, rel_key, dev_keys) in groups + rel_ms = haskey(release_results, rel_key) ? median(release_results[rel_key]).time / 1e6 : nothing + any(k -> haskey(dev_results, k), dev_keys) || (isnothing(rel_ms) && continue) + + println(io, title) + if !isnothing(rel_ms) + println(io, "\t", rpad(RELEASE_TAG, 16), lpad(_fmt_ms(rel_ms), 12)) + end + for dk in dev_keys + haskey(dev_results, dk) || continue + dev_ms = median(dev_results[dk]).time / 1e6 + label = occursin(", ", dk) ? 
split(dk, ", "; limit=2)[2] : "dev" + ms_str = lpad(_fmt_ms(dev_ms), 12) + if isnothing(rel_ms) + println(io, "\t", rpad(label, 16), ms_str) + else + println(io, "\t", rpad(label, 16), ms_str, " ", _compare_indicator(dev_ms, rel_ms)) + end + end + println(io) + end + println(io, "```") + + println(io, "\n```julia") + println(io, "versioninfo()") + buf = IOBuffer() + InteractiveUtils.versioninfo(buf) + for line in eachline(IOBuffer(take!(buf))) + println(io, "# ", line) + end + println(io, "```") +end + +println("Results written to $outfile") diff --git a/benchmarks/compare_results.md b/benchmarks/compare_results.md new file mode 100644 index 0000000..135035e --- /dev/null +++ b/benchmarks/compare_results.md @@ -0,0 +1,54 @@ +# XML.jl Benchmark Comparison: dev vs v0.3.8 + +``` +Parse (small) + v0.3.8 0.116 ms + String 0.0351 ms (69.7% faster) + SubString 0.0309 ms (73.4% faster) + +Parse (medium) + v0.3.8 736.0 ms + String 170.0 ms (76.9% faster) + SubString 162.0 ms (77.9% faster) + +Write (small) + v0.3.8 0.0257 ms + dev 0.0212 ms (17.7% faster) + +Write (medium) + v0.3.8 154.0 ms + dev 84.6 ms (44.9% faster) + +Read file (medium) + v0.3.8 714.0 ms + String 177.0 ms (75.2% faster) + SubString 171.0 ms (76.1% faster) + +Collect tags (small) + v0.3.8 0.000527 ms + String 0.000614 ms (16.5% slower) + SubString 0.00177 ms (235.1% slower) + +Collect tags (medium) + v0.3.8 25.0 ms + String 10.9 ms (56.4% faster) + SubString 16.0 ms (36.0% faster) + +``` + +```julia +versioninfo() +# Julia Version 1.12.5 +# Commit 5fe89b8ddc1 (2026-02-09 16:05 UTC) +# Build Info: +# Official https://julialang.org release +# Platform Info: +# OS: macOS (arm64-apple-darwin24.0.0) +# CPU: 10 × Apple M1 Pro +# WORD_SIZE: 64 +# LLVM: libLLVM-18.1.7 (ORCJIT, apple-m1) +# GC: Built with stock GC +# Threads: 8 default, 1 interactive, 8 GC (on 8 virtual cores) +# Environment: +# JULIA_NUM_THREADS = auto +``` diff --git a/benchmarks/dict_benchmarks.jl b/benchmarks/dict_benchmarks.jl new file mode 
100644 index 0000000..7dd90a3 --- /dev/null +++ b/benchmarks/dict_benchmarks.jl @@ -0,0 +1,71 @@ +using XML +using BenchmarkTools + +BenchmarkTools.DEFAULT_PARAMETERS.seconds = 5 + +#-----------------------------------------------------------------------------# Setup +sizes = [2, 5, 10, 20] + +function make_xml(n::Int) + attrs = join((" attr$i=\"value$i\"" for i in 1:n)) + "<root$attrs/>" +end + +function make_pairs(n::Int) + Pair{String,String}["attr$i" => "value$i" for i in 1:n] +end + +pt(t) = BenchmarkTools.prettytime(t) + +function printrow(n, op, t_dict, t_attr) + pct = round(100 * (t_dict - t_attr) / t_dict, digits=1) + label = pct > 0 ? "$(pct)% faster" : "$(-pct)% slower" + println(rpad("$n attrs", 10), " | ", rpad(op, 22), " | ", + rpad("Dict $(pt(t_dict))", 22), " | ", + rpad("Attributes $(pt(t_attr))", 26), " | ", label) +end + +#-----------------------------------------------------------------------------# Benchmarks +println("=" ^ 110) +println(" Attributes vs Dict Benchmarks") +println("=" ^ 110) +println(rpad("Size", 10), " | ", rpad("Operation", 22), " | ", + rpad("Dict", 22), " | ", rpad("Attributes", 26), " | Change") +println("-" ^ 110) + +for n in sizes + pairs = make_pairs(n) + d = Dict(pairs) + a = XML.Attributes(pairs) + key_mid = "attr$(n ÷ 2 + 1)" + key_last = "attr$n" + + tests = [ + ("construct", () -> @benchmark(Dict($pairs)), () -> @benchmark(XML.Attributes($pairs))), + ("getindex [mid]", () -> @benchmark($d[$key_mid]), () -> @benchmark($a[$key_mid])), + ("getindex [last]", () -> @benchmark($d[$key_last]), () -> @benchmark($a[$key_last])), + ("get [miss]", () -> @benchmark(get($d, "nope", nothing)), () -> @benchmark(get($a, "nope", nothing))), + ("haskey [hit]", () -> @benchmark(haskey($d, $key_mid)), () -> @benchmark(haskey($a, $key_mid))), + ("keys", () -> @benchmark(collect(keys($d))), () -> @benchmark(keys($a))), + ("iterate", () -> @benchmark(sum(length(v) for (_,v) in $d)), () -> @benchmark(sum(length(v) for (_,v) in $a))), + ] + 
+ for (op, bench_dict, bench_attr) in tests + t_dict = median(bench_dict()).time + t_attr = median(bench_attr()).time + printrow(n, op, t_dict, t_attr) + end + println("-" ^ 110) +end + +#-----------------------------------------------------------------------------# End-to-end: attributes() call on parsed Node +println() +println(rpad("Size", 10), " | ", rpad("Operation", 22), " | Time") +println("-" ^ 50) +for n in sizes + doc = parse(make_xml(n), Node) + el = doc[1] + t = median(@benchmark(attributes($el))).time + println(rpad("$n attrs", 10), " | ", rpad("attributes(node)", 22), " | ", pt(t)) +end +println() diff --git a/benchmarks/suite.jl b/benchmarks/suite.jl deleted file mode 100644 index e06dc61..0000000 --- a/benchmarks/suite.jl +++ /dev/null @@ -1,74 +0,0 @@ -using Pkg -Pkg.activate(@__DIR__) - -using XML -using EzXML: EzXML -using XMLDict: XMLDict -using BenchmarkTools -using DataFrames -using UnicodePlots -using OrderedCollections: OrderedDict - - -BenchmarkTools.DEFAULT_PARAMETERS.seconds = 10 -BenchmarkTools.DEFAULT_PARAMETERS.samples = 20000 - - -# nasa.xml was downloaded from: -# http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/www/repository.html#nasa -file = joinpath(@__DIR__, "nasa.xml") - -df = DataFrame(kind=String[], name=String[], bench=BenchmarkTools.Trial[]) - -macro add_benchmark(kind, name, expr...) - esc(:(let - @info string($kind, " - ", $name) - bench = @benchmark $(expr...) 
- push!(df, (; kind=$kind, name=$name, bench)) - end)) -end - -#-----------------------------------------------------------------------------# Write -@add_benchmark "Write" "XML.write" XML.write($(tempname()), o) setup = (o = read(file, Node)) -@add_benchmark "Write" "EzXML.writexml" EzXML.write($(tempname()), o) setup = (o = EzXML.readxml(file)) - -#-----------------------------------------------------------------------------# Read -@add_benchmark "Read" "XML.LazyNode" read($file, LazyNode) -@add_benchmark "Read" "XML.Node" read($file, Node) -@add_benchmark "Read" "EzXML.readxml" EzXML.readxml($file) -@add_benchmark "Read" "XMLDict.xml_dict" XMLDict.xml_dict(read($file, String)) - -#-----------------------------------------------------------------------------# Lazy Iteration -@add_benchmark "Lazy Iteration" "LazyNode" for x in read($file, LazyNode); end -@add_benchmark "Lazy Iteration" "EzXML.StreamReader" (reader = open(EzXML.StreamReader, $file); for x in reader; end; close(reader)) - -#-----------------------------------------------------------------------------# Lazy Iteration: Collect Tags -@add_benchmark "Collect Tags" "LazyNode" [tag(x) for x in o] setup = (o = read(file, LazyNode)) -@add_benchmark "Collect Tags" "EzXML.StreamReader" [r.name for x in r if x == EzXML.READER_ELEMENT] setup=(r=open(EzXML.StreamReader, file)) teardown=(close(r)) - -function get_tags(o::EzXML.Node) - out = String[] - for node in EzXML.eachelement(o) - push!(out, node.name) - for tag in get_tags(node) - push!(out, tag) - end - end - out -end -@add_benchmark "Collect Tags" "EzXML.readxml" get_tags(o.root) setup=(o = EzXML.readxml(file)) - - -#-----------------------------------------------------------------------------# Plots -function plot(df, kind) - g = groupby(df, :kind) - sub = g[(;kind)] - x = map(row -> "$(row.name)", eachrow(sub)) - y = map(x -> median(x).time / 1000^2, sub.bench) - display(barplot(x, y, title = "$kind Time (ms)", border=:none, width=50)) -end - -plot(df, 
"Read") -plot(df, "Write") -plot(df, "Lazy Iteration") -plot(df, "Collect Tags") diff --git a/src/XML.jl b/src/XML.jl index 273bfda..f36905f 100644 --- a/src/XML.jl +++ b/src/XML.jl @@ -1,31 +1,61 @@ module XML -using Mmap -using OrderedCollections: OrderedDict - export - # Core Types: - Node, LazyNode, - # Interface: - children, nodetype, tag, attributes, value, is_simple, simplevalue, simple_value, - # Extended Interface for LazyNode: - parent, depth, next, prev + Node, LazyNode, NodeType, Attributes, + CData, Comment, Declaration, Document, DTD, Element, ProcessingInstruction, Text, + nodetype, tag, attributes, value, children, + is_simple, simple_value, + depth, siblings, + xpath, + h + +include("XMLTokenizer.jl") +using .XMLTokenizer: + tokenize, tag_name, attr_value, pi_target, TokenKind, Token, Tokenizer, TokenizerState, + TOKEN_TEXT, TOKEN_OPEN_TAG, TOKEN_CLOSE_TAG, TOKEN_TAG_CLOSE, TOKEN_SELF_CLOSE, + TOKEN_ATTR_NAME, TOKEN_ATTR_VALUE, + TOKEN_CDATA_OPEN, TOKEN_CDATA_CONTENT, TOKEN_CDATA_CLOSE, + TOKEN_COMMENT_OPEN, TOKEN_COMMENT_CONTENT, TOKEN_COMMENT_CLOSE, + TOKEN_PI_OPEN, TOKEN_PI_CONTENT, TOKEN_PI_CLOSE, + TOKEN_XML_DECL_OPEN, TOKEN_XML_DECL_CLOSE, + TOKEN_DOCTYPE_OPEN, TOKEN_DOCTYPE_CONTENT, TOKEN_DOCTYPE_CLOSE #-----------------------------------------------------------------------------# escape/unescape -const escape_chars = ('&' => "&amp;", '<' => "&lt;", '>' => "&gt;", "'" => "&apos;", '"' => "&quot;") -function unescape(x::AbstractString) - result = x - for (pat, r) in reverse.(escape_chars) - result = replace(result, pat => r) - end - return result +const escape_chars = ('&' => "&amp;", '<' => "&lt;", '>' => "&gt;", '\'' => "&apos;", '"' => "&quot;") + +""" + escape(x::AbstractString) -> String + +Escape the five XML predefined entities: `&` `<` `>` `'` `"`. + +!!! note "Changed in v0.4" + `escape` is no longer idempotent. In previous versions, already-escaped sequences like + `&amp;` were left untouched. Now every `&` is escaped, so `escape("&amp;")` produces + `"&amp;amp;"`. 
Call `escape` only on raw, unescaped text. +""" +escape(x::AbstractString) = replace(x, escape_chars...) + +# Replace a numeric character reference with its Unicode character. +# Numeric character references encode characters by code point: decimal (&#233; → é) or hex (&#xE9; → é). +function _unescape_charref(ref::AbstractString) + is_hex = length(ref) > 3 && ref[3] in ('x', 'X') + digits = SubString(ref, is_hex ? 4 : 3, length(ref) - 1) + cp = tryparse(UInt32, digits; base = is_hex ? 16 : 10) + !isnothing(cp) && isvalid(Char, cp) ? string(Char(cp)) : ref end -function escape(x::String) - result = replace(x, r"&(?!amp;|quot;|apos;|gt;|lt;)" => "&amp;") - for (pat, r) in escape_chars[2:end] - result = replace(result, pat => r) - end - return result + +""" + unescape(x::AbstractString) -> String + +Unescape XML entities in `x`: the five predefined entities (`&amp;` `&lt;` `&gt;` `&apos;` +`&quot;`) and numeric character references (`&#123;`, `&#xAB;`). Each reference is processed +exactly once (no double-unescaping). +""" +function unescape(x::AbstractString) + occursin('&', x) || return string(x) + s = string(x) + occursin("&#", s) && (s = replace(s, r"&#[xX]?[0-9a-fA-F]+;" => _unescape_charref)) + replace(s, "&lt;" => "<", "&gt;" => ">", "&apos;" => "'", "&quot;" => "\"", "&amp;" => "&") end #-----------------------------------------------------------------------------# NodeType @@ -34,9 +64,9 @@ end - Document # prolog & root Element - DTD # <!DOCTYPE ...> - Declaration # <?xml attributes... ?> - - ProcessingInstruction # <?NAME attributes... ?> + - ProcessingInstruction # <?NAME content... ?> - Comment # <!-- ... --> - - CData # <![CData[...]]> + - CData # <![CDATA[...]]> - Element # <NAME attributes... > children... </NAME> - Text # text @@ -45,381 +75,952 @@ NodeTypes can be used to construct XML.Nodes: Document(children...) 
DTD(value) Declaration(; attributes) - ProcessingInstruction(tag, attributes) + ProcessingInstruction(tag, content) Comment(text) CData(text) Element(tag, children...; attributes) Text(text) """ -@enum(NodeType, CData, Comment, Declaration, Document, DTD, Element, ProcessingInstruction, Text) +@enum NodeType::UInt8 CData Comment Declaration Document DTD Element ProcessingInstruction Text +#-----------------------------------------------------------------------------# Attributes +""" + Attributes{S} <: AbstractDict{S, S} -#-----------------------------------------------------------------------------# includes -include("raw.jl") -include("dtd.jl") +An ordered dictionary of XML attributes backed by a `Vector{Pair{S, S}}`. +Returned by [`attributes`](@ref). Preserves insertion order and supports the +full `AbstractDict` interface (`get`, `haskey`, `keys`, `values`, iteration, etc.). +""" +struct Attributes{S} <: AbstractDict{S, S} + entries::Vector{Pair{S, S}} +end -abstract type AbstractXMLNode end +Base.length(a::Attributes) = length(a.entries) +Base.iterate(a::Attributes, state...) = iterate(a.entries, state...) -#-----------------------------------------------------------------------------# LazyNode -""" - LazyNode(file::AbstractString) - LazyNode(data::XML.Raw) +function Base.getindex(a::Attributes, key::AbstractString) + for (k, v) in a.entries + k == key && return v + end + throw(KeyError(key)) +end -A Lazy representation of an XML node. 
-""" -mutable struct LazyNode <: AbstractXMLNode - raw::Raw - tag::Union{Nothing, String} - attributes::Union{Nothing, OrderedDict{String, String}} - value::Union{Nothing, String} +function Base.get(a::Attributes, key::AbstractString, default) + for (k, v) in a.entries + k == key && return v + end + default end -LazyNode(raw::Raw) = LazyNode(raw, nothing, nothing, nothing) -function Base.getproperty(o::LazyNode, x::Symbol) - x === :raw && return getfield(o, :raw) - x === :nodetype && return nodetype(o.raw) - x === :tag && return isnothing(getfield(o, x)) ? setfield!(o, x, tag(o.raw)) : getfield(o, x) - x === :attributes && return isnothing(getfield(o, x)) ? setfield!(o, x, attributes(o.raw)) : getfield(o, x) - x === :value && return isnothing(getfield(o, x)) ? setfield!(o, x, value(o.raw)) : getfield(o, x) - x === :depth && return depth(o.raw) - x === :children && return LazyNode.(children(o.raw)) - error("type LazyNode has no field $(x)") +function Base.haskey(a::Attributes, key::AbstractString) + any(p -> first(p) == key, a.entries) +end + +Base.keys(a::Attributes) = first.(a.entries) +Base.values(a::Attributes) = last.(a.entries) + +#-----------------------------------------------------------------------------# Node +struct Node{S} + nodetype::NodeType + tag::Union{Nothing, S} + attributes::Union{Nothing, Vector{Pair{S, S}}} + value::Union{Nothing, S} + children::Union{Nothing, Vector{Node{S}}} + + function Node{S}(nodetype::NodeType, tag, attributes, value, children) where {S} + if nodetype in (Text, Comment, CData, DTD) + isnothing(tag) && isnothing(attributes) && !isnothing(value) && isnothing(children) || + error("$nodetype nodes only accept a value.") + elseif nodetype === Element + !isnothing(tag) && isnothing(value) || + error("Element nodes require a tag and no value.") + elseif nodetype === Declaration + isnothing(tag) && isnothing(value) && isnothing(children) || + error("Declaration nodes only accept attributes.") + elseif nodetype === 
ProcessingInstruction + !isnothing(tag) && isnothing(attributes) && isnothing(children) || + error("ProcessingInstruction nodes require a tag and only accept a value.") + elseif nodetype === Document + isnothing(tag) && isnothing(attributes) && isnothing(value) || + error("Document nodes only accept children.") + end + new{S}(nodetype, tag, attributes, value, children) + end end -Base.propertynames(o::LazyNode) = (:raw, :nodetype, :tag, :attributes, :value, :depth, :children) -Base.show(io::IO, o::LazyNode) = _show_node(io, o) +#-----------------------------------------------------------------------------# interface +nodetype(o::Node) = o.nodetype +tag(o::Node) = o.tag + +""" + attributes(node::Node) -> Union{Nothing, Attributes{String}} + +Return the attributes of an `Element` or `Declaration` node as an [`Attributes`](@ref) dict, +or `nothing` if the node has no attributes. -Base.read(io::IO, ::Type{LazyNode}) = LazyNode(read(io, Raw)) -Base.read(filename::AbstractString, ::Type{LazyNode}) = LazyNode(read(filename, Raw)) -Base.parse(x::AbstractString, ::Type{LazyNode}) = LazyNode(parse(x, Raw)) +!!! note "Changed in v0.4" + In previous versions, `attributes` returned an `OrderedDict` from OrderedCollections.jl. + It now returns an [`Attributes`](@ref), an ordered `AbstractDict` backed by a + `Vector{Pair}`. +""" +attributes(o::Node) = isnothing(o.attributes) ? nothing : Attributes(o.attributes) + +value(o::Node) = o.value +children(o::Node) = something(o.children, ()) + +is_simple(o::Node) = o.nodetype === Element && + (isnothing(o.attributes) || isempty(o.attributes)) && + !isnothing(o.children) && length(o.children) == 1 && + o.children[1].nodetype in (Text, CData) + +simple_value(o::Node) = is_simple(o) ? 
o.children[1].value : + error("`simple_value` is only defined for simple nodes.") + +#-----------------------------------------------------------------------------# tree navigation + +""" + parent(child::Node, root::Node) -> Node -children(o::LazyNode) = LazyNode.(children(o.raw)) -parent(o::LazyNode) = LazyNode(parent(o.raw)) -depth(o::LazyNode) = depth(o.raw) +Return the parent of `child` within the tree rooted at `root`. -Base.IteratorSize(::Type{LazyNode}) = Base.SizeUnknown() -Base.eltype(::Type{LazyNode}) = LazyNode +Since `Node` does not store parent pointers, this performs a tree search from `root`. +Throws an error if `child` is not found or if `child === root`. +""" +function Base.parent(child::Node, root::Node) + child === root && error("Root node has no parent.") + result = _find_parent(child, root) + isnothing(result) && error("Node not found in tree.") + result +end -function Base.iterate(o::LazyNode, state=o) - n = next(state) - return isnothing(n) ? nothing : (n, n) +function _find_parent(child::Node, current::Node) + for c in children(current) + c === child && return current + result = _find_parent(child, c) + isnothing(result) || return result + end + nothing end -function next(o::LazyNode) - n = next(o.raw) - isnothing(n) && return nothing - n.type === RawElementClose ? next(LazyNode(n)) : LazyNode(n) +""" + depth(child::Node, root::Node) -> Int + +Return the depth of `child` within the tree rooted at `root` (root has depth 0). + +Since `Node` does not store parent pointers, this performs a tree search from `root`. +Throws an error if `child` is not found in the tree. +""" +function depth(child::Node, root::Node) + child === root && return 0 + result = _find_depth(child, root, 0) + isnothing(result) && error("Node not found in tree.") + result end -function prev(o::LazyNode) - n = prev(o.raw) - isnothing(n) && return nothing - n.type === RawElementClose ? 
prev(LazyNode(n)) : LazyNode(n) + +function _find_depth(child::Node, current::Node, d::Int) + for c in children(current) + c === child && return d + 1 + result = _find_depth(child, c, d + 1) + isnothing(result) || return result + end + nothing end -#-----------------------------------------------------------------------------# Node """ - Node(nodetype, tag, attributes, value, children) - Node(node::Node; kw...) # copy node with keyword overrides - Node(node::LazyNode) # un-lazy the LazyNode + siblings(child::Node, root::Node) -> Vector{Node} + +Return the siblings of `child` (other children of the same parent) within the tree rooted +at `root`. The returned vector does not include `child` itself. -A representation of an XML DOM node. For simpler construction, use `(::NodeType)(args...)` +Throws an error if `child` is the root or is not found in the tree. """ -struct Node <: AbstractXMLNode - nodetype::NodeType - tag::Union{Nothing, String} - attributes::Union{Nothing, OrderedDict{String, String}} - value::Union{Nothing, String} - children::Union{Nothing, Vector{Node}} - - function Node(nodetype::NodeType, tag=nothing, attributes=nothing, value=nothing, children=nothing) - new(nodetype, - isnothing(tag) ? nothing : string(tag), - isnothing(attributes) ? nothing : OrderedDict(string(k) => string(v) for (k, v) in pairs(attributes)), - isnothing(value) ? nothing : string(value), - isnothing(children) ? nothing : - children isa Node ? [children] : - children isa Vector{Node} ? children : - children isa Vector ? map(Node, children) : - children isa Tuple ? 
map(Node, collect(children)) : - [Node(children)] - ) +function siblings(child::Node, root::Node) + p = parent(child, root) + [c for c in children(p) if c !== child] +end + +include("xpath.jl") +include("lazynode.jl") + + +#-----------------------------------------------------------------------------# _to_node +_to_node(n::Node{String}) = n +_to_node(n::Node) = throw(ArgumentError("Expected Node{String}, got $(typeof(n))")) +_to_node(x) = Node{String}(Text, nothing, nothing, string(x), nothing) + +#-----------------------------------------------------------------------------# NodeType constructors +function (T::NodeType)(args...; attrs...) + S = String + if T in (Text, Comment, CData, DTD) + length(args) == 1 || error("$T nodes require exactly one value argument.") + !isempty(attrs) && error("$T nodes do not accept attributes.") + Node{S}(T, nothing, nothing, string(only(args)), nothing) + elseif T === Element + isempty(args) && error("Element nodes require at least a tag.") + t = string(first(args)) + a = Pair{S,S}[String(k) => String(v) for (k, v) in pairs(attrs)] + c = Node{S}[_to_node(x) for x in args[2:end]] + Node{S}(T, t, a, nothing, c) + elseif T === Declaration + !isempty(args) && error("Declaration nodes only accept keyword attributes.") + a = isempty(attrs) ? nothing : [String(k) => String(v) for (k, v) in pairs(attrs)] + Node{S}(T, nothing, a, nothing, nothing) + elseif T === ProcessingInstruction + length(args) >= 1 || error("ProcessingInstruction nodes require a target.") + length(args) <= 2 || error("ProcessingInstruction nodes accept a target and optional content.") + !isempty(attrs) && error("ProcessingInstruction nodes do not accept attributes.") + t = string(args[1]) + v = length(args) == 2 ? 
string(args[2]) : nothing + Node{S}(T, t, nothing, v, nothing) + elseif T === Document + !isempty(attrs) && error("Document nodes do not accept attributes.") + c = Node{S}[_to_node(x) for x in args] + Node{S}(T, nothing, nothing, nothing, c) end end -function Node(o::Node, x...; kw...) - attrs = !isnothing(kw) ? - merge( - OrderedDict(string(k) => string(v) for (k, v) in pairs(kw)), - isnothing(o.attributes) ? OrderedDict{String,String}() : o.attributes - ) : - o.attributes - children = isempty(x) ? o.children : vcat(isnothing(o.children) ? [] : o.children, collect(x)) - Node(o.nodetype, o.tag, attrs, o.value, children) +#-----------------------------------------------------------------------------# equality +_eq(::Nothing, ::Nothing) = true +_eq(::Nothing, b) = isempty(b) +_eq(a, ::Nothing) = isempty(a) +_eq(a, b) = a == b + +# Attribute equality is order-insensitive per XML spec +function _attrs_eq(a, b) + a_empty = isnothing(a) || isempty(a) + b_empty = isnothing(b) || isempty(b) + a_empty && b_empty && return true + (a_empty != b_empty) && return false + length(a) != length(b) && return false + for p in a + p in b || return false + end + true end -function Node(node::LazyNode) - nodetype = node.nodetype - tag = node.tag - attributes = node.attributes - value = node.value - c = XML.children(node) - Node(nodetype, tag, attributes, value, isempty(c) ? 
nothing : map(Node, c)) +function Base.:(==)(a::Node, b::Node) + a.nodetype == b.nodetype && + a.tag == b.tag && + _attrs_eq(a.attributes, b.attributes) && + a.value == b.value && + _eq(a.children, b.children) end -Node(data::Raw) = Node(LazyNode(data)) +#-----------------------------------------------------------------------------# indexing +Base.getindex(o::Node, i::Integer) = children(o)[i] +Base.getindex(o::Node, ::Colon) = children(o) +Base.lastindex(o::Node) = lastindex(children(o)) +Base.only(o::Node) = only(children(o)) +Base.length(o::Node) = length(children(o)) -# Anything that's not Vector{UInt8} or a (Lazy)Node is converted to a Text Node -Node(x) = Node(Text, nothing, nothing, string(x), nothing) +function Base.get(o::Node, key::AbstractString, default) + isnothing(o.attributes) && return default + for (k, v) in o.attributes + k == key && return v + end + default +end -h(tag::Union{Symbol, String}, children...; kw...) = Node(Element, tag, kw, nothing, children) -Base.getproperty(::typeof(h), tag::Symbol) = h(tag) -(o::Node)(children...; kw...) = Node(o, Node.(children)...; kw...) +const _MISSING_ATTR = gensym(:missing_attr) -# NOT in-place for Text Nodes -function escape!(o::Node, warn::Bool=true) - if o.nodetype == Text - warn && @warn "escape!() called on a Text Node creates a new node." - return Text(escape(o.value)) +function Base.getindex(o::Node, key::AbstractString) + val = get(o, key, _MISSING_ATTR) + val === _MISSING_ATTR && throw(KeyError(key)) + val +end + +function Base.haskey(o::Node, key::AbstractString) + get(o, key, _MISSING_ATTR) !== _MISSING_ATTR +end + +Base.keys(o::Node) = isnothing(o.attributes) ? 
() : first.(o.attributes) + +#-----------------------------------------------------------------------------# mutation +function Base.setindex!(o::Node, val, i::Integer) + isnothing(o.children) && error("Node has no children.") + o.children[i] = _to_node(val) +end + +function Base.setindex!(o::Node, val, key::AbstractString) + isnothing(o.attributes) && error("Node has no attributes.") + v = string(val) + for i in eachindex(o.attributes) + if first(o.attributes[i]) == key + o.attributes[i] = key => v + return v + end end - isnothing(o.children) && return o - map!(x -> escape!(x, false), o.children, o.children) - o + push!(o.attributes, key => v) + v end -function unescape!(o::Node, warn::Bool=true) - if o.nodetype == Text - warn && @warn "unescape!() called on a Text Node creates a new node." - return Text(unescape(o.value)) + +function Base.push!(a::Node, b) + isnothing(a.children) && error("Node does not accept children.") + push!(a.children, _to_node(b)) + a +end + +function Base.pushfirst!(a::Node, b) + isnothing(a.children) && error("Node does not accept children.") + pushfirst!(a.children, _to_node(b)) + a +end + +#-----------------------------------------------------------------------------# show (REPL) +function Base.show(io::IO, o::Node) + nt = o.nodetype + print(io, nt) + if nt === Text + print(io, ' ', repr(o.value)) + elseif nt === Element + print(io, " <", o.tag) + if !isnothing(o.attributes) + for (k, v) in o.attributes + print(io, ' ', k, '=', '"', v, '"') + end + end + print(io, '>') + n = length(children(o)) + n > 0 && print(io, n == 1 ? 
" (1 child)" : " ($n children)") + elseif nt === DTD + print(io, " <!DOCTYPE ", o.value, '>') + elseif nt === Declaration + print(io, " <?xml") + if !isnothing(o.attributes) + for (k, v) in o.attributes + print(io, ' ', k, '=', '"', v, '"') + end + end + print(io, "?>") + elseif nt === ProcessingInstruction + print(io, " <?", o.tag) + !isnothing(o.value) && print(io, ' ', o.value) + print(io, "?>") + elseif nt === Comment + print(io, " <!--", o.value, "-->") + elseif nt === CData + print(io, " <![CDATA[", o.value, "]]>") + elseif nt === Document + n = length(children(o)) + n > 0 && print(io, n == 1 ? " (1 child)" : " ($n children)") + end +end + +#-----------------------------------------------------------------------------# show (text/xml) + +# Write XML-escaped content directly to IO (single pass, no intermediate string) +function _write_escaped(io::IO, s::String) + start = 1 + i = 1 + n = ncodeunits(s) + @inbounds while i <= n + b = codeunit(s, i) + esc = if b == UInt8('&'); "&" + elseif b == UInt8('<'); "<" + elseif b == UInt8('>'); ">" + elseif b == UInt8('"'); """ + elseif b == UInt8('\''); "'" + else + i += 1 + continue + end + i > start && GC.@preserve s Base.unsafe_write(io, pointer(s, start), (i - start) % UInt) + print(io, esc) + i += 1 + start = i end - isnothing(o.children) && return o - map!(x -> unescape!(x, false), o.children, o.children) - o + start <= n && GC.@preserve s Base.unsafe_write(io, pointer(s, start), (n - start + 1) % UInt) + nothing end +# Cached indentation strings to avoid repeated allocation +const _MAX_CACHED_INDENT = 64 +const _INDENT_STRINGS = [" " ^ n for n in 0:_MAX_CACHED_INDENT] +@inline function _indent_str(n::Int) + 0 <= n <= _MAX_CACHED_INDENT && return @inbounds _INDENT_STRINGS[n + 1] + " " ^ n +end + +function _print_attrs(io::IO, attributes) + isnothing(attributes) && return + for (k, v) in attributes + print(io, ' ', k, "=\"") + _write_escaped(io, v) + print(io, '"') + end +end -Base.read(filename::AbstractString, 
::Type{Node}) = Node(read(filename, Raw)) -Base.read(io::IO, ::Type{Node}) = Node(read(io, Raw)) -Base.parse(x::AbstractString, ::Type{Node}) = Node(parse(x, Raw)) +function _write_xml(io::IO, node::Node, depth::Int=0, indent::Int=2, preserve::Bool=false) + pad = preserve ? "" : _indent_str(indent * depth) + nt = node.nodetype + if nt === Text + _write_escaped(io, node.value) + elseif nt === Element + # Check xml:space on this element + child_preserve = preserve + if !isnothing(node.attributes) + for (k, v) in node.attributes + k == "xml:space" && (child_preserve = v == "preserve") + end + end + print(io, pad, '<', node.tag) + _print_attrs(io, node.attributes) + ch = node.children + if isnothing(ch) || isempty(ch) + print(io, "/>") + elseif length(ch) == 1 && only(ch).nodetype === Text + print(io, '>') + _write_xml(io, only(ch), 0, 0, child_preserve) + print(io, "</", node.tag, '>') + else + child_preserve ? print(io, '>') : println(io, '>') + for child in ch + _write_xml(io, child, depth + 1, indent, child_preserve) + child_preserve || println(io) + end + print(io, child_preserve ? 
"" : pad, "</", node.tag, '>') + end + elseif nt === Declaration + print(io, pad, "<?xml") + _print_attrs(io, node.attributes) + print(io, "?>") + elseif nt === ProcessingInstruction + print(io, pad, "<?", node.tag) + isnothing(node.value) || print(io, ' ', node.value) + print(io, "?>") + elseif nt === Comment + print(io, pad, "<!--", node.value, "-->") + elseif nt === CData + print(io, pad, "<![CDATA[", node.value, "]]>") + elseif nt === DTD + print(io, pad, "<!DOCTYPE ", node.value, '>') + elseif nt === Document + ch = node.children + if !isnothing(ch) + for (i, child) in enumerate(ch) + _write_xml(io, child, 0, indent, preserve) + i < length(ch) && println(io) + end + end + end +end -Base.setindex!(o::Node, val, i::Integer) = o.children[i] = Node(val) -Base.push!(a::Node, b::Node) = push!(a.children, b) -Base.pushfirst!(a::Node, b::Node) = pushfirst!(a.children, b) +Base.show(io::IO, ::MIME"text/xml", node::Node) = _write_xml(io, node) -Base.setindex!(o::Node, val, key::AbstractString) = (o.attributes[key] = string(val)) -Base.getindex(o::Node, val::AbstractString) = o.attributes[val] -Base.haskey(o::Node, key::AbstractString) = isnothing(o.attributes) ? false : haskey(o.attributes, key) -Base.keys(o::Node) = isnothing(o.attributes) ? () : keys(o.attributes) +#-----------------------------------------------------------------------------# write / read +write(node::Node; indentsize::Int=2) = (io = IOBuffer(); _write_xml(io, node, 0, indentsize); String(take!(io))) +write(filename::AbstractString, node::Node; kw...) 
= open(io -> write(io, node; kw...), filename, "w") +write(io::IO, node::Node; indentsize::Int=2) = _write_xml(io, node, 0, indentsize) -Base.show(io::IO, o::Node) = _show_node(io, o) +Base.read(filename::AbstractString, ::Type{Node}) = parse(read(filename, String), Node) +Base.read(io::IO, ::Type{Node}) = parse(read(io, String), Node) -#-----------------------------------------------------------------------------# Node Constructors -function (T::NodeType)(args...; attr...) - if T === Document - !isempty(attr) && error("Document nodes do not have attributes.") - Node(T, nothing, nothing, nothing, args) - elseif T === DTD - !isempty(attr) && error("DTD nodes only accept a value.") - length(args) > 1 && error("DTD nodes only accept a value.") - Node(T, nothing, nothing, only(args)) - elseif T === Declaration - !isempty(args) && error("Declaration nodes only accept attributes") - Node(T, nothing, attr) - elseif T === ProcessingInstruction - length(args) == 1 || error("ProcessingInstruction nodes require a tag and attributes.") - Node(T, only(args), attr) - elseif T === Comment - !isempty(attr) && error("Comment nodes do not have attributes.") - length(args) > 1 && error("Comment nodes only accept a single input.") - Node(T, nothing, nothing, only(args)) - elseif T === CData - !isempty(attr) && error("CData nodes do not have attributes.") - length(args) > 1 && error("CData nodes only accept a single input.") - Node(T, nothing, nothing, only(args)) - elseif T === Text - !isempty(attr) && error("Text nodes do not have attributes.") - length(args) > 1 && error("Text nodes only accept a single input.") - Node(T, nothing, nothing, only(args)) - elseif T === Element - tag = first(args) - Node(T, tag, attr, nothing, args[2:end]) - else - error("Unreachable reached while trying to create a Node via (::NodeType)(args...; kw...).") +#-----------------------------------------------------------------------------# parse +Base.parse(::Type{Node}, xml::AbstractString) = parse(xml, 
Node) + +function Base.parse(xml::AbstractString, ::Type{Node}) + _parse(String(xml), String, unescape) +end + +function Base.parse(xml::AbstractString, ::Type{Node{SubString{String}}}) + _parse(String(xml), SubString{String}, identity) +end + +_to(::Type{String}, s::AbstractString) = String(s) +_to(::Type{SubString{String}}, s::SubString{String}) = s + +_nothingify(v::Vector) = isempty(v) ? nothing : v + +function _parse(xml::String, ::Type{S}, convert_text::F) where {S, F} + tags = S[] + attrs_stack = Vector{Pair{S,S}}[] + children_stack = Vector{Vector{Node{S}}}() + push!(children_stack, Node{S}[]) + + pending_attr_name = SubString(xml, 1, 0) + decl_attrs = nothing + pending_pi_tag = SubString(xml, 1, 0) + pending_pi_value = nothing + in_close_tag = false + + for token in tokenize(xml) + k = token.kind + + if k === TOKEN_TEXT + push!(last(children_stack), Node{S}(Text, nothing, nothing, convert_text(token.raw), nothing)) + + elseif k === TOKEN_OPEN_TAG + push!(tags, _to(S, tag_name(token))) + push!(attrs_stack, Pair{S,S}[]) + push!(children_stack, Node{S}[]) + + elseif k === TOKEN_SELF_CLOSE + t = pop!(tags) + a = pop!(attrs_stack) + pop!(children_stack) + push!(last(children_stack), Node{S}(Element, t, _nothingify(a), nothing, nothing)) + + elseif k === TOKEN_TAG_CLOSE + in_close_tag && (in_close_tag = false) + + elseif k === TOKEN_CLOSE_TAG + close_name = tag_name(token) + isempty(tags) && error("Closing tag </$close_name> with no matching open tag.") + t = pop!(tags) + t == close_name || error("Mismatched tags: expected </$t>, got </$close_name>.") + a = pop!(attrs_stack) + c = pop!(children_stack) + push!(last(children_stack), Node{S}(Element, t, _nothingify(a), nothing, isempty(c) ? 
nothing : c)) + in_close_tag = true + + elseif k === TOKEN_ATTR_NAME + pending_attr_name = token.raw + + elseif k === TOKEN_ATTR_VALUE + val = convert_text(attr_value(token)) + name = _to(S, pending_attr_name) + if decl_attrs !== nothing + any(p -> first(p) == name, decl_attrs) && error("Duplicate attribute: $name") + push!(decl_attrs, name => val) + elseif !isempty(attrs_stack) + any(p -> first(p) == name, last(attrs_stack)) && error("Duplicate attribute: $name") + push!(last(attrs_stack), name => val) + end + + elseif k === TOKEN_XML_DECL_OPEN + decl_attrs = Pair{S,S}[] + + elseif k === TOKEN_XML_DECL_CLOSE + a = isempty(decl_attrs) ? nothing : decl_attrs + push!(last(children_stack), Node{S}(Declaration, nothing, a, nothing, nothing)) + decl_attrs = nothing + + elseif k === TOKEN_COMMENT_CONTENT + push!(last(children_stack), Node{S}(Comment, nothing, nothing, _to(S, token.raw), nothing)) + + elseif k === TOKEN_CDATA_CONTENT + push!(last(children_stack), Node{S}(CData, nothing, nothing, _to(S, token.raw), nothing)) + + elseif k === TOKEN_DOCTYPE_CONTENT + push!(last(children_stack), Node{S}(DTD, nothing, nothing, _to(S, lstrip(token.raw)), nothing)) + + elseif k === TOKEN_PI_OPEN + pending_pi_tag = pi_target(token) + pending_pi_value = nothing + + elseif k === TOKEN_PI_CONTENT + content = strip(token.raw) + pending_pi_value = isempty(content) ? nothing : _to(S, content) + + elseif k === TOKEN_PI_CLOSE + push!(last(children_stack), Node{S}(ProcessingInstruction, _to(S, pending_pi_tag), nothing, pending_pi_value, nothing)) + end end + + !isempty(tags) && error("Unclosed tags: $(join(tags, ", "))") + doc_children = only(children_stack) + Node{S}(Document, nothing, nothing, nothing, isempty(doc_children) ? nothing : doc_children) end -#-----------------------------------------------------------------------------# !!! common !!! 
-# Everything below here is common to all data structures +#-----------------------------------------------------------------------------# h (HTML/XML element builder) +""" + h(tag, children...; attrs...) + h.tag(children...; attrs...) +Convenience constructor for `Element` nodes. -#-----------------------------------------------------------------------------# interface fallbacks -nodetype(o) = o.nodetype -tag(o) = o.tag -attributes(o) = o.attributes -value(o) = o.value -children(o::T) where {T} = isnothing(o.children) ? () : o.children + h("div", "hello"; class="main") # <div class="main">hello</div> + h.div("hello"; class="main") # same thing +""" +function h(tag::Union{Symbol, AbstractString}, children...; attrs...) + t = String(tag) + a = Pair{String,String}[String(k) => String(v) for (k, v) in pairs(attrs)] + c = Node{String}[_to_node(x) for x in children] + Node{String}(Element, t, a, nothing, c) +end -depth(o) = missing -parent(o) = missing -next(o) = missing -prev(o) = missing +Base.getproperty(::typeof(h), tag::Symbol) = h(tag) -is_simple(o) = nodetype(o) == Element && (isnothing(attributes(o)) || isempty(attributes(o))) && - length(children(o)) == 1 && nodetype(only(o)) in (Text, CData) +function (o::Node)(args...; attrs...) + o.nodetype === Element || error("Only Element nodes are callable.") + old_children = something(o.children, ()) + old_attrs = isnothing(o.attributes) ? () : (Symbol(k) => v for (k, v) in o.attributes) + h(o.tag, old_children..., args...; old_attrs..., attrs...) +end + +#-----------------------------------------------------------------------------# DTD parsing +struct ElementDecl + name::String + content::String # "EMPTY", "ANY", or content model like "(#PCDATA)" or "(a,b,c)*" +end -simple_value(o) = is_simple(o) ? value(only(o)) : error("`XML.simple_value` is only defined for simple nodes.") +struct AttDecl + element::String + name::String + type::String # "CDATA", "ID", "(val1|val2)", "NOTATION (a|b)", etc. 
+ default::String # "#REQUIRED", "#IMPLIED", "#FIXED \"val\"", or "\"val\"" +end -Base.@deprecate_binding simplevalue simple_value +struct EntityDecl + name::String + value::Union{Nothing, String} # replacement text (internal entities) + external_id::Union{Nothing, String} # "SYSTEM \"uri\"" or "PUBLIC \"pubid\" \"uri\"" + parameter::Bool +end -#-----------------------------------------------------------------------------# nodes_equal -function nodes_equal(a, b) - out = XML.tag(a) == XML.tag(b) - out &= XML.nodetype(a) == XML.nodetype(b) - out &= XML.attributes(a) == XML.attributes(b) - out &= XML.value(a) == XML.value(b) - out &= length(XML.children(a)) == length(XML.children(b)) - out &= all(nodes_equal(ai, bi) for (ai,bi) in zip(XML.children(a), XML.children(b))) - return out +struct NotationDecl + name::String + external_id::String end -Base.:(==)(a::AbstractXMLNode, b::AbstractXMLNode) = nodes_equal(a, b) +struct ParsedDTD + root::String + system_id::Union{Nothing, String} + public_id::Union{Nothing, String} + elements::Vector{ElementDecl} + attributes::Vector{AttDecl} + entities::Vector{EntityDecl} + notations::Vector{NotationDecl} +end -#-----------------------------------------------------------------------------# parse -Base.parse(::Type{T}, str::AbstractString) where {T <: AbstractXMLNode} = parse(str, T) +# DTD parsing helpers +@inline _dtd_is_name_char(c::Char) = + ('a' <= c <= 'z') || ('A' <= c <= 'Z') || ('0' <= c <= '9') || + c == '_' || c == '-' || c == '.' 
|| c == ':' -#-----------------------------------------------------------------------------# indexing -Base.getindex(o::Union{Raw, AbstractXMLNode}) = o -Base.getindex(o::Union{Raw, AbstractXMLNode}, i::Integer) = children(o)[i] -Base.getindex(o::Union{Raw, AbstractXMLNode}, ::Colon) = children(o) -Base.lastindex(o::Union{Raw, AbstractXMLNode}) = lastindex(children(o)) - -Base.only(o::Union{Raw, AbstractXMLNode}) = only(children(o)) - -Base.length(o::AbstractXMLNode) = length(children(o)) - -#-----------------------------------------------------------------------------# printing -function _show_node(io::IO, o) - printstyled(io, typeof(o), ' '; color=:light_black) - !ismissing(depth(o)) && printstyled(io, "(depth=", depth(o), ") ", color=:light_black) - printstyled(io, nodetype(o), ; color=:light_green) - if o.nodetype === Text - printstyled(io, ' ', repr(value(o))) - elseif o.nodetype === Element - printstyled(io, " <", tag(o), color=:light_cyan) - _print_attrs(io, o; color=:light_yellow) - printstyled(io, '>', color=:light_cyan) - _print_n_children(io, o) - elseif o.nodetype === DTD - printstyled(io, " <!DOCTYPE "; color=:light_cyan) - printstyled(io, value(o), color=:light_black) - printstyled(io, '>', color=:light_cyan) - elseif o.nodetype === Declaration - printstyled(io, " <?xml", color=:light_cyan) - _print_attrs(io, o; color=:light_yellow) - printstyled(io, "?>", color=:light_cyan) - elseif o.nodetype === ProcessingInstruction - printstyled(io, " <?", tag(o), color=:light_cyan) - _print_attrs(io, o; color=:light_yellow) - printstyled(io, "?>", color=:light_cyan) - elseif o.nodetype === Comment - printstyled(io, " <!--", color=:light_cyan) - printstyled(io, value(o), color=:light_black) - printstyled(io, "-->", color=:light_cyan) - elseif o.nodetype === CData - printstyled(io, " <![CData[", color=:light_cyan) - printstyled(io, value(o), color=:light_black) - printstyled(io, "]]>", color=:light_cyan) - elseif o.nodetype === Document - _print_n_children(io, o) 
- elseif o.nodetype === UNKNOWN - printstyled(io, "Unknown", color=:light_cyan) - _print_n_children(io, o) - else - error("Unreachable reached") +function _dtd_skip_ws(s, pos) + while pos <= ncodeunits(s) && isspace(s[pos]) + pos += 1 end + pos end -function _print_attrs(io::IO, o; color=:normal) - attr = attributes(o) - isnothing(attr) && return nothing - for (k,v) in attr - # printstyled(io, ' ', k, '=', '"', v, '"'; color) - print(io, ' ', k, '=', '"', v, '"') +function _dtd_read_name(s, pos) + pos = _dtd_skip_ws(s, pos) + start = pos + while pos <= ncodeunits(s) && _dtd_is_name_char(s[pos]) + pos += 1 end + start == pos && error("Expected name at position $pos in DTD") + SubString(s, start, pos - 1), pos end -function _print_n_children(io::IO, o::Node) - n = length(children(o)) - text = n == 0 ? "" : n == 1 ? " (1 child)" : " ($n children)" - printstyled(io, text, color=:light_black) -end -_print_n_children(io::IO, o) = nothing - -#-----------------------------------------------------------------------------# write_xml -write(x; kw...) = (io = IOBuffer(); write(io, x; kw...); String(take!(io))) - -write(filename::AbstractString, x; kw...) = open(io -> write(io, x; kw...), filename, "w") - -function write(io::IO, x, ctx::Vector{Bool}=[false]; indentsize::Int=2, depth::Int=1) - indent = ' ' ^ indentsize - nodetype = XML.nodetype(x) - tag = XML.tag(x) - value = XML.value(x) - children = XML.children(x) - - padding = indent ^ max(0, depth - 1) - !ctx[end] && print(io, padding) - - if nodetype === Text - print(io, value) - - elseif nodetype === Element - push!(ctx, ctx[end]) - update_ctx!(ctx, x) - print(io, '<', tag) - _print_attrs(io, x) - print(io, isempty(children) ? 
'/' : "", '>') - if !isempty(children) - if length(children) == 1 && XML.nodetype(only(children)) === Text - write(io, only(children), ctx; indentsize=0) - print(io, "</", tag, '>') - else - !ctx[end] && println(io) - foreach(children) do child - write(io, child, ctx; indentsize, depth=depth + 1) - !ctx[end] && println(io) - end - print(io, !ctx[end] ? padding : "", "</", tag, '>') + +function _dtd_read_quoted(s, pos) + pos = _dtd_skip_ws(s, pos) + q = s[pos] + (q == '"' || q == '\'') || error("Expected quoted string at position $pos in DTD") + pos += 1 + start = pos + while pos <= ncodeunits(s) && s[pos] != q + pos += 1 + end + val = SubString(s, start, pos - 1) + pos += 1 + val, pos +end + +function _dtd_read_parens(s, pos) + pos = _dtd_skip_ws(s, pos) + s[pos] == '(' || error("Expected '(' at position $pos in DTD") + depth = 1 + start = pos + pos += 1 + while pos <= ncodeunits(s) && depth > 0 + c = s[pos] + if c == '(' + depth += 1 + elseif c == ')' + depth -= 1 + elseif c == '"' || c == '\'' + pos += 1 + while pos <= ncodeunits(s) && s[pos] != c + pos += 1 end end - pop!(ctx) + pos += 1 + end + SubString(s, start, pos - 1), pos +end - elseif nodetype === DTD - print(io, "<!DOCTYPE ", value, '>') +function _dtd_skip_to_close(s, pos) + while pos <= ncodeunits(s) && s[pos] != '>' + c = s[pos] + if c == '"' || c == '\'' + pos += 1 + while pos <= ncodeunits(s) && s[pos] != c + pos += 1 + end + end + pos += 1 + end + pos <= ncodeunits(s) ? 
pos + 1 : pos +end - elseif nodetype === Declaration - print(io, "<?xml") - _print_attrs(io, x) - print(io, "?>") +function _dtd_parse_element(s, pos) + name, pos = _dtd_read_name(s, pos) + pos = _dtd_skip_ws(s, pos) + if s[pos] == '(' + content, pos = _dtd_read_parens(s, pos) + if pos <= ncodeunits(s) && s[pos] in ('*', '+', '?') + content = string(content, s[pos]) + pos += 1 + end + else + content, pos = _dtd_read_name(s, pos) + end + pos = _dtd_skip_to_close(s, pos) + ElementDecl(String(name), String(content)), pos +end - elseif nodetype === ProcessingInstruction - print(io, "<?", tag) - _print_attrs(io, x) - print(io, "?>") +function _dtd_parse_attlist(s, pos) + element, pos = _dtd_read_name(s, pos) + atts = AttDecl[] + while true + pos = _dtd_skip_ws(s, pos) + (pos > ncodeunits(s) || s[pos] == '>') && break + + name, pos = _dtd_read_name(s, pos) + pos = _dtd_skip_ws(s, pos) + + # Attribute type + if s[pos] == '(' + atype, pos = _dtd_read_parens(s, pos) + else + atype, pos = _dtd_read_name(s, pos) + if atype == "NOTATION" + pos = _dtd_skip_ws(s, pos) + parens, pos = _dtd_read_parens(s, pos) + atype = string("NOTATION ", parens) + end + end + pos = _dtd_skip_ws(s, pos) - elseif nodetype === Comment - print(io, "<!--", value, "-->") + # Default declaration + if s[pos] == '#' + pos += 1 + keyword, pos = _dtd_read_name(s, pos) + if keyword == "FIXED" + pos = _dtd_skip_ws(s, pos) + val, pos = _dtd_read_quoted(s, pos) + default = string("#FIXED \"", val, "\"") + else + default = string("#", keyword) + end + elseif s[pos] == '"' || s[pos] == '\'' + val, pos = _dtd_read_quoted(s, pos) + default = string("\"", val, "\"") + else + error("Expected default declaration at position $pos in DTD") + end + push!(atts, AttDecl(String(element), String(name), String(atype), default)) + end + pos <= ncodeunits(s) && s[pos] == '>' && (pos += 1) + atts, pos +end - elseif nodetype === CData - print(io, "<![CData[", value, "]]>") +function _dtd_parse_entity(s, pos) + pos = 
_dtd_skip_ws(s, pos)
+    parameter = false
+    if pos <= ncodeunits(s) && s[pos] == '%'
+        parameter = true
+        pos += 1
+    end
+    name, pos = _dtd_read_name(s, pos)
+    pos = _dtd_skip_ws(s, pos)
- elseif nodetype === Document
- foreach(children) do child
- write(io, child, ctx; indentsize)
- !ctx[end] && println(io)
+    value = nothing
+    external_id = nothing
+    if s[pos] == '"' || s[pos] == '\''
+        v, pos = _dtd_read_quoted(s, pos)
+        value = String(v)
+    else
+        keyword, pos = _dtd_read_name(s, pos)
+        pos = _dtd_skip_ws(s, pos)
+        if keyword == "SYSTEM"
+            uri, pos = _dtd_read_quoted(s, pos)
+            external_id = string("SYSTEM \"", uri, "\"")
+        elseif keyword == "PUBLIC"
+            pubid, pos = _dtd_read_quoted(s, pos)
+            pos = _dtd_skip_ws(s, pos)
+            uri, pos = _dtd_read_quoted(s, pos)
+            external_id = string("PUBLIC \"", pubid, "\" \"", uri, "\"")
+        else
+            error("Expected SYSTEM, PUBLIC, or quoted value in ENTITY declaration")
         end
+    end
+    pos = _dtd_skip_to_close(s, pos)
+    EntityDecl(String(name), value, external_id, parameter), pos
+end
+function _dtd_parse_notation(s, pos)  # parses what follows "<!NOTATION"; returns (NotationDecl, index past the declaration)
+    name, pos = _dtd_read_name(s, pos)
+    pos = _dtd_skip_ws(s, pos)
+    keyword, pos = _dtd_read_name(s, pos)
+    pos = _dtd_skip_ws(s, pos)
+    if keyword == "SYSTEM"
+        uri, pos = _dtd_read_quoted(s, pos)
+        external_id = string("SYSTEM \"", uri, "\"")
+    elseif keyword == "PUBLIC"
+        pubid, pos = _dtd_read_quoted(s, pos)
+        pos = _dtd_skip_ws(s, pos)
+        if pos <= ncodeunits(s) && (s[pos] == '"' || s[pos] == '\'')  # the system literal after a public id is optional here
+            uri, pos = _dtd_read_quoted(s, pos)
+            external_id = string("PUBLIC \"", pubid, "\" \"", uri, "\"")
+        else
+            external_id = string("PUBLIC \"", pubid, "\"")
+        end
     else
- error("Unreachable case reached during XML.write")
+        error("Expected SYSTEM or PUBLIC in NOTATION declaration")
+    end
+    pos = _dtd_skip_to_close(s, pos)
+    NotationDecl(String(name), external_id), pos
+end
+
+"""
+    parse_dtd(value::AbstractString) -> ParsedDTD
+    parse_dtd(node::Node) -> ParsedDTD
+
+Parse a DTD value string (from a `DTD` node) into structured declarations.
+"""
+function parse_dtd(value::AbstractString)
+    s = String(value)
+    pos = 1
+
+    root, pos = _dtd_read_name(s, pos)
+    pos = _dtd_skip_ws(s, pos)
+
+    # External ID: optional SYSTEM "uri" or PUBLIC "pubid" ["uri"]; any other name is left unconsumed
+    system_id = nothing
+    public_id = nothing
+    if pos <= ncodeunits(s) && _dtd_is_name_char(s[pos])
+        keyword, kpos = _dtd_read_name(s, pos)
+        if keyword == "SYSTEM"
+            pos = kpos
+            uri, pos = _dtd_read_quoted(s, pos)
+            system_id = String(uri)
+        elseif keyword == "PUBLIC"
+            pos = kpos
+            pubid, pos = _dtd_read_quoted(s, pos)
+            public_id = String(pubid)
+            pos = _dtd_skip_ws(s, pos)
+            if pos <= ncodeunits(s) && (s[pos] == '"' || s[pos] == '\'')
+                uri, pos = _dtd_read_quoted(s, pos)
+                system_id = String(uri)
+            end
+        end
+    end
+
+    elements = ElementDecl[]
+    attributes = AttDecl[]
+    entities = EntityDecl[]
+    notations = NotationDecl[]
+
+    # Internal subset: everything between '[' and the matching ']'
+    pos = _dtd_skip_ws(s, pos)
+    if pos <= ncodeunits(s) && s[pos] == '['
+        pos += 1
+        while pos <= ncodeunits(s)
+            pos = _dtd_skip_ws(s, pos)
+            pos > ncodeunits(s) && break
+            s[pos] == ']' && break
+
+            rest = SubString(s, pos)
+            if startswith(rest, "<!--")
+                i = findnext("-->", s, pos + 4)
+                isnothing(i) && error("Unterminated comment in DTD")
+                pos = last(i) + 1
+            elseif startswith(rest, "<?")
+                i = findnext("?>", s, pos + 2)
+                isnothing(i) && error("Unterminated PI in DTD")
+                pos = last(i) + 1
+            elseif startswith(rest, "<!ELEMENT")
+                elem, pos = _dtd_parse_element(s, pos + 9)
+                push!(elements, elem)
+            elseif startswith(rest, "<!ATTLIST")
+                atts, pos = _dtd_parse_attlist(s, pos + 9)
+                append!(attributes, atts)
+            elseif startswith(rest, "<!ENTITY")
+                ent, pos = _dtd_parse_entity(s, pos + 8)
+                push!(entities, ent)
+            elseif startswith(rest, "<!NOTATION")
+                not, pos = _dtd_parse_notation(s, pos + 10)
+                push!(notations, not)
+            elseif s[pos] == '%'
+                i = findnext(';', s, pos + 1)
+                isnothing(i) && error("Unterminated parameter entity reference in DTD")
+                pos = i + 1
+            else
+                pos = nextind(s, pos)  # skip one whole character: pos + 1 can land inside a multi-byte character and make the next s[pos] throw
+            end
+        end
+    end
+
+    ParsedDTD(String(root), system_id, public_id,
elements, attributes, entities, notations) +end + +function parse_dtd(node::Node) + node.nodetype === DTD || error("parse_dtd requires a DTD node.") + parse_dtd(node.value) +end + +#-----------------------------------------------------------------------------# deprecations +Base.@deprecate_binding simplevalue simple_value false + +# Removed types — informative errors +struct Raw + Raw(args...; kw...) = error(""" + `XML.Raw` has been removed in XML.jl v0.4. + Use `parse(str, Node)` or `read(filename, Node)` instead. + The streaming Raw/LazyNode API has been replaced by a token-based parser. + See `?XML.Node` for the new API.""") +end + +# Removed functions — informative errors +const _REMOVED_LAZYNODE_MSG = """ + This function was part of the LazyNode API, which has been removed in XML.jl v0.4. + Use `parse(str, Node)` to get a full DOM tree and navigate with `children`, `tag`, + `attributes`, `value`, and integer indexing (e.g. `node[1]`).""" + +for f in (:next, :prev) + msg = "`XML.$f` has been removed. $_REMOVED_LAZYNODE_MSG" + @eval function $f(o::Node) + Base.depwarn($msg, $(QuoteNode(f))) + error($msg) end +end + +# 1-arg parent/depth were part of LazyNode API; 2-arg versions are defined above +const _PARENT_1ARG_MSG = "`XML.parent(node)` (single-argument) has been removed. $_REMOVED_LAZYNODE_MSG\n Use `parent(child, root)` instead to search from a known root node." +function Base.parent(o::Node) + Base.depwarn(_PARENT_1ARG_MSG, :parent) + error(_PARENT_1ARG_MSG) +end + +const _DEPTH_1ARG_MSG = "`XML.depth(node)` (single-argument) has been removed. $_REMOVED_LAZYNODE_MSG\n Use `depth(child, root)` instead to search from a known root node." +function depth(o::Node) + Base.depwarn(_DEPTH_1ARG_MSG, :depth) + error(_DEPTH_1ARG_MSG) +end +function nodes_equal(a, b) + msg = """`XML.nodes_equal` has been removed in XML.jl v0.4. 
Use `==` instead: + a == b""" + Base.depwarn(msg, :nodes_equal) + error(msg) +end + +function escape!(o::Node, warn::Bool=true) + msg = """`XML.escape!` has been removed in XML.jl v0.4. + Text is now escaped automatically during `XML.write`.""" + Base.depwarn(msg, :escape!) + error(msg) +end + +function unescape!(o::Node, warn::Bool=true) + msg = """`XML.unescape!` has been removed in XML.jl v0.4. + Text is now unescaped automatically during `parse`.""" + Base.depwarn(msg, :unescape!) + error(msg) end end # module XML diff --git a/src/XMLTokenizer.jl b/src/XMLTokenizer.jl new file mode 100644 index 0000000..eee117b --- /dev/null +++ b/src/XMLTokenizer.jl @@ -0,0 +1,487 @@ +""" + XMLTokenizer + +A self-contained module for tokenizing XML documents into a fine-grained stream of tokens. + +# Usage + +```julia +using .XMLTokenizer: tokenize, tag_name, attr_value, pi_target + +for token in tokenize(\"\"\"<?xml version="1.0"?><root attr="val">text<!-- comment --></root>\"\"\") + println(token) +end +``` +""" +module XMLTokenizer + + +#-----------------------------------------------------------------------# TokenKind +@enum TokenKind::UInt8 begin + # Character data + TOKEN_TEXT # text content between markup + + # Element tags + TOKEN_OPEN_TAG # <name + TOKEN_CLOSE_TAG # </name + TOKEN_TAG_CLOSE # > + TOKEN_SELF_CLOSE # /> + TOKEN_ATTR_NAME # attribute name + TOKEN_ATTR_VALUE # "value" or 'value' (with quotes in raw) + + # CDATA sections + TOKEN_CDATA_OPEN # <![CDATA[ + TOKEN_CDATA_CONTENT # raw text content + TOKEN_CDATA_CLOSE # ]]> + + # Comments + TOKEN_COMMENT_OPEN # <!-- + TOKEN_COMMENT_CONTENT # comment text + TOKEN_COMMENT_CLOSE # --> + + # Processing instructions + TOKEN_PI_OPEN # <?target (includes target name) + TOKEN_PI_CONTENT # PI body text + TOKEN_PI_CLOSE # ?> + + # XML declaration (<?xml ...?>) + TOKEN_XML_DECL_OPEN # <?xml + TOKEN_XML_DECL_CLOSE # ?> + # (reuses TOKEN_ATTR_NAME / TOKEN_ATTR_VALUE for pseudo-attributes) + + # DOCTYPE + TOKEN_DOCTYPE_OPEN # 
<!DOCTYPE (or other <! declarations) + TOKEN_DOCTYPE_CONTENT # declaration body + TOKEN_DOCTYPE_CLOSE # > +end + +#-----------------------------------------------------------------------# Token +struct Token{S <: AbstractString} + kind::TokenKind + raw::SubString{S} +end + +function Base.show(io::IO, t::Token) + print(io, t.kind, ": ", repr(String(t.raw))) +end + +#-----------------------------------------------------------------------# Tokenizer mode +@enum _Mode::UInt8 begin + _M_DEFAULT # normal content mode + _M_TAG # inside open tag, reading attributes + _M_TAG_VALUE # expecting quoted attribute value + _M_CLOSE_TAG # inside close tag, expecting > + _M_XML_DECL # inside <?xml, reading pseudo-attributes + _M_XML_DECL_VALUE # expecting quoted attr value in xml decl + _M_COMMENT # after <!--, reading content + _M_CDATA # after <![CDATA[, reading content + _M_PI # after <?target, reading content + _M_DOCTYPE # after <!DOCTYPE, reading content +end + +#-----------------------------------------------------------------------# TokenizerState (immutable, SROA-friendly) +struct TokenizerState{S <: AbstractString} + pos::Int + mode::_Mode + pending::Token{S} +end + +@inline _no_token(s::AbstractString) = Token(TOKEN_TEXT, @inbounds SubString(s, 1, 0)) +@inline _has_pending(st::TokenizerState) = !isempty(st.pending.raw) + +@inline function _init_state(data::AbstractString, pos::Int=1) + TokenizerState(pos, _M_DEFAULT, _no_token(data)) +end + +#-----------------------------------------------------------------------# Tokenizer (immutable iterator) +""" + tokenize(xml::AbstractString) -> Tokenizer + +Return a lazy iterator of `Token`s over the XML string `xml`. 
+"""
+struct Tokenizer{S <: AbstractString}
+    data::S
+    start::Int
+end
+
+tokenize(xml::AbstractString) = Tokenizer(xml, 1)
+tokenize(xml::AbstractString, pos::Int) = Iterators.Stateful(Tokenizer(xml, pos))  # note: this method returns a Stateful wrapper, not a bare Tokenizer
+
+function Base.show(io::IO, t::Tokenizer)
+    n = ncodeunits(t.data)
+    print(io, "Tokenizer(")
+    t.start > 1 && print(io, t.start, "/")
+    print(io, Base.format_bytes(n), ")")
+end
+
+Base.IteratorSize(::Type{<:Tokenizer}) = Base.SizeUnknown()
+Base.eltype(::Type{Tokenizer{S}}) where {S} = Token{S}
+
+function Base.iterate(t::Tokenizer, st::TokenizerState=_init_state(t.data, t.start))
+    # _next_token already yields `nothing` or a `(token, state)` tuple, which is exactly the iteration protocol
+    _next_token(t.data, st)
+end
+
+#-----------------------------------------------------------------------# Internal helpers
+@inline _iseof(data, pos) = pos > ncodeunits(data)
+@inline _peek(data, pos) = @inbounds codeunit(data, pos)  # unchecked read: guard with _iseof / _canpeek first
+@inline _canpeek(data, pos, offset) = pos + offset <= ncodeunits(data)
+
+@inline function _is_name_byte(b::UInt8)
+    (UInt8('a') <= b <= UInt8('z')) || (UInt8('A') <= b <= UInt8('Z')) ||
+    (UInt8('0') <= b <= UInt8('9')) || b == UInt8('_') || b == UInt8('-') ||
+    b == UInt8('.') || b == UInt8(':')
+end
+
+@inline function _is_whitespace(b::UInt8)
+    b == UInt8(' ') || b == UInt8('\t') || b == UInt8('\n') || b == UInt8('\r')
+end
+
+@inline function _skip_whitespace(data, pos)
+    @inbounds while !_iseof(data, pos) && _is_whitespace(_peek(data, pos))
+        pos += 1
+    end
+    pos
+end
+
+function _skip_quoted(data, pos)  # pos is at the opening quote; returns the index just past the matching closing quote
+    q = @inbounds _peek(data, pos)
+    pos += 1
+    @inbounds while !_iseof(data, pos)
+        _peek(data, pos) == q && return pos + 1
+        pos += 1
+    end
+    error("Unterminated quoted string")
+end
+
+@noinline _err(msg, pos) = throw(ArgumentError("XML tokenizer error at position $pos: $msg"))
+
+#-----------------------------------------------------------------------# Main dispatch
+function _next_token(data, st::TokenizerState)  # returns (token, next_state), or nothing once the input is exhausted
+    if _has_pending(st)
+        return (st.pending, TokenizerState(st.pos, st.mode, _no_token(data)))
+    end
+    pos = st.pos
+    _iseof(data, pos) && st.mode == _M_DEFAULT && return nothing  # in any other mode the input is truncated: fall through so the mode reader raises its "unterminated ..." error
+
+    s = st.mode
+    if s == _M_DEFAULT
+        _peek(data, pos) == UInt8('<') ? _read_markup(data, pos) : _read_text(data, pos)
+    elseif s == _M_TAG || s == _M_XML_DECL
+        _read_in_tag(data, pos, s)
+    elseif s == _M_TAG_VALUE || s == _M_XML_DECL_VALUE
+        _read_attr_value(data, pos, s)
+    elseif s == _M_CLOSE_TAG
+        _read_close_tag_end(data, pos)
+    elseif s == _M_COMMENT
+        _read_comment_body(data, pos)
+    elseif s == _M_CDATA
+        _read_cdata_body(data, pos)
+    elseif s == _M_PI
+        _read_pi_body(data, pos)
+    else # _M_DOCTYPE
+        _read_doctype_body(data, pos)
+    end
+end
+
+#-----------------------------------------------------------------------# S_DEFAULT tokens
+function _read_text(data, pos)  # consumes everything up to (not including) the next '<' as one TOKEN_TEXT
+    start = pos
+    @inbounds while !_iseof(data, pos) && _peek(data, pos) != UInt8('<')
+        pos += 1
+    end
+    tok = Token(TOKEN_TEXT, @inbounds SubString(data, start, prevind(data, pos)))
+    (tok, TokenizerState(pos, _M_DEFAULT, _no_token(data)))
+end
+
+function _read_markup(data, pos)  # pos is at '<'; picks the reader from the byte that follows it
+    start = pos
+    pos += 1 # skip '<'
+    _iseof(data, pos) && _err("unexpected end of input after '<'", start)
+
+    b = _peek(data, pos)
+    if b == UInt8('!')
+        _read_bang(data, pos + 1, start)
+    elseif b == UInt8('?')
+        _read_pi_start(data, pos + 1, start)
+    elseif b == UInt8('/')
+        _read_close_tag_start(data, pos + 1, start)
+    else
+        _read_open_tag_start(data, pos, start)
+    end
+end
+
+#-----------------------------------------------------------------------# <!
dispatch
+function _read_bang(data, pos, start)
+    # Byte that follows "<!"; 0x00 stands in for end of input and matches neither special opener.
+    lead = _iseof(data, pos) ? 0x00 : _peek(data, pos)
+    if lead == UInt8('-')
+        # Comment opener "<!--": the second '-' must follow immediately.
+        second = pos + 1
+        (!_iseof(data, second) && _peek(data, second) == UInt8('-')) || _err("expected '<!--'", start)
+        cur = second + 1
+        opener = Token(TOKEN_COMMENT_OPEN, @inbounds SubString(data, start, cur - 1))
+        return (opener, TokenizerState(cur, _M_COMMENT, _no_token(data)))
+    elseif lead == UInt8('[')
+        # CDATA opener "<![CDATA[": match the keyword one byte at a time.
+        cur = pos + 1
+        for expected in codeunits("CDATA[")
+            _iseof(data, cur) && _err("unterminated CDATA", start)
+            _peek(data, cur) == expected || _err("invalid CDATA section", start)
+            cur += 1
+        end
+        opener = Token(TOKEN_CDATA_OPEN, @inbounds SubString(data, start, cur - 1))
+        return (opener, TokenizerState(cur, _M_CDATA, _no_token(data)))
+    end
+
+    # "<!DOCTYPE" or any other "<!" declaration: the opener token runs to the end of the name bytes.
+    stop = pos
+    @inbounds while !_iseof(data, stop) && _is_name_byte(_peek(data, stop))
+        stop += 1
+    end
+    (Token(TOKEN_DOCTYPE_OPEN, @inbounds SubString(data, start, stop - 1)), TokenizerState(stop, _M_DOCTYPE, _no_token(data)))
+end
+
+#-----------------------------------------------------------------------# <?
(PI / XML declaration) +function _read_pi_start(data, pos, start) + name_start = pos + @inbounds while !_iseof(data, pos) && _is_name_byte(_peek(data, pos)) + pos += 1 + end + + is_xml = (pos - name_start == 3) && + codeunit(data, name_start) == UInt8('x') && + codeunit(data, name_start + 1) == UInt8('m') && + codeunit(data, name_start + 2) == UInt8('l') + + if is_xml + tok = Token(TOKEN_XML_DECL_OPEN, @inbounds SubString(data, start, pos - 1)) + (tok, TokenizerState(pos, _M_XML_DECL, _no_token(data))) + else + tok = Token(TOKEN_PI_OPEN, @inbounds SubString(data, start, pos - 1)) + (tok, TokenizerState(pos, _M_PI, _no_token(data))) + end +end + +#-----------------------------------------------------------------------# Tags +function _read_open_tag_start(data, pos, start) + @inbounds while !_iseof(data, pos) && _is_name_byte(_peek(data, pos)) + pos += 1 + end + tok = Token(TOKEN_OPEN_TAG, @inbounds SubString(data, start, pos - 1)) + (tok, TokenizerState(pos, _M_TAG, _no_token(data))) +end + +function _read_close_tag_start(data, pos, start) + @inbounds while !_iseof(data, pos) && _is_name_byte(_peek(data, pos)) + pos += 1 + end + tok = Token(TOKEN_CLOSE_TAG, @inbounds SubString(data, start, pos - 1)) + (tok, TokenizerState(pos, _M_CLOSE_TAG, _no_token(data))) +end + +function _read_close_tag_end(data, pos) + pos = _skip_whitespace(data, pos) + _iseof(data, pos) && _err("unterminated close tag", pos) + _peek(data, pos) == UInt8('>') || _err("expected '>'", pos) + tok = Token(TOKEN_TAG_CLOSE, @inbounds SubString(data, pos, pos)) + (tok, TokenizerState(pos + 1, _M_DEFAULT, _no_token(data))) +end + +#-----------------------------------------------------------------------# Attributes (shared by S_TAG and S_XML_DECL) +function _read_in_tag(data, pos, mode) + pos = _skip_whitespace(data, pos) + _iseof(data, pos) && _err("unterminated tag", pos) + + b = _peek(data, pos) + is_decl = (mode == _M_XML_DECL) + + # Check for end delimiters + if is_decl + if b == UInt8('?') && 
_canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8('>') + tok = Token(TOKEN_XML_DECL_CLOSE, @inbounds SubString(data, pos, pos + 1)) + return (tok, TokenizerState(pos + 2, _M_DEFAULT, _no_token(data))) + end + else + if b == UInt8('>') + tok = Token(TOKEN_TAG_CLOSE, @inbounds SubString(data, pos, pos)) + return (tok, TokenizerState(pos + 1, _M_DEFAULT, _no_token(data))) + end + if b == UInt8('/') && _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8('>') + tok = Token(TOKEN_SELF_CLOSE, @inbounds SubString(data, pos, pos + 1)) + return (tok, TokenizerState(pos + 2, _M_DEFAULT, _no_token(data))) + end + end + + # Attribute name + name_start = pos + @inbounds while !_iseof(data, pos) && _is_name_byte(_peek(data, pos)) + pos += 1 + end + name_end = pos - 1 + name_start > name_end && _err("expected attribute name or tag close", pos) + + # Consume '=' and surrounding whitespace (not part of any token) + pos = _skip_whitespace(data, pos) + (!_iseof(data, pos) && _peek(data, pos) == UInt8('=')) || _err("expected '=' after attribute name", pos) + pos += 1 + pos = _skip_whitespace(data, pos) + + next_state = is_decl ? _M_XML_DECL_VALUE : _M_TAG_VALUE + tok = Token(TOKEN_ATTR_NAME, @inbounds SubString(data, name_start, name_end)) + (tok, TokenizerState(pos, next_state, _no_token(data))) +end + +function _read_attr_value(data, pos, mode) + _iseof(data, pos) && _err("expected attribute value", pos) + + q = _peek(data, pos) + (q == UInt8('"') || q == UInt8('\'')) || _err("expected quoted attribute value", pos) + + start = pos + pos += 1 # skip opening quote + @inbounds while !_iseof(data, pos) && _peek(data, pos) != q + pos += 1 + end + _iseof(data, pos) && _err("unterminated attribute value", start) + pos += 1 # skip closing quote + + next_state = (mode == _M_XML_DECL_VALUE) ? 
_M_XML_DECL : _M_TAG + tok = Token(TOKEN_ATTR_VALUE, @inbounds SubString(data, start, pos - 1)) + (tok, TokenizerState(pos, next_state, _no_token(data))) +end + +#-----------------------------------------------------------------------# Content bodies (comment, CDATA, PI, DOCTYPE) +function _read_comment_body(data, pos) + start = pos + @inbounds while !_iseof(data, pos) + if _peek(data, pos) == UInt8('-') && + _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8('-') && + _canpeek(data, pos, 2) && _peek(data, pos + 2) == UInt8('>') + content_end = prevind(data, pos) + close_start = pos + pos += 3 + pending = Token(TOKEN_COMMENT_CLOSE, SubString(data, close_start, pos - 1)) + tok = Token(TOKEN_COMMENT_CONTENT, SubString(data, start, content_end)) + return (tok, TokenizerState(pos, _M_DEFAULT, pending)) + end + pos += 1 + end + _err("unterminated comment", start) +end + +function _read_cdata_body(data, pos) + start = pos + @inbounds while !_iseof(data, pos) + if _peek(data, pos) == UInt8(']') && + _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8(']') && + _canpeek(data, pos, 2) && _peek(data, pos + 2) == UInt8('>') + content_end = prevind(data, pos) + close_start = pos + pos += 3 + pending = Token(TOKEN_CDATA_CLOSE, SubString(data, close_start, pos - 1)) + tok = Token(TOKEN_CDATA_CONTENT, SubString(data, start, content_end)) + return (tok, TokenizerState(pos, _M_DEFAULT, pending)) + end + pos += 1 + end + _err("unterminated CDATA section", start) +end + +function _read_pi_body(data, pos) + start = pos + @inbounds while !_iseof(data, pos) + if _peek(data, pos) == UInt8('?') && _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8('>') + content_end = prevind(data, pos) + close_start = pos + pos += 2 + pending = Token(TOKEN_PI_CLOSE, SubString(data, close_start, pos - 1)) + tok = Token(TOKEN_PI_CONTENT, SubString(data, start, content_end)) + return (tok, TokenizerState(pos, _M_DEFAULT, pending)) + end + pos += 1 + end + _err("unterminated processing 
instruction", start)
+end
+
+function _read_doctype_body(data, pos)  # returns the DOCTYPE_CONTENT token, with the closing '>' queued as the pending token
+    start = pos
+    depth = 0  # '[' ... ']' nesting; a '>' only ends the declaration at depth 0
+    @inbounds while !_iseof(data, pos)
+        b = _peek(data, pos)
+        if b == UInt8('-') && _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8('-') &&
+           pos >= 3 &&
+           codeunit(data, pos - 1) == UInt8('!') &&
+           codeunit(data, pos - 2) == UInt8('<')
+            # Inside a <!-- comment: skip until --> (pos >= 3 keeps the unchecked pos - 2 read in bounds)
+            pos += 2 # skip "--"
+            while !_iseof(data, pos)
+                if _peek(data, pos) == UInt8('-') && _canpeek(data, pos, 1) && _peek(data, pos + 1) == UInt8('-') &&
+                   _canpeek(data, pos, 2) && _peek(data, pos + 2) == UInt8('>')
+                    pos += 3 # skip "-->"
+                    break
+                end
+                pos += 1
+            end
+        elseif b == UInt8('"') || b == UInt8('\'')
+            pos = _skip_quoted(data, pos)
+        elseif b == UInt8('[')
+            depth += 1
+            pos += 1
+        elseif b == UInt8(']')
+            depth -= 1
+            pos += 1
+        elseif b == UInt8('>') && depth == 0
+            content_end = prevind(data, pos)
+            close_start = pos
+            pos += 1
+            pending = Token(TOKEN_DOCTYPE_CLOSE, @inbounds SubString(data, close_start, pos - 1))
+            tok = Token(TOKEN_DOCTYPE_CONTENT, @inbounds SubString(data, start, content_end))
+            return (tok, TokenizerState(pos, _M_DEFAULT, pending))
+        else
+            pos += 1
+        end
+    end
+    _err("unterminated DOCTYPE", start)
+end
+
+#-----------------------------------------------------------------------# Utility functions
+
+"""
+    tag_name(token::Token) -> SubString{String}
+
+Extract the element name from an `OPEN_TAG` or `CLOSE_TAG` token.
+"""
+function tag_name(token::Token)
+    if token.kind == TOKEN_OPEN_TAG
+        @inbounds SubString(token.raw, 2, ncodeunits(token.raw)) # skip '<'
+    elseif token.kind == TOKEN_CLOSE_TAG
+        @inbounds SubString(token.raw, 3, ncodeunits(token.raw)) # skip '</'
+    else
+        throw(ArgumentError("tag_name requires OPEN_TAG or CLOSE_TAG, got $(token.kind)"))
+    end
+end
+
+"""
+    attr_value(token::Token) -> SubString{String}
+
+Strip the surrounding quotes from an `ATTR_VALUE` token.
+""" +function attr_value(token::Token) + token.kind == TOKEN_ATTR_VALUE || + throw(ArgumentError("attr_value requires ATTR_VALUE, got $(token.kind)")) + @inbounds SubString(token.raw, 2, prevind(token.raw, lastindex(token.raw))) +end + +""" + pi_target(token::Token) -> SubString{String} + +Extract the target name from a `PI_OPEN` or `XML_DECL_OPEN` token. +""" +function pi_target(token::Token) + (token.kind == TOKEN_PI_OPEN || token.kind == TOKEN_XML_DECL_OPEN) || + throw(ArgumentError("pi_target requires PI_OPEN or XML_DECL_OPEN, got $(token.kind)")) + @inbounds SubString(token.raw, 3, ncodeunits(token.raw)) # skip '<?' +end + +end # module XMLTokenizer diff --git a/src/dtd.jl b/src/dtd.jl deleted file mode 100644 index 58299f0..0000000 --- a/src/dtd.jl +++ /dev/null @@ -1,141 +0,0 @@ -# This is all a work in progress - -#-----------------------------------------------------------------------------# position_after -function position_after(needle::Vector{UInt8}, haystack::Vector{UInt8}, i) - x = findnext(needle, haystack, i) - isnothing(x) ? nothing : x[end] + 1 -end - -position_after(needle::String, haystack::Vector{UInt8}, i) = position_after(Vector{UInt8}(needle), haystack, i) - - -#-----------------------------------------------------------------------------# DeclaredElement -struct DeclaredElement - name::String - content::String # "ANY", "EMPTY", or "(children...)" - function DeclaredElement(name, content) - content in ("ANY", "EMPTY") || (content[1] == '(' && content[end] == ')') || - error("DeclaredElement `content` must be 'ANY', 'EMPTY', or '(children...)'. 
Got $content.") - new(name, content) - end -end -Base.show(io::IO, o::DeclaredElement) = print(io, "<!ELEMENT ", o.name, " ", o.content, ">") - -function get_declared_elements(data::Vector{UInt8}) - i = position_after("<!ELEMENT", data, 1) - out = DeclaredElement[] - while !isnothing(i) - name, i = get_name(data, i + 1) - i = findnext(!isspace, data, i) - if data[i] == UInt8('(') - j = findnext(==(UInt8(')')), data, i + 1) - content = String(data[i:j]) - else - content, i = get_name(data, i) - end - push!(out, DeclaredElement(name, content)) - i = position_after("<!ELEMENT", data, i) - end - return out -end - -#-----------------------------------------------------------------------------# DeclaredAttribute -struct DeclaredAttribute - element_name::String - attribute_name::String - attribute_type::String - attribute_value::String -end -Base.show(io::IO, o::DeclaredAttribute) = print(io, "<!ATTLIST ", o.element_name, " ", o.attribute_name, " ", o.attribute_type, " ", o.attribute_value, ">") - - -function get_declared_attributes(data) - i = position_after("<!ATTLIST", data, 1) - out = DeclaredAttribute[] - while !isnothing(i) - element_name, i = get_name(data, i) - attribute_name, i = get_name(data, i) - i = findnext(!isspace, data, i) - attribute_type = if data[i] == UInt('(') - j = findnext(==(UInt8(')')), data, i) - String(data[i:j]) - i = j + 1 - else - nm, i = get_name(data, i) - nm - end - i = findnext(!isspace, data, i) - is_hash = data[i] == UInt8('#') - val, i = get_name(data, i) - attribute_value = is_hash ? '#' * val : val - push!(out, DeclaredAttribute(element_name, attribute_name, attribute_type, attribute_value)) - i = position_after("<!ATTLIST", data, i) - end - return out -end - -#-----------------------------------------------------------------------------# DeclaredEntity -struct DeclaredEntity - name::String - external::Bool - value::String -end -function Base.show(io::IO, o::DeclaredEntity) - print(io, "<!ENTITY ", o.name, " ", o.external ? 
"SYSTEM" : "", repr(o.value), ">") -end - -function get_declared_entities(data) - i = position_after("<!ENTITY", data, 1) - out = DeclaredEntity[] - while !isnothing(i) - name, i = get_name(data, i) - value, i = get_name(data, i) - external = value == "SYSTEM" - if external - value, i = get_name(data, i) - end - push!(out, DeclaredEntity(name, external, value)) - i = position_after("<!ENTITY", data, i) - end - return out -end - -#-----------------------------------------------------------------------------# DTDBody -struct DTDBody - elements::Vector{DeclaredElement} - attributes::Vector{DeclaredAttribute} - entities::Vector{DeclaredEntity} -end - -function Base.show(io::IO, o::DTDBody) - printstyled(io, "DTDBody\n", color=:light_cyan) - printstyled(io, " DeclaredElements (", length(o.elements), ")\n", color=:light_green) - foreach(x -> println(io, " ", x), o.elements) - printstyled(io, " DeclaredAttributes (", length(o.attributes), ")\n", color=:light_green) - foreach(x -> println(io, " ", x), o.attributes) - printstyled(io, " DeclaredEntities (", length(o.entities), ")\n", color=:light_green) - foreach(x -> println(io, " ", x), o.entities) -end - - -function DTDBody(data::Vector{UInt8}, file = false) - file && @goto isfile - i = position_after("<!DOCTYPE", data, 1) - root, i = get_name(data, i) - - i = findnext(==(UInt8('[')), data, i) - isnothing(i) && return DTDBody(root, [], [], []) - - @label isfile - elements = get_declared_elements(data) - attributes = get_declared_attributes(data) - entities = get_declared_entities(data) - return DTDBody(root, elements, attributes, entities) -end - - -Base.read(filename::String, ::Type{DTDBody}) = DTDBody(read(filename), true) -Base.read(io::IO, ::Type{DTDBody}) = DTDBody(read(io), true) - -Base.parse(s::AbstractString, ::Type{DTDBody}) = DTDBody(Vector{UInt8}(s)) -Base.parse(::Type{DTDBody}, s::AbstractString) = parse(s, DTDBody) diff --git a/src/lazynode.jl b/src/lazynode.jl new file mode 100644 index 0000000..2f7fea5 --- 
/dev/null +++ b/src/lazynode.jl @@ -0,0 +1,267 @@ +#-----------------------------------------------------------------------------# LazyNode +""" + LazyNode + +A lightweight, read-only view into an XML document that navigates the token stream on demand +instead of building a full tree in memory. + + doc = parse(xml_string, LazyNode) + doc = read("file.xml", LazyNode) + +Supports the same read-only interface as `Node`: [`nodetype`](@ref), [`tag`](@ref), +[`attributes`](@ref), [`value`](@ref), [`children`](@ref), plus integer and string indexing. +""" +struct LazyNode{S <: AbstractString} + data::S + token::Token{S} + nodetype::NodeType +end + +function LazyNode(data::S, nt::NodeType) where {S <: AbstractString} + LazyNode{S}(data, Token(TOKEN_TEXT, SubString(data, 1, 0)), nt) +end + +nodetype(n::LazyNode) = n.nodetype + +_lazy_pos(n::LazyNode) = n.token.raw.offset + 1 +_lazy_tokenizer(n::LazyNode) = tokenize(n.data, _lazy_pos(n)) + +#-----------------------------------------------------------------------------# tag / value +function tag(n::LazyNode) + nt = n.nodetype + if nt === Element + return String(tag_name(n.token)) + elseif nt === ProcessingInstruction + return String(pi_target(n.token)) + end + nothing +end + +function value(n::LazyNode) + nt = n.nodetype + if nt === Text + return unescape(n.token.raw) + elseif nt === Comment + iter = _lazy_tokenizer(n) + iterate(iter) # COMMENT_OPEN + return String(iterate(iter)[1].raw) + elseif nt === CData + iter = _lazy_tokenizer(n) + iterate(iter) # CDATA_OPEN + return String(iterate(iter)[1].raw) + elseif nt === DTD + iter = _lazy_tokenizer(n) + iterate(iter) # DOCTYPE_OPEN + return String(lstrip(iterate(iter)[1].raw)) + elseif nt === ProcessingInstruction + iter = _lazy_tokenizer(n) + iterate(iter) # PI_OPEN + result = iterate(iter) + result === nothing && return nothing + result[1].kind === TOKEN_PI_CONTENT || return nothing + content = strip(result[1].raw) + return isempty(content) ? 
nothing : String(content) + end + nothing +end + +#-----------------------------------------------------------------------------# attributes +function attributes(n::LazyNode) + n.nodetype in (Element, Declaration) || return nothing + iter = _lazy_tokenizer(n) + iterate(iter) # skip OPEN_TAG or XML_DECL_OPEN + attrs = Pair{String,String}[] + for tok in iter + tok.kind === TOKEN_ATTR_NAME || break + name = String(tok.raw) + result = iterate(iter) + result === nothing && break + push!(attrs, name => unescape(attr_value(result[1]))) + end + isempty(attrs) ? nothing : Attributes(attrs) +end + +function Base.get(n::LazyNode, key::AbstractString, default) + n.nodetype in (Element, Declaration) || return default + iter = _lazy_tokenizer(n) + iterate(iter) # skip OPEN_TAG or XML_DECL_OPEN + for tok in iter + tok.kind === TOKEN_ATTR_NAME || return default + if tok.raw == key + result = iterate(iter) + result === nothing && return default + return unescape(attr_value(result[1])) + else + iterate(iter) # skip value + end + end + default +end + +function Base.getindex(n::LazyNode, key::AbstractString) + val = get(n, key, _MISSING_ATTR) + val === _MISSING_ATTR && throw(KeyError(key)) + val +end + +function Base.haskey(n::LazyNode, key::AbstractString) + get(n, key, _MISSING_ATTR) !== _MISSING_ATTR +end + +function Base.keys(n::LazyNode) + n.nodetype in (Element, Declaration) || return () + iter = _lazy_tokenizer(n) + iterate(iter) + result = String[] + for tok in iter + tok.kind === TOKEN_ATTR_NAME || break + push!(result, String(tok.raw)) + iterate(iter) # skip value + end + result +end + +#-----------------------------------------------------------------------------# children +function children(n::LazyNode{S}) where {S} + nt = n.nodetype + if nt === Document + return _lazy_collect_children(n.data, _lazy_tokenizer(n)) + elseif nt !== Element + return () + end + iter = _lazy_tokenizer(n) + for tok in iter + tok.kind === TOKEN_SELF_CLOSE && return LazyNode{S}[] + tok.kind === 
TOKEN_TAG_CLOSE && break + end + _lazy_collect_children(n.data, iter) +end + +function _lazy_collect_children(data::S, iter) where {S <: AbstractString} + result = LazyNode{S}[] + for tok in iter + k = tok.kind + if k === TOKEN_TEXT + push!(result, LazyNode(data, tok, Text)) + elseif k === TOKEN_OPEN_TAG + push!(result, LazyNode(data, tok, Element)) + _lazy_skip_element!(iter) + elseif k === TOKEN_COMMENT_OPEN + push!(result, LazyNode(data, tok, Comment)) + _lazy_skip_until!(iter, TOKEN_COMMENT_CLOSE) + elseif k === TOKEN_CDATA_OPEN + push!(result, LazyNode(data, tok, CData)) + _lazy_skip_until!(iter, TOKEN_CDATA_CLOSE) + elseif k === TOKEN_PI_OPEN + push!(result, LazyNode(data, tok, ProcessingInstruction)) + _lazy_skip_until!(iter, TOKEN_PI_CLOSE) + elseif k === TOKEN_XML_DECL_OPEN + push!(result, LazyNode(data, tok, Declaration)) + _lazy_skip_until!(iter, TOKEN_XML_DECL_CLOSE) + elseif k === TOKEN_DOCTYPE_OPEN + push!(result, LazyNode(data, tok, DTD)) + _lazy_skip_until!(iter, TOKEN_DOCTYPE_CLOSE) + elseif k === TOKEN_CLOSE_TAG + break + end + end + result +end + +function _lazy_skip_element!(iter) + depth = 1 + for tok in iter + k = tok.kind + if k === TOKEN_OPEN_TAG + depth += 1 + elseif k === TOKEN_SELF_CLOSE + depth -= 1 + depth == 0 && return + elseif k === TOKEN_CLOSE_TAG + depth -= 1 + if depth == 0 + iterate(iter) # consume trailing TAG_CLOSE + return + end + end + end +end + +function _lazy_skip_until!(iter, target::TokenKind) + for tok in iter + tok.kind === target && return + end +end + +#-----------------------------------------------------------------------------# is_simple / simple_value +function is_simple(n::LazyNode) + n.nodetype === Element || return false + attrs = attributes(n) + (!isnothing(attrs) && !isempty(attrs)) && return false + ch = children(n) + length(ch) == 1 && ch[1].nodetype in (Text, CData) +end + +function simple_value(n::LazyNode) + n.nodetype === Element || error("`simple_value` is only defined for simple nodes.") + attrs = 
# Value of the single Text/CData child of a simple element. Raises an error
# for anything that does not satisfy the same conditions `is_simple` checks.
function simple_value(n::LazyNode)
    msg = "`simple_value` is only defined for simple nodes."
    n.nodetype === Element || error(msg)
    attrs = attributes(n)
    if !(isnothing(attrs) || isempty(attrs))
        error(msg)
    end
    kids = children(n)
    if !(length(kids) == 1 && kids[1].nodetype in (Text, CData))
        error(msg)
    end
    return value(kids[1])
end

#-----------------------------------------------------------------------------# indexing
# Integer/colon indexing and size queries forward to `children(n)`; each call
# recomputes the children.
Base.getindex(n::LazyNode, i::Integer) = getindex(children(n), i)
Base.getindex(n::LazyNode, ::Colon) = children(n)
Base.lastindex(n::LazyNode) = lastindex(children(n))
Base.only(n::LazyNode) = only(children(n))
Base.length(n::LazyNode) = length(children(n))

#-----------------------------------------------------------------------------# parse / read
# Both argument orders of `parse` are accepted; the input is copied into a
# `String` and wrapped in a Document-typed LazyNode.
function Base.parse(::Type{LazyNode}, xml::AbstractString)
    return parse(xml, LazyNode)
end
function Base.parse(xml::AbstractString, ::Type{LazyNode})
    return LazyNode(String(xml), Document)
end

# Reading loads the whole file / stream into a `String` before wrapping it.
function Base.read(filename::AbstractString, ::Type{LazyNode})
    return parse(read(filename, String), LazyNode)
end
function Base.read(io::IO, ::Type{LazyNode})
    return parse(read(io, String), LazyNode)
end

#-----------------------------------------------------------------------------# show
# One-line summary: "Lazy <NodeType>" followed by a type-specific rendering.
function Base.show(io::IO, n::LazyNode)
    kind = n.nodetype
    print(io, "Lazy ", kind)

    # Writes ` key="value"` for each attribute of `n`; writes nothing if there are none.
    function print_attrs()
        attrs = attributes(n)
        isnothing(attrs) && return
        for (k, v) in attrs
            print(io, ' ', k, '=', '"', v, '"')
        end
    end

    if kind === Text
        print(io, ' ', repr(value(n)))
    elseif kind === Element
        print(io, " <", tag(n))
        print_attrs()
        print(io, '>')
    elseif kind === DTD
        print(io, " <!DOCTYPE ", value(n), '>')
    elseif kind === Declaration
        print(io, " <?xml")
        print_attrs()
        print(io, "?>")
    elseif kind === ProcessingInstruction
        print(io, " <?", tag(n))
        content = value(n)
        isnothing(content) || print(io, ' ', content)
        print(io, "?>")
    elseif kind === Comment
        print(io, " <!--", value(n), "-->")
    elseif kind === CData
        print(io, " <![CDATA[", value(n), "]]>")
    elseif kind === Document
        # Only the number of children is shown, and only when there is at least one.
        n_kids = length(children(n))
        n_kids > 0 && print(io, n_kids == 1 ? " (1 child)" : " ($n_kids children)")
    end
end
- - String(o::Raw) --> String of the chunk. - - next(o::Raw) --> Raw of the next chunk (or `nothing`). - - prev(o::Raw) --> Raw of the previous chunk (or `nothing`). - - tag(o::Raw) --> String of the tag name (or `nothing`). - - attributes(o::Raw) --> OrderedDict{String, String} of the attributes (or `nothing`). - - value(o::Raw) --> String of the value (or `nothing`). - - children(o::Raw) --> Vector{Raw} of the children (or `nothing`). - - parent(o::Raw) --> Raw of the parent (or `nothing`) - - depth(o::Raw) --> Int of the depth of the node in the XML DOM. -""" -struct Raw - type::RawType - depth::Int - pos::Int - len::Int - data::Vector{UInt8} - ctx::Vector{Bool} # Context for xml:space (Vector to support inheritance of context) - has_xml_space::Bool # Whether data contains `xml:space` attribute at least once -end -function Raw(data::Vector{UInt8})#, ctx::Vector{Bool}=Bool[false]) - needle = Vector{UInt8}("xml:space") - has_xml_space = findfirst(needle, data) !== nothing - return Raw(RawDocument, 0, 0, 0, data, [false], has_xml_space) -end -function Raw(data::Vector{UInt8}, has_xml_space::Bool, ctx::Vector{Bool}=Bool[false]) - return Raw(RawDocument, 0, 0, 0, data, ctx, has_xml_space) -end - -const _RAW_INDEX = WeakKeyDict{Vector{UInt8}, Any}() - -struct _TokRec - type::RawType - depth::Int - pos::Int - len::Int - ctx::Vector{Bool} -end - -mutable struct _Index - recs::Vector{_TokRec} - last_raw::Raw - built_end::Int -end - -Base.read(filename::String, ::Type{Raw}) = isfile(filename) ? 
- Raw(Mmap.mmap(filename)) : - error("File \"$filename\" does not exist.") - -Base.read(io::IO, ::Type{Raw}) = Raw(read(io)) - -Base.parse(x::AbstractString, ::Type{Raw}) = Raw(Vector{UInt8}(x)) - -# Mostly for debugging -Base.peek(o::Raw, n::Int) = String(view(o.data[o.pos+o.len+1:min(end, o.pos + o.len + n + 1)])) - -function Base.show(io::IO, o::Raw) - print(io, o.type, ':', o.depth, " (pos=", o.pos, ", len=", o.len, ")") - o.len > 0 && printstyled(io, ": ", String(o); color=:light_green) -end -function Base.:(==)(a::Raw, b::Raw) - a.type == b.type && a.depth == b.depth && a.pos == b.pos && a.len == b.len && a.data === b.data && a.ctx == b.ctx && a.has_xml_space == b.has_xml_space -end - -Base.view(o::Raw) = view(o.data, o.pos:o.pos+o.len) -Base.String(o::Raw) = String(view(o)) - -Base.IteratorSize(::Type{Raw}) = Base.SizeUnknown() -Base.eltype(::Type{Raw}) = Raw - -function Base.iterate(o::Raw, state=o) - n = next(state) - return isnothing(n) ? nothing : (n, n) -end - -is_node(o::Raw) = o.type !== RawElementClose -xml_nodes(o::Raw) = Iterators.Filter(is_node, o) - -#-----------------------------------------------------------------------------# get_name -is_name_start_char(x::UInt8) = x in UInt8('A'):UInt8('Z') || x in UInt8('a'):UInt8('z') || x == UInt8('_') -is_name_char(x::UInt8) = is_name_start_char(x) || x in UInt8('0'):UInt8('9') || x == UInt8('-') || x == UInt8('.') || x == UInt8(':') - -name_start(data, i) = findnext(is_name_start_char, data, i) -name_stop(data, i) = findnext(!is_name_char, data, i) - 1 - -function get_name(data, i) - i = name_start(data, i) - j = name_stop(data, i) - @views String(data[i:j]), j + 1 -end - -#-----------------------------------------------------------------------------# get_attributes -# starting at position i, return attributes up until the next '>' or '?' 
(DTD) -function get_attributes(data, i, j) - i = name_start(data, i) - (isnothing(j) || isnothing(i) || i > j) && return nothing - out = OrderedDict{String,String}() - while !isnothing(i) && i < j - key, i = get_name(data, i) - # get quotechar the value is wrapped in (either ' or ") - i = findnext(x -> x === UInt8('"') || x === UInt8('''), data, i + 1) - quotechar = data[i] - i2 = findnext(==(quotechar), data, i + 1) - @views value = String(data[i+1:i2-1]) - out[key] = value - i = name_start(data, i2) - end - return out -end - -# ----------------------------------------------------------------------------# Utilities supporting prev -function _get_or_init_index(o::Raw) - idx = get(_RAW_INDEX, o.data, nothing) - if idx === nothing - start = Raw(o.data) # fresh RawDocument - _RAW_INDEX[o.data] = _Index(_TokRec[], start, 0) - idx = _RAW_INDEX[o.data] - end - return idx -end -function _ensure_index_upto!(o::Raw, target_pos::Int) - idx = _get_or_init_index(o) - r = idx.last_raw - while true - n = next(r) - if n === nothing - idx.built_end = typemax(Int) - idx.last_raw = r - return idx - end - push!(idx.recs, _TokRec(n.type, n.depth, n.pos, n.len, copy(n.ctx))) - endpos = n.pos + n.len - idx.built_end = endpos - idx.last_raw = n - r = n - if endpos >= target_pos - return idx - end - end -end -function _find_prev_token(recs::Vector{_TokRec}, p::Int) - lo, hi = 1, length(recs) - ans = 0 - while lo <= hi - mid = (lo + hi) >>> 1 - endpos = recs[mid].pos + recs[mid].len - if endpos < p + 1 - ans = mid - lo = mid + 1 - else - hi = mid - 1 - end - end - return ans == 0 ? 
nothing : recs[ans] -end - -#-----------------------------------------------------------------------------# update xml:space context -# check attributes for xml:space and update ctx if necessary -function get_ctx(o) - att = attributes(o) - if !isnothing(att) && haskey(att, "xml:space") - if att["xml:space"] == "preserve" - return true - elseif att["xml:space"] == "default" - return false - else - error("Invalid value for xml:space attribute: $(att["xml:space"]). Must be 'preserve' or 'default'.") - end - end - return nothing -end -function update_ctx!(ctx, o) - new_ctx = get_ctx(o) - if new_ctx !== nothing - ctx[end] = new_ctx - end - return nothing -end - -#-----------------------------------------------------------------------------# interface -""" - nodetype(node) --> XML.NodeType - -Return the `XML.NodeType` of the node. -""" -nodetype(o::Raw) = nodetype(o.type) - -""" - tag(node) --> String or Nothing - -Return the tag name of `Element` and `PROCESSING_INSTRUCTION` nodes. -""" -function tag(o::Raw) - o.type ∉ [RawElementOpen, RawElementClose, RawElementSelfClosed, RawProcessingInstruction] && return nothing - return get_name(o.data, o.pos + 1)[1] -end - -""" - attributes(node) --> OrderedDict{String, String} or Nothing - -Return the attributes of `Element`, `Declaration`, or `ProcessingInstruction` nodes. -""" -function attributes(o::Raw) - if o.type === RawElementOpen || o.type === RawElementSelfClosed || o.type === RawProcessingInstruction - i = o.pos - i = name_start(o.data, i) - i = name_stop(o.data, i) - get_attributes(o.data, i + 1, o.pos + o.len) - elseif o.type === RawDeclaration - get_attributes(o.data, o.pos + 6, o.pos + o.len) - else - nothing - end -end - -""" - value(node) --> String or Nothing - -Return the value of `Text`, `CData`, `Comment`, or `DTD` nodes. 
-""" -function value(o::Raw) - if o.type === RawText - String(o) - elseif o.type === RawCData - String(view(o.data, o.pos+length("<![CData["):o.pos+o.len-3)) - elseif o.type === RawComment - String(view(o.data, o.pos+length("<!--"):o.pos+o.len-3)) - elseif o.type === RawDTD - String(view(o.data, o.pos+length("<!DOCTYPE "):o.pos+o.len-1)) - else - nothing - end -end - -""" - children(node) --> Vector{typeof(node)} - -Return the children the node. Will only be nonempty for `Element` and `Document` nodes. -""" -function children(o::Raw) - if o.type === RawElementOpen || o.type === RawDocument - depth = o.depth - out = Raw[] - for item in xml_nodes(o) - if item.depth == depth + 1 - push!(out, item) - end - item.depth == depth && break - o.type === RawDocument && item.depth == 2 && break # break if we've seen the doc root - end - out - else - Raw[] - end -end - -""" - depth(node) --> Int - -Return the depth of the node. Will be `0` for `Document` nodes. Not defined for `XML.Node`. -""" -function depth(o::Raw) - o.depth -end - -""" - parent(node) --> typeof(node), Nothing - -Return the parent of the node. Will be `nothing` for `Document` nodes. Not defined for `XML.Node`. -""" -function parent(o::Raw) - depth = o.depth - depth === 0 && return nothing - p = prev(o) - while p.depth >= depth - p = prev(p) - end - return p -end - -#-----------------------------------------------------------------------------# next Raw -# isspace(x::UInt8) = Base.isspace(Char(x)) - -# XML whitespace per XML 1.0/1.1 production S: -# S ::= (#x20 | #x9 | #xD | #xA)+ -@inline xml_isspace(b::UInt8)::Bool = (b == 0x20) | (b == 0x09) | (b == 0x0A) | (b == 0x0D) - -""" - next(node) --> typeof(node) or Nothing - -Return the next node in the document during depth-first traversal. Depth-first is the order you -would visit nodes by reading top-down through an XML file. Not defined for `XML.Node`. 
-""" -function next(o::Raw) - if o.has_xml_space # using xml:space context at least once in data - return next_xml_space(o) - else # not using xml:space context at all (same as v0.3.5) - return next_no_xml_space(o) - end -end - -function next_xml_space(o::Raw) - i = o.pos + o.len + 1 - depth = o.depth - data = o.data - type = o.type - has_xml_space = o.has_xml_space - ctx = copy(o.ctx) - last_type = type - k = findnext(!xml_isspace, data, i) - if isnothing(k) - return nothing - end - if last_type === RawElementOpen || last_type === RawDocument - depth += 1 - push!(ctx, ctx[end]) # inherit the xml:space context from parent - last_type === RawElementOpen && update_ctx!(ctx, o) # check attributes for xml:space and update if necessary - end - i = ctx[end] ? i : k - b = i > 1 ? Char(o.data[i-1]) : Char('<') - c = Char(o.data[i]) - d = Char(o.data[k+1]) - if c !== '<' || ctx[end] && c === '<' && b === ' ' && last_type === RawElementOpen && d === '/' - type = RawText - j = findnext(==(UInt8('<')), data, i) - 1 - j = ctx[end] ? j : findprev(!xml_isspace, data, j) # preserving whitespace if needed - if last_type === RawElementClose || last_type === RawElementSelfClosed|| last_type === RawDocument - # Maybe drop pure-whitespace inter-element text nodes? - # (e.g. whitespace between a closing and an opening tag which would otherwise make an orphan text node) - #if all(xml_isspace, @view data[i:j]) && depth > 1 - # return next(Raw(type, depth, j, 0, data, ctx, has_xml_space)) - #end - end - else - i = k - j = k + 1 - if c === '<' - c2 = Char(o.data[i+1]) - if c2 === '!' 
- c3 = Char(o.data[i+2]) - if c3 === '-' - type = RawComment - j = findnext(Vector{UInt8}("-->"), data, i)[end] - elseif c3 === '[' - type = RawCData - j = findnext(Vector{UInt8}("]]>"), data, i)[end] - elseif c3 === 'D' || c3 == 'd' - type = RawDTD - j = findnext(==(UInt8('>')), data, i) - while sum(==(UInt8('>')), @view data[k:j]) != sum(==(UInt8('<')), @view data[i:j]) - j = findnext(==(UInt8('>')), data, j + 1) - end - end - elseif c2 === '?' - if get_name(data, i + 2)[1] == "xml" - type = RawDeclaration - else - type = RawProcessingInstruction - end - j = findnext(Vector{UInt8}("?>"), data, i)[end] - elseif c2 === '/' - type = RawElementClose - depth -= 1 - pop!(ctx) # revert to parent xml:space context - j = findnext(==(UInt8('>')), data, i) - else - j = findnext(==(UInt8('>')), data, i) - if data[j-1] === UInt8('/') - type = RawElementSelfClosed - else - type = RawElementOpen - end - end - end - end - return Raw(type, depth, i, j - i, data, ctx, has_xml_space) -end - -function next_no_xml_space(o::Raw) # same as v0.3.5 - i = o.pos + o.len + 1 - depth = o.depth - data = o.data - type = o.type - has_xml_space = o.has_xml_space - ctx = [false] - i = findnext(!xml_isspace, data, i) - if isnothing(i) - return nothing - end - if type === RawElementOpen || type === RawDocument - depth += 1 - end - c = Char(o.data[i]) - d = Char(o.data[i+1]) - if c !== '<' - type = RawText - j = findnext(==(UInt8('<')), data, i) - 1 - j = findprev(!xml_isspace, data, j) # "rstrip" - elseif c === '<' - c2 = Char(o.data[i+1]) - if c2 === '!' - c3 = Char(o.data[i+2]) - if c3 === '-' - type = RawComment - j = findnext(Vector{UInt8}("-->"), data, i)[end] - elseif c3 === '[' - type = RawCData - j = findnext(Vector{UInt8}("]]>"), data, i)[end] - elseif c3 === 'D' || c3 == 'd' - type = RawDTD - j = findnext(==(UInt8('>')), data, i) - while sum(==(UInt8('>')), @view data[i:j]) != sum(==(UInt8('<')), @view data[i:j]) - j = findnext(==(UInt8('>')), data, j + 1) - end - end - elseif c2 === '?' 
- if get_name(data, i + 2)[1] == "xml" - type = RawDeclaration - else - type = RawProcessingInstruction - end - j = findnext(Vector{UInt8}("?>"), data, i)[end] - elseif c2 === '/' - type = RawElementClose - depth -= 1 - j = findnext(==(UInt8('>')), data, i) - else - j = findnext(==(UInt8('>')), data, i) - if data[j-1] === UInt8('/') - type = RawElementSelfClosed - else - type = RawElementOpen - end - end - end - return Raw(type, depth, i, j - i, data, ctx, has_xml_space) -end - -#-----------------------------------------------------------------------------# prev Raw -""" - prev(node) --> typeof(node), Nothing, or Missing (only for XML.Node) - -Return the previous node in the document during depth-first traversal. Not defined for `XML.Node`. -""" -function prev(o::Raw) - if o.has_xml_space # using xml:space context at least once in data - return prev_xml_space(o) - else # not using xml:space context at all (same as v0.3.5) - return prev_no_xml_space(o) - end -end - -function prev_xml_space(o::Raw) - o.type === RawDocument && return nothing - - idx = _ensure_index_upto!(o, o.pos - 1) - rec = _find_prev_token(idx.recs, o.pos - 1) - if rec === nothing - return Raw(o.data, o.has_xml_space, copy(o.ctx)) - end - return Raw(rec.type, rec.depth, rec.pos, rec.len, o.data, copy(rec.ctx), o.has_xml_space) -end -function prev_no_xml_space(o::Raw) # same as v0.3.5 - depth = o.depth - data = o.data - type = o.type - has_xml_space = o.has_xml_space - ctx = has_xml_space ? 
copy(o.ctx) : [false] - type === RawDocument && return nothing - j = o.pos - 1 - j = findprev(!xml_isspace, data, j) - if isnothing(j) - return Raw(data, has_xml_space, ctx) # RawDocument - end - c = Char(o.data[j]) - next_type = type - if c !== '>' # text - type = RawText - i = findprev(==(UInt8('>')), data, j) + 1 - i = findnext(!xml_isspace, data, i) # "lstrip" - elseif c === '>' - c2 = Char(o.data[j-1]) - if c2 === '-' - type = RawComment - i = findprev(Vector{UInt8}("<--"), data, j)[1] - elseif c2 === ']' - type = RawCData - i = findprev(Vector{UInt8}("<![CData["), data, j)[1] - elseif c2 === '?' - i = findprev(Vector{UInt8}("<?"), data, j)[1] - if get_name(data, i + 2)[1] == "xml" - type = RawDeclaration - else - type = RawProcessingInstruction - end - else - i = findprev(==(UInt8('<')), data, j) - char = Char(data[i+1]) - if char === '/' - type = RawElementClose - elseif char === '!' - type = DTD - elseif isletter(char) || char === '_' - type = Char(o.data[j-2]) === '/' ? RawElementSelfClosed : RawElementOpen - else - error("Should be unreachable. Unexpected data: <$char ... $c3$c2$c1>.") - end - end - else - error("Unreachable reached in XML.prev") - end - if type !== RawElementOpen && next_type === RawElementClose - depth += 1 - elseif type === RawElementOpen && next_type !== RawElementClose - depth -= 1 - end - return Raw(type, depth, i, j - i, data, ctx, has_xml_space) -end - diff --git a/src/xpath.jl b/src/xpath.jl new file mode 100644 index 0000000..e56b38d --- /dev/null +++ b/src/xpath.jl @@ -0,0 +1,303 @@ +#-----------------------------------------------------------------------------# XPath +# A subset of XPath 1.0 for querying XML.Node trees. +# +# Supported syntax: +# / root (absolute path) +# tag child element by name +# * any child element +# // descendant-or-self (recursive) +# . current node +# .. 
#-----------------------------------------------------------------------------# Token types

# Kinds of tokens produced by `_xpath_tokenize`.
@enum XPathTokenKind::UInt8 begin
    XPATH_ROOT          # /
    XPATH_DESCENDANT    # //
    XPATH_NAME          # tag name
    XPATH_WILDCARD      # *
    XPATH_DOT           # .
    XPATH_DOTDOT        # ..
    XPATH_TEXT_FN       # text()
    XPATH_NODE_FN       # node()
    XPATH_PREDICATE     # [...]
    XPATH_ATTRIBUTE     # @attr  (in result position)
end

# One lexical token: its kind and its text. For XPATH_PREDICATE the text is the
# content between the brackets; for XPATH_ATTRIBUTE it is the name without `@`.
struct XPathToken
    kind::XPathTokenKind
    value::String
end

#-----------------------------------------------------------------------------# Tokenizer

# Returns the index of the first character at or after `j` that cannot be part
# of a name (or `ncodeunits(s) + 1` when the name runs to the end of `s`).
# `j` must be a valid character index or `ncodeunits(s) + 1`. Element names may
# contain '.', attribute names scanned after '@' may not (`allow_dot`).
function _xpath_name_end(s::String, j::Int, allow_dot::Bool)
    n = ncodeunits(s)
    while j <= n
        c = s[j]
        ok = isletter(c) || isdigit(c) || c == '-' || c == '_' || c == ':' ||
             (allow_dot && c == '.')
        ok || break
        j = nextind(s, j)  # step by character, not by byte
    end
    return j
end

# Returns the index of the `]` that closes a predicate whose content starts at
# `start`, ignoring any `]` inside a single- or double-quoted literal.
# Returns `nothing` when no closing bracket exists. If a quote is left open the
# scan falls back to the first `]` so such input is tokenized as it was before.
function _xpath_predicate_end(s::String, start::Int)
    n = ncodeunits(s)
    k = start
    open_quote = nothing
    while k <= n
        c = s[k]
        if open_quote === nothing
            c == ']' && return k
            (c == '\'' || c == '"') && (open_quote = c)
        elseif c == open_quote
            open_quote = nothing
        end
        k = nextind(s, k)
    end
    return findnext(']', s, start)
end

# Splits an XPath expression into `XPathToken`s.
#
# Whitespace between tokens is skipped. Raises an error (via `error`) for an
# unterminated predicate or function call, an empty attribute name after `@`,
# a function other than `text()` / `node()`, or any unexpected character.
#
# All positions are character indices advanced with `nextind` / `prevind`, so
# names, attribute names, predicates and whitespace may contain multi-byte
# UTF-8 characters.
function _xpath_tokenize(expr::AbstractString)
    tokens = XPathToken[]
    s = String(expr)
    i = 1
    n = ncodeunits(s)

    while i <= n
        c = s[i]

        if c == '/'
            # '/' is one byte, so `i + 1` is always a valid index here.
            if i < n && s[i+1] == '/'
                push!(tokens, XPathToken(XPATH_DESCENDANT, "//"))
                i += 2
            else
                push!(tokens, XPathToken(XPATH_ROOT, "/"))
                i += 1
            end

        elseif c == '.'
            if i < n && s[i+1] == '.'
                push!(tokens, XPathToken(XPATH_DOTDOT, ".."))
                i += 2
            else
                push!(tokens, XPathToken(XPATH_DOT, "."))
                i += 1
            end

        elseif c == '*'
            push!(tokens, XPathToken(XPATH_WILDCARD, "*"))
            i += 1

        elseif c == '['
            j = _xpath_predicate_end(s, i + 1)
            isnothing(j) && error("Unterminated predicate in XPath: $(repr(s))")
            # `prevind` (not `j - 1`): the last predicate character may be multi-byte.
            push!(tokens, XPathToken(XPATH_PREDICATE, String(SubString(s, i + 1, prevind(s, j)))))
            i = j + 1

        elseif c == '@'
            j = _xpath_name_end(s, i + 1, false)
            j == i + 1 && error("Empty attribute name after @ in XPath: $(repr(s))")
            push!(tokens, XPathToken(XPATH_ATTRIBUTE, String(SubString(s, i + 1, prevind(s, j)))))
            i = j

        elseif isletter(c) || c == '_'
            # The first character may itself be multi-byte, hence `nextind`.
            j = _xpath_name_end(s, nextind(s, i), true)
            name = SubString(s, i, prevind(s, j))
            # Check for function calls: text(), node()
            if j <= n && s[j] == '('
                j2 = findnext(')', s, j + 1)
                isnothing(j2) && error("Unterminated function call in XPath: $(repr(s))")
                if name == "text"
                    push!(tokens, XPathToken(XPATH_TEXT_FN, "text()"))
                elseif name == "node"
                    push!(tokens, XPathToken(XPATH_NODE_FN, "node()"))
                else
                    error("Unknown XPath function: $name()")
                end
                i = j2 + 1
            else
                push!(tokens, XPathToken(XPATH_NAME, String(name)))
                i = j
            end

        elseif isspace(c)
            # `isspace` accepts multi-byte whitespace (e.g. U+00A0).
            i = nextind(s, i)

        else
            error("Unexpected character '$(c)' in XPath: $(repr(s))")
        end
    end
    tokens
end
#-----------------------------------------------------------------------------# Predicate evaluation

# `[@attr]` and `[@attr='value']` / `[@attr="value"]` predicate bodies.
const _RE_ATTR_PRED = r"^@([A-Za-z_:][\w.\-:]*)$"
const _RE_ATTR_VAL_PRED = r"^@([A-Za-z_:][\w.\-:]*)\s*=\s*['\"]([^'\"]*)['\"]$"

# Filters `nodes` by the text found between `[` and `]`.
# Supported forms, tried in this order: an integer position (1-based, indexing
# the whole `nodes` vector), `last()`, `@attr`, and `@attr='value'`.
# Anything else raises an error. `root` is accepted but not used here.
function _eval_predicate(predicate::AbstractString, nodes::Vector{Node{S}}, root::Node{S}) where S
    text = strip(predicate)

    # Positional: [n]; an out-of-range position selects nothing
    index = tryparse(Int, text)
    if index !== nothing
        return (1 <= index <= length(nodes)) ? [nodes[index]] : Node{S}[]
    end

    # last()
    if text == "last()"
        return isempty(nodes) ? Node{S}[] : [nodes[end]]
    end

    # [@attr]: elements that carry the attribute
    has_attr = match(_RE_ATTR_PRED, text)
    if has_attr !== nothing
        wanted = has_attr.captures[1]
        return filter(n -> n.nodetype === Element && haskey(n, wanted), nodes)
    end

    # [@attr='value'] or [@attr="value"]: elements whose attribute equals the literal
    attr_eq = match(_RE_ATTR_VAL_PRED, text)
    if attr_eq !== nothing
        wanted, expected = attr_eq.captures[1], attr_eq.captures[2]
        return filter(n -> n.nodetype === Element && get(n, wanted, nothing) == expected, nodes)
    end

    error("Unsupported XPath predicate: [$predicate]")
end

#-----------------------------------------------------------------------------# Step evaluation

# Applies one location step to every node in `nodes` and returns the matches in
# visiting order. XPATH_DESCENDANT is rejected because the evaluator expands it
# itself; token kinds without a branch here yield an empty result.
function _xpath_step(nodes::Vector{Node{S}}, token::XPathToken, root::Node{S}) where S
    matched = Node{S}[]
    kind = token.kind

    # Handled by caller, which collects all descendants before the next step
    kind === XPATH_DESCENDANT &&
        error("XPATH_DESCENDANT should be handled by the evaluator, not _xpath_step")

    if kind === XPATH_DOT
        append!(matched, nodes)

    elseif kind === XPATH_DOTDOT
        for n in nodes
            if n !== root  # `root` itself contributes no parent
                up = _find_parent(n, root)
                up === nothing || push!(matched, up)
            end
        end

    elseif kind === XPATH_NODE_FN
        foreach(n -> append!(matched, children(n)), nodes)

    elseif kind === XPATH_NAME || kind === XPATH_WILDCARD || kind === XPATH_TEXT_FN
        # Child-selecting steps differ only in the test applied to each child.
        keep = kind === XPATH_NAME     ? (c -> c.nodetype === Element && c.tag == token.value) :
               kind === XPATH_WILDCARD ? (c -> c.nodetype === Element) :
                                         (c -> c.nodetype === Text)
        for n in nodes, c in children(n)
            keep(c) && push!(matched, c)
        end
    end

    matched
end

# Appends every descendant of `node` to `out` in document (pre-)order;
# `node` itself is not added.
function _descendants!(out::Vector{Node{S}}, node::Node{S}) where S
    for child in children(node)
        push!(out, child)           # the child first ...
        _descendants!(out, child)   # ... then everything beneath it
    end
    return nothing
end
# Descendant-or-self expansion used for `//`: for each input node, the node
# itself followed by all of its descendants in document order. Overlapping
# inputs are not de-duplicated.
function _descendants(nodes::Vector{Node{S}}) where S
    result = Node{S}[]
    for n in nodes
        push!(result, n)  # descendant-or-self includes self
        _descendants!(result, n)
    end
    result
end

#-----------------------------------------------------------------------------# Main evaluator

"""
    xpath(node::Node, expr::AbstractString) -> Vector{Node}

Evaluate an XPath expression against a `Node` tree and return matching nodes.
An expression that tokenizes to nothing returns an empty vector.

Supports a practical subset of XPath 1.0:
- Absolute (`/root/child`) and relative (`child/sub`) paths
- Recursive descent (`//tag`)
- Wildcards (`*`), self (`.`), parent (`..`)
- Positional predicates (`[1]`, `[last()]`)
- Attribute predicates (`[@attr]`, `[@attr='value']`)
- `text()` and `node()` functions
- Attribute selection (`@attr`) — returns `Text` nodes containing attribute values

Evaluation always starts from `node` itself: a leading `/` is consumed but does
not move the context elsewhere, so absolute paths behave as documented when
`node` is the `Document`. `..` never climbs above `node`.

# Examples
```julia
doc = parse("<root><a x='1'/><a x='2'/><b/></root>", Node)
xpath(doc, "/root/a")       # both <a> elements
xpath(doc, "/root/a[1]")    # first <a>
xpath(doc, "//a[@x='2']")   # <a x="2"/>
xpath(doc, "/root/b/@x")    # attribute value as Text node (empty here)
```
"""
function xpath(node::Node{S}, expr::AbstractString) where S
    tokens = _xpath_tokenize(expr)
    isempty(tokens) && return Node{S}[]

    # The query node bounds `..` navigation and is the initial context for both
    # absolute and relative expressions.
    # NOTE(review): when `node` is not a Document, "/tag" therefore matches
    # children of `node`, not `node` itself; confirm this is intended.
    root = node
    current = Node{S}[node]

    # A leading `/` only marks the path as absolute; skip it.
    i = tokens[1].kind === XPATH_ROOT ? 2 : 1

    while i <= length(tokens)
        tok = tokens[i]
        kind = tok.kind

        if kind === XPATH_PREDICATE
            current = _eval_predicate(tok.value, current, root)

        elseif kind === XPATH_DESCENDANT
            # Expand to descendant-or-self; the following step (if any) then
            # selects among their children. A trailing `//` is not rejected.
            current = _descendants(current)

        elseif kind === XPATH_ROOT
            # `/` between steps is only a separator

        elseif kind === XPATH_ATTRIBUTE
            # @attr in result position: wrap each present attribute value in a Text node
            values = Node{S}[]
            for n in current
                v = get(n, tok.value, nothing)
                !isnothing(v) && push!(values, Node{S}(Text, nothing, nothing, v, nothing))
            end
            current = values

        else
            current = _xpath_step(current, tok, root)
        end

        i += 1
    end

    current
end
"covers/b123.jpg" NDATA jpeg> +<!ENTITY cover2 SYSTEM "covers/b456.png" NDATA png> + +<!ELEMENT catalog (metadata?, (book | magazine)+)> +<!ATTLIST catalog + tier CDATA #FIXED "retail" + xml:lang CDATA #IMPLIED> + +<!ELEMENT metadata (publisher?, contact?)> +<!ELEMENT publisher %text;> +<!ELEMENT contact (email, phone?)> +<!ELEMENT email (#PCDATA)> +<!ELEMENT phone (#PCDATA)> + +<!ELEMENT book (title, subtitle?, authors, pubinfo, description?, section*, reviews?, related?)> +<!ATTLIST book + id ID #REQUIRED + isbn CDATA #IMPLIED + format (hardcover | paperback | ebook) "paperback" + inStock (yes | no) #REQUIRED + xml:space (default | preserve) "default"> + +<!ELEMENT magazine (title, issue, article+)> +<!ATTLIST magazine id ID #REQUIRED> +<!ELEMENT issue (#PCDATA)> + +<!ELEMENT title %text;> +<!ELEMENT subtitle %text;> + +<!ELEMENT authors (author+)> +<!ELEMENT author (name, affiliation?)> +<!ATTLIST author id ID #IMPLIED> +<!ELEMENT name %text;> +<!ELEMENT affiliation %text;> + +<!ELEMENT pubinfo (publisher, year, price?, pages?)> +<!ELEMENT year (#PCDATA)> +<!ELEMENT price (#PCDATA)> +<!ELEMENT pages (#PCDATA)> + +<!ELEMENT description (%block;)*> + +<!ELEMENT section (title, (%block;)*, section*)> +<!ATTLIST section id ID #IMPLIED> + +<!ELEMENT p %text;> +<!ELEMENT ul (li+)> +<!ELEMENT ol (li+)> +<!ELEMENT li %text;> + +<!ELEMENT figure (caption?, media)> +<!ATTLIST figure + entity ENTITY #IMPLIED <!-- refers to cover1/cover2 --> + notation NOTATION (jpeg | png) #IMPLIED> +<!ELEMENT caption %text;> +<!ELEMENT media EMPTY> +<!ATTLIST media + src CDATA #REQUIRED + alt CDATA #IMPLIED> + +<!ELEMENT table (thead?, tbody, tfoot?)> +<!ELEMENT thead (tr+)> +<!ELEMENT tbody (tr+)> +<!ELEMENT tfoot (tr+)> +<!ELEMENT tr (th | td)+> +<!ELEMENT th %text;> +<!ELEMENT td %text;> + +<!ELEMENT reviews (review+)> +<!ELEMENT review (rating, p+)> +<!ATTLIST review by IDREF #REQUIRED + date CDATA #IMPLIED> +<!ELEMENT rating EMPTY> +<!ATTLIST rating value (poor | avg | good | 
excellent) #REQUIRED> + +<!ELEMENT related (seealso*)> +<!ELEMENT seealso EMPTY> +<!ATTLIST seealso ref IDREF #REQUIRED> + +<!ELEMENT article (title, authorrefs, p+)> +<!ELEMENT authorrefs (authorref+)> +<!ELEMENT authorref EMPTY> +<!ATTLIST authorref ref IDREF #REQUIRED> + +<!-- Conditional section controlled by a parameter entity --> +<!ENTITY % longdocs "INCLUDE"> +<![%longdocs;[ + <!ELEMENT appendix (title, p+)> + <!ATTLIST appendix id ID #REQUIRED> +]]> +]> diff --git a/test/data/preserve.xml b/test/data/preserve.xml new file mode 100644 index 0000000..e77add1 --- /dev/null +++ b/test/data/preserve.xml @@ -0,0 +1,5 @@ +<?xml version="1.0" encoding="UTF-8"?> +<root xml:space="preserve"> + This node has preserved space + with <child xml:space="default"> default </child> children. +</root> diff --git a/test/runtests.jl b/test/runtests.jl index 89978eb..49632a0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,646 +1,3208 @@ using XML -using XML: Document, Element, Declaration, Comment, CData, DTD, ProcessingInstruction, Text, escape, unescape, OrderedDict, h -using Downloads: download +using XML: Document, Element, Declaration, Comment, CData, DTD, ProcessingInstruction, Text +using XML: escape, unescape, h, parse_dtd +using XML: ParsedDTD, ElementDecl, AttDecl, EntityDecl, NotationDecl using Test -import AbstractTrees - -AbstractTrees.children(x::Node) = children(x) - -#-----------------------------------------------------------------------------# files -xml_xsd = joinpath("data", "xml.xsd") -kml_xsd = joinpath("data", "kml.xsd") -books_xml = joinpath("data", "books.xml") -example_kml = joinpath("data", "example.kml") -simple_dtd = joinpath("data", "simple_dtd.xml") - -all_files = [xml_xsd, kml_xsd, books_xml, example_kml, simple_dtd] - -#-----------------------------------------------------------------------------# h -@testset "h function" begin - @test h.tag == XML.Element("tag") - @test h.tag(id="id") == XML.Element("tag"; id="id") - @test h.tag(1, 
2, a="a", b="b") == XML.Element("tag", 1, 2; a="a", b="b") -end - -#-----------------------------------------------------------------------------# escaping/unescaping -@testset "escaping/unescaping" begin - s = "This > string < has & some \" special ' characters" - @test escape(s) == "This > string < has & some " special ' characters" - @test escape(escape(s)) == escape(s) - @test s == unescape(escape(s)) - @test s == unescape(unescape(escape(s))) - - n = Element("tag", Text(s)) - @test XML.simple_value(n) == s - - XML.escape!(n) - @test XML.simple_value(n) == escape(s) - - XML.unescape!(n) - @test XML.simple_value(n) == s -end - -#-----------------------------------------------------------------------------# DTD -# @testset "DTDBody and friends" begin -# s = read(simple_dtd, String) -# data = read(simple_dtd) - -# dtd = XML.DTDBody(data) -# dtd2 = parse(s, XML.DTDBody) - -# @test length(dtd.elements) == length(dtd2.elements) == 0 -# @test length(dtd.attributes) == length(dtd2.attributes) == 0 -# @test length(dtd.entities) == length(dtd2.entities) == 3 - -# o = read("data/tv.dtd", XML.DTDBody) -# end - -#-----------------------------------------------------------------------------# Raw -@testset "Raw tag/attributes/value" begin - examples = [ - (xml = "<!DOCTYPE html>", - nodetype = DTD, - tag=nothing, - attributes=nothing, - value="html"), - (xml = "<?xml version=\"1.0\" key=\"value\"?>", - nodetype = Declaration, - tag=nothing, - attributes=Dict("version" => "1.0", "key" => "value"), - value=nothing), - (xml = "<tag _id=\"1\", x=\"abc\" />", - nodetype = Element, - tag="tag", - attributes=Dict("_id" => "1", "x" => "abc"), - value=nothing), - (xml = "<!-- comment -->", - nodetype = Comment, - tag=nothing, - attributes=nothing, - value=" comment "), - (xml = "<![CData[cdata test]]>", - nodetype = CData, - tag=nothing, - attributes=nothing, - value="cdata test"), - ] - for x in examples - # @info "Testing: $(x.xml)" - data = XML.next(XML.parse(x.xml, XML.Raw)) - 
@test XML.nodetype(data) == x.nodetype - @test XML.tag(data) == x.tag - @test XML.attributes(data) == x.attributes - @test XML.value(data) == x.value - end -end - -@testset "Raw with books.xml" begin - data = read(books_xml, XML.Raw) - doc = collect(data) - @test length(doc) > countlines(books_xml) - # Check that the first 5 lines are correct - first_5_lines = [ - XML.RawDeclaration => """<?xml version="1.0"?>""", - XML.RawElementOpen => "<catalog>", - XML.RawElementOpen => "<book id=\"bk101\">", - XML.RawElementOpen => "<author>", - XML.RawText => "Gambardella, Matthew" - ] - for (i, (typ, str)) in enumerate(first_5_lines) - dt = doc[i] - @test dt.type == typ - @test String(dt) == str - end - # Check that the last line is correct - @test doc[end].type == XML.RawElementClose - @test String(doc[end]) == "</catalog>" - - @testset "next and prev" begin - @test XML.prev(doc[1]) == data # can't use === here because prev returns a copy of ctx - @test prev(data) === nothing - @test XML.next(doc[end]) === nothing - - n = length(doc) - next_res = [doc[1]] - foreach(_ -> push!(next_res, XML.next(next_res[end])), 1:n-1) - - prev_res = [doc[end]] - foreach(_ -> pushfirst!(prev_res, XML.prev(prev_res[1])), 1:n-1) - - idx = findall(next_res .!= prev_res) - - for (a,b) in zip(next_res, prev_res) - @test a == b - end - - lzxml = """<root><text> </text><text2> hello </text2><text3 xml:space="preserve"> hello <text3b> preserve </text3b></text3><text4 xml:space="preserve"></text4><text5/></root>""" - lz = XML.parse(XML.LazyNode, lzxml) - n=XML.next(lz) - n=XML.next(n) - text_content = XML.write(n) - @test text_content == "<text/>" - n=XML.next(n) - text_content = XML.write(n) - @test text_content == "<text2>hello</text2>" - n=XML.next(n) - text_content = XML.write(n) - @test text_content == "hello" - n=XML.next(n) - text_content = XML.write(n) - @test text_content == "<text3 xml:space=\"preserve\"> hello <text3b> preserve </text3b></text3>" - n=XML.prev(n) - text_content = 
XML.write(n) - @test text_content == "hello" - n=XML.next(n) - text_content = XML.write(n) - @test text_content == "<text3 xml:space=\"preserve\"> hello <text3b> preserve </text3b></text3>" - n=XML.next(n) - text_content = XML.write(n) - @test text_content == " hello " - n=XML.next(n) - text_content = XML.write(n) - @test text_content == "<text3b> preserve </text3b>" - n=XML.next(n) - text_content = XML.write(n) - @test text_content == " preserve " - n=XML.next(n) - text_content = XML.write(n) - @test text_content == "<text4 xml:space=\"preserve\"/>" - n=XML.next(n) - text_content = XML.write(n) - @test text_content == "<text5/>" - n=XML.prev(n) - text_content = XML.write(n) - @test text_content == "<text4 xml:space=\"preserve\"/>" - n=XML.prev(n) - text_content = XML.write(n) - @test text_content == " preserve " - n=XML.prev(n) - text_content = XML.write(n) - @test text_content == "<text3b> preserve </text3b>" - n=XML.prev(n) - text_content = XML.write(n) - @test text_content == " hello " - n=XML.prev(n) - text_content = XML.write(n) - @test text_content == "<text3 xml:space=\"preserve\"> hello <text3b> preserve </text3b></text3>" - n=XML.next(n) - text_content = XML.write(n) - @test text_content == " hello " - n=XML.prev(n) - text_content = XML.write(n) - @test text_content == "<text3 xml:space=\"preserve\"> hello <text3b> preserve </text3b></text3>" - n=XML.prev(n) - text_content = XML.write(n) - @test text_content == "hello" - n=XML.prev(n) - text_content = XML.write(n) - @test text_content == "<text2>hello</text2>" - n=XML.prev(n) - text_content = XML.write(n) - @test text_content == "<text/>" - n=XML.prev(n) - text_content = XML.write(n) - @test text_content == "<root>\n <text/>\n <text2>hello</text2>\n <text3 xml:space=\"preserve\"> hello <text3b> preserve </text3b></text3>\n <text4 xml:space=\"preserve\"/>\n <text5/>\n</root>" - end - - @testset "depth and parent" begin - @test XML.depth(data) == 0 - @test isnothing(XML.parent(data)) - @test 
XML.depth(doc[1]) == 1 - @test XML.parent(doc[1]) == data - @test XML.depth(doc[2]) == 1 - @test XML.depth(doc[3]) == 2 - @test XML.parent(doc[3]) == doc[2] - @test XML.depth(doc[end]) == 1 - @test XML.parent(doc[end]) == data - end - - @testset "tag/attributes/value" begin - x = doc[1] # <?xml version="1.0"?> - @test XML.tag(x) === nothing - @test XML.attributes(x) == Dict("version" => "1.0") - @test XML.value(x) === nothing - - x = XML.next(x) # <catalog> - @test XML.tag(x) == "catalog" - @test XML.attributes(x) === nothing - @test XML.value(x) === nothing - - x = XML.next(x) # <book id="bk101"> - @test XML.tag(x) == "book" - @test XML.attributes(x) == Dict("id" => "bk101") - @test XML.value(x) === nothing - - x = XML.next(x) # <author> - @test XML.tag(x) == "author" - @test XML.attributes(x) === nothing - @test XML.value(x) === nothing - - x = XML.next(x) # Gambardella, Matthew - @test XML.tag(x) === nothing - @test XML.attributes(x) === nothing - @test XML.value(x) == "Gambardella, Matthew" - end -end - -#-----------------------------------------------------------------------------# Preserve whitespace -@testset "xml:space" begin - @testset "Basic xml:space functionality" begin - - # Test 1: xml:space="preserve" should preserve entirely empty whitespace - xml1 = """<root><text xml:space="preserve"> </text></root>""" - doc1 = parse(XML.Node, xml1) - text_content = XML.value(doc1[1][1][1]) - @test text_content == " " - - # Test 2: xml:space="preserve" should preserve leading and trailing whitespace - xml2 = """<root><text xml:space="preserve"> leading and trailing spaces </text></root>""" - doc2 = parse(XML.Node, xml2) - text_content = XML.value(doc2[1][1][1]) - @test text_content == " leading and trailing spaces " - - # Test 3: Entirely empty tags with and without xml:space="preserve" become self-closing - xml3 = """<root><text> </text><text2 xml:space="preserve"> </text2><text3 xml:space="preserve"></text3><text4/></root>""" - doc3 = XML.parse(XML.Node, xml3) - 
text_content = XML.write(doc3[1][1]) - @test text_content == "<text/>" # without xml:space="preserve", empty text becomes self-closing - text_content = XML.value(doc3[1][2][1]) - @test text_content == " " # with xml:space, whitespace is preserved - text_content = XML.write(doc3[1][3]) - @test text_content == "<text3 xml:space=\"preserve\"/>" # with xml:space="preserve", empty text becomes self-closing - - # Test 4: Without xml:space, whitespace should be normalized - xml4 = """<root><text> gets normalized </text></root>""" - doc4 = XML.parse(XML.Node, xml4) - text_content = XML.value(doc4[1][1][1]) - @test text_content == "gets normalized" - - # Test 5: xml:space="default" should normalize even with preserve_xml_space=true - xml5 = """<root><text xml:space="default"> gets normalized </text></root>""" - doc5 = XML.parse(XML.Node, xml5) - text_content = XML.value(doc5[1][1][1]) - @test text_content == "gets normalized" - end - - @testset "xml:space inheritance" begin - # Test 6: Children inherit parent's xml:space="preserve" - xml6 = """<root xml:space="preserve"> - <parent> parent text - <child> child text </child> - </parent> - </root>""" - doc6 = XML.parse(XML.Node, xml6) - # Both parent and child should preserve whitespace - @test contains(XML.value(doc6[1][2][1]), "parent text \n") - @test XML.value(doc6[1][2][2][1]) == " child text " - - # Test 7: xml:space="default" overrides parent's "preserve" - xml7 = """<root xml:space="preserve"> - <child xml:space="default"> normalized despite parent </child> - </root>""" - doc7 = XML.parse(XML.Node, xml7) - @test XML.value(doc7[1][2][1]) == "normalized despite parent" - end - - @testset "Nesting scenarios" begin - # Test 8: Multiple levels of xml:space changes - xml8 = """<root xml:space="preserve"> - <level1> preserved - <level2 xml:space="default"> normalized - <level3 xml:space="preserve"> preserved again </level3> - </level2> - </level1> - </root>""" - doc8 = XML.parse(XML.Node, xml8) - - # level1 should preserve 
(inherits from root) - level1_text = XML.value(doc8[1][2][1]) - @test level1_text == " preserved \n " - - # level2 should normalize (explicit xml:space="default") - level2_text = XML.value(doc8[1][2][2][1]) - @test level2_text == "normalized" - - # level3 should preserve (explicit xml:space="preserve") - level3_text = XML.value(doc8[1][2][2][2][1]) - @test level3_text == " preserved again " - - # Test 9: repeated multiple levels of xml:space changes - xml9 = """<root xml:space="preserve"> - <level1> preserved - <level2 xml:space="default"> normalized - <level3 xml:space="preserve"> preserved again </level3> - </level2> - </level1> - <level1b> preserved b - <level2b xml:space="default"> normalized b - <level3b xml:space="preserve"> preserved again b </level3b> - </level2b> - </level1b> - </root>""" - doc9 = XML.parse(XML.Node, xml9) - - # level1b should preserve (inherits from root) - level1b_text = XML.value(doc9[1][4][1]) - @test level1b_text == " preserved b \n " - - # level2 should normalize (explicit xml:space="default") - level2b_text = XML.value(doc9[1][4][2][1]) - @test level2b_text == "normalized b" - - # level3 should preserve (explicit xml:space="preserve") - level3b_text = XML.value(doc9[1][4][2][2][1]) - @test level3b_text == " preserved again b " - - # Test 10: futher repeated multiple levels of xml:space changes - xml10 = """<root> - <level1> normalized - <level2> normalized b - <level3 xml:space="preserve"> preserved </level3> - </level2> - </level1> - <level1b> normalized c - <level2b xml:space="preserve"> preserved b - <level3b xml:space="default"> normalized again b </level3b> - <level3c> preserved c - </level3c> - </level2b> - </level1b> - <level1c> normalized d </level1c> - </root>""" - doc10 = XML.parse(XML.Node, xml10) - - # level1 should normalize (as root) - level1_text = XML.value(doc10[end][1][1]) - @test level1_text == "normalized" - - # level2 should normalize (as root and level1) - level2_text = XML.value(doc10[end][1][2][1]) - @test 
level2_text == "normalized b" - - # level3 should preserve (explicit xml:space="preserve") - level3_text = XML.value(doc10[end][1][2][2][1]) - @test level3_text == " preserved " - - # level1b should normalize (as root) - level1b_text = XML.value(doc10[end][2][1]) - @test level1b_text == "normalized c" - - # level2b should preserve (explicit xml:space="preserve") - level2b_text = XML.value(doc10[end][2][2][1]) - @test level2b_text == " preserved b \n " - - # level3 should normalize (explicit xml:space="default") - level3b_text = XML.value(doc10[end][2][2][2][1]) - @test level3b_text == "normalized again b" - - # level3c should preserve (inherited from level2b) - level3c_text = XML.value(doc10[end][2][2][4][1]) - @test level3c_text == " preserved c \n " - - # level1c should normalize (as root) - level1c_text = XML.value(doc10[end][3][1]) - @test level1c_text == "normalized d" - end - @testset "inter-element gap semantics" begin - # Default parent: gap between siblings should be dropped - s1 = """<root><a> x </a> - <b> y </b></root>""" - d1 = XML.parse(XML.Node, s1) - @test length(d1[1]) == 2 - @test XML.value(d1[1][1][1]) == "x" - @test XML.value(d1[1][2][1]) == "y" - - # Preserve parent, default child ends: gap after default child dropped - s2 = """<root xml:space="preserve"> - <p> keep </p> - <q xml:space="default"> norm </q> - <r> after default gap </r> - </root>""" - d2 = XML.parse(XML.Node, s2) - @test length(d2[1]) == 7 - @test XML.value(d2[1][1]) == "\n " - @test XML.value(d2[1][2][1]) == " keep " - @test XML.value(d2[1][3]) == "\n " - @test XML.value(d2[1][4][1]) == "norm" - @test XML.value(d2[1][5]) == "\n " - @test XML.value(d2[1][6][1]) == " after default gap " - @test XML.value(d2[1][7]) == "\n" - end - @testset "XML whitespace vs Unicode whitespace" begin + +#==============================================================================# +# ESCAPE / UNESCAPE # +#==============================================================================# +@testset 
"escape / unescape" begin + @testset "all five predefined entities" begin + @test escape("&") == "&" + @test escape("<") == "<" + @test escape(">") == ">" + @test escape("'") == "'" + @test escape("\"") == """ + end + + @testset "unescape reverses escape" begin + @test unescape("&") == "&" + @test unescape("<") == "<" + @test unescape(">") == ">" + @test unescape("'") == "'" + @test unescape(""") == "\"" + end + + @testset "roundtrip on mixed strings" begin + s = "This > string < has & some \" special ' characters" + @test unescape(escape(s)) == s + end + + @testset "idempotent unescape" begin + s = "plain text with no entities" + @test unescape(s) == s + end + + @testset "multiple entities in one string" begin + @test escape("a < b & c > d") == "a < b & c > d" + @test unescape("a < b & c > d") == "a < b & c > d" + end + + @testset "empty string" begin + @test escape("") == "" + @test unescape("") == "" + end +end + +#==============================================================================# +# XML 1.0 SPEC SECTION 2.1: Well-Formed XML Documents # +#==============================================================================# +@testset "Spec 2.1: Well-Formed XML Documents" begin + # The spec's simplest example: + # <?xml version="1.0"?> + # <greeting>Hello, world!</greeting> + xml = """<?xml version="1.0"?><greeting>Hello, world!</greeting>""" + doc = parse(xml, Node) + @test nodetype(doc) == Document + @test length(doc) == 2 # Declaration + Element + @test nodetype(doc[1]) == Declaration + @test nodetype(doc[2]) == Element + @test tag(doc[2]) == "greeting" + @test simple_value(doc[2]) == "Hello, world!" 
+end + +#==============================================================================# +# XML 1.0 SPEC SECTION 2.4: Character Data and Markup # +#==============================================================================# +@testset "Spec 2.4: Character Data and Markup" begin + @testset "text content between tags" begin + doc = parse("<root>Hello</root>", Node) + @test simple_value(doc[1]) == "Hello" + end + + @testset "entity references in text are unescaped" begin + doc = parse("<root>& < > ' "</root>", Node) + @test simple_value(doc[1]) == "& < > ' \"" + end + + @testset "mixed text and child elements" begin + doc = parse("<p>Hello <b>world</b>!</p>", Node) + root = doc[1] + @test length(root) == 3 + @test nodetype(root[1]) == Text + @test value(root[1]) == "Hello " + @test nodetype(root[2]) == Element + @test tag(root[2]) == "b" + @test simple_value(root[2]) == "world" + @test nodetype(root[3]) == Text + @test value(root[3]) == "!" + end + + @testset "empty element has no text" begin + doc = parse("<empty/>", Node) + @test length(children(doc[1])) == 0 + end +end + +#==============================================================================# +# XML 1.0 SPEC SECTION 2.5: Comments # +#==============================================================================# +@testset "Spec 2.5: Comments" begin + @testset "basic comment (spec example)" begin + # Spec example: <!-- declarations for <head> & <body> --> + doc = parse("<root><!-- declarations for <head> & <body> --></root>", Node) + c = doc[1][1] + @test nodetype(c) == Comment + @test value(c) == " declarations for <head> & <body> " + end + + @testset "empty comment" begin + doc = parse("<root><!----></root>", Node) + c = doc[1][1] + @test nodetype(c) == Comment + @test value(c) == "" + end + + @testset "comment before root element" begin + doc = parse("<!-- before --><root/>", Node) + @test nodetype(doc[1]) == Comment + @test value(doc[1]) == " before " + @test nodetype(doc[2]) == Element + end + + 
@testset "comment after root element" begin + doc = parse("<root/><!-- after -->", Node) + @test nodetype(doc[1]) == Element + @test nodetype(doc[2]) == Comment + end + + @testset "comment with markup-like content preserved verbatim" begin + doc = parse("<root><!-- <b>not</b> a tag --></root>", Node) + @test value(doc[1][1]) == " <b>not</b> a tag " + end + + @testset "multiple comments" begin + doc = parse("<root><!-- A --><!-- B --></root>", Node) + @test length(doc[1]) == 2 + @test value(doc[1][1]) == " A " + @test value(doc[1][2]) == " B " + end +end + +#==============================================================================# +# XML 1.0 SPEC SECTION 2.6: Processing Instructions # +#==============================================================================# +@testset "Spec 2.6: Processing Instructions" begin + @testset "xml-stylesheet PI (spec example)" begin + doc = parse("""<?xml-stylesheet type="text/xsl" href="style.xsl"?><root/>""", Node) + pi = doc[1] + @test nodetype(pi) == ProcessingInstruction + @test tag(pi) == "xml-stylesheet" + @test contains(value(pi), "type=\"text/xsl\"") + end + + @testset "PI with no content" begin + doc = parse("<?target?><root/>", Node) + pi = doc[1] + @test nodetype(pi) == ProcessingInstruction + @test tag(pi) == "target" + @test value(pi) === nothing + end + + @testset "PI inside element" begin + doc = parse("<root><?mypi some data?></root>", Node) + pi = doc[1][1] + @test nodetype(pi) == ProcessingInstruction + @test tag(pi) == "mypi" + @test value(pi) == "some data" + end + + @testset "PI after root element" begin + doc = parse("<root/><?post-process?>", Node) + @test nodetype(doc[2]) == ProcessingInstruction + @test tag(doc[2]) == "post-process" + end +end + +#==============================================================================# +# XML 1.0 SPEC SECTION 2.7: CDATA Sections # +#==============================================================================# +@testset "Spec 2.7: CDATA Sections" begin + 
@testset "CDATA preserves markup characters" begin + # Spec example + doc = parse("<root><![CDATA[<greeting>Hello, world!</greeting>]]></root>", Node) + cd = doc[1][1] + @test nodetype(cd) == CData + @test value(cd) == "<greeting>Hello, world!</greeting>" + end + + @testset "empty CDATA" begin + doc = parse("<root><![CDATA[]]></root>", Node) + cd = doc[1][1] + @test nodetype(cd) == CData + @test value(cd) == "" + end + + @testset "CDATA with ampersands and less-thans" begin + doc = parse("<root><![CDATA[a < b && c > d]]></root>", Node) + @test value(doc[1][1]) == "a < b && c > d" + end + + @testset "CDATA with special characters" begin + doc = parse("<root><![CDATA[line1\nline2\ttab]]></root>", Node) + @test value(doc[1][1]) == "line1\nline2\ttab" + end + + @testset "CDATA mixed with text" begin + doc = parse("<root>before<![CDATA[inside]]>after</root>", Node) + @test length(doc[1]) == 3 + @test nodetype(doc[1][1]) == Text + @test value(doc[1][1]) == "before" + @test nodetype(doc[1][2]) == CData + @test value(doc[1][2]) == "inside" + @test nodetype(doc[1][3]) == Text + @test value(doc[1][3]) == "after" + end +end + +#==============================================================================# +# XML 1.0 SPEC SECTION 2.8: Prolog and Document Type Declaration # +#==============================================================================# +@testset "Spec 2.8: Prolog and Document Type Declaration" begin + @testset "XML declaration - version only" begin + doc = parse("""<?xml version="1.0"?><root/>""", Node) + decl = doc[1] + @test nodetype(decl) == Declaration + @test decl["version"] == "1.0" + end + + @testset "XML declaration - version and encoding" begin + doc = parse("""<?xml version="1.0" encoding="UTF-8"?><root/>""", Node) + decl = doc[1] + @test decl["version"] == "1.0" + @test decl["encoding"] == "UTF-8" + end + + @testset "XML declaration - all three pseudo-attributes" begin + doc = parse("""<?xml version="1.0" encoding="UTF-8" 
standalone="yes"?><root/>""", Node) + decl = doc[1] + @test decl["version"] == "1.0" + @test decl["encoding"] == "UTF-8" + @test decl["standalone"] == "yes" + end + + @testset "XML declaration with single quotes" begin + doc = parse("<?xml version='1.0'?><root/>", Node) + @test doc[1]["version"] == "1.0" + end + + @testset "no XML declaration" begin + doc = parse("<root/>", Node) + @test length(doc) == 1 + @test nodetype(doc[1]) == Element + end + + @testset "DOCTYPE - SYSTEM" begin + # Spec example + doc = parse("""<!DOCTYPE greeting SYSTEM "hello.dtd"><greeting/>""", Node) + dtd = doc[1] + @test nodetype(dtd) == DTD + @test contains(value(dtd), "greeting") + @test contains(value(dtd), "SYSTEM") + @test contains(value(dtd), "hello.dtd") + end + + @testset "DOCTYPE - with internal subset" begin + xml = """<!DOCTYPE greeting [ + <!ELEMENT greeting (#PCDATA)> +]><greeting>Hello, world!</greeting>""" + doc = parse(xml, Node) + dtd = doc[1] + @test nodetype(dtd) == DTD + @test contains(value(dtd), "greeting") + @test contains(value(dtd), "<!ELEMENT") + end + + @testset "DOCTYPE with entities (spec-like)" begin + xml = """<!DOCTYPE note [ +<!ENTITY nbsp " "> +<!ENTITY writer "Writer: Donald Duck."> +<!ENTITY copyright "Copyright: W3Schools."> +]><note/>""" + doc = parse(xml, Node) + @test nodetype(doc[1]) == DTD + @test contains(value(doc[1]), "ENTITY") + end + + @testset "full prolog: declaration + DOCTYPE" begin + xml = """<?xml version="1.0"?><!DOCTYPE root SYSTEM "root.dtd"><root/>""" + doc = parse(xml, Node) + @test nodetype(doc[1]) == Declaration + @test nodetype(doc[2]) == DTD + @test nodetype(doc[3]) == Element + end +end + +#==============================================================================# +# XML 1.0 SPEC SECTION 2.9: Standalone Document Declaration # +#==============================================================================# +@testset "Spec 2.9: Standalone Document Declaration" begin + doc = parse("""<?xml version="1.0" 
standalone="yes"?><root/>""", Node) + @test doc[1]["standalone"] == "yes" + + doc2 = parse("""<?xml version="1.0" standalone="no"?><root/>""", Node) + @test doc2[1]["standalone"] == "no" +end + +#==============================================================================# +# XML 1.0 SPEC SECTION 2.10: White Space Handling # +#==============================================================================# +@testset "Spec 2.10: White Space Handling" begin + @testset "parser preserves all text content verbatim" begin + doc = parse("<root> hello </root>", Node) + @test simple_value(doc[1]) == " hello " + end + + @testset "parser preserves whitespace-only text" begin + doc = parse("<root> </root>", Node) + @test simple_value(doc[1]) == " " + end + + @testset "parser preserves inter-element whitespace as Text nodes" begin + xml = "<root><a>x</a>\n <b>y</b></root>" + doc = parse(xml, Node) + @test length(doc[1]) == 3 + @test value(doc[1][1][1]) == "x" + @test nodetype(doc[1][2]) == Text + @test value(doc[1][2]) == "\n " + @test value(doc[1][3][1]) == "y" + end + + @testset "xml:space attribute is preserved during parsing" begin + doc = parse("""<root xml:space="preserve"><child> text </child></root>""", Node) + @test doc[1]["xml:space"] == "preserve" + @test value(doc[1][1][1]) == " text " + end + + @testset "xml:space='preserve' affects write formatting" begin + # When xml:space="preserve", writer doesn't add indentation + el = Element("s", XML.Text(" pre "), Element("t"), XML.Text(" post "); var"xml:space"="preserve") + @test XML.write(el) == "<s xml:space=\"preserve\"> pre <t/> post </s>" + end + + @testset "write formats with indentation by default" begin + el = Element("root", Element("a"), Element("b")) + s = XML.write(el) + @test contains(s, " <a/>") # indented + @test contains(s, " <b/>") # indented + end + + @testset "Unicode non-breaking space is NOT XML whitespace" begin nbsp = "\u00A0" - s = """<root> - <a> x\t\n </a> - <b>$(nbsp) y $(nbsp)</b> - <c 
xml:space="default">$(nbsp) z $(nbsp)</c> - </root>""" - d = XML.parse(XML.Node, s) - @test XML.value(d[1][1][1]) == "x" - @test XML.value(d[1][2][1]) == "$(nbsp) y $(nbsp)" - @test XML.value(d[1][3][1]) == "$(nbsp) z $(nbsp)" - end - - @testset "CDATA/Comment/PI boundaries" begin - s = """<root> - <a xml:space="default"> pre <![CDATA[ mid ]]> post </a> - <b xml:space="preserve"> pre <!-- cmt --> post </b> - <?xml-stylesheet type="text/css" href="style.css"?> - </root>""" - d = XML.parse(XML.Node, s) - @test XML.value(d[1][1][1]) == "pre" - @test nodetype(d[1][1][2]) == XML.CData - @test XML.value(d[1][1][3]) == "post" - @test XML.value(d[1][2][1]) == " pre " - @test nodetype(d[1][2][2]) == XML.Comment - @test XML.value(d[1][2][3]) == " post " - @test nodetype(d[1][3]) == XML.ProcessingInstruction - end - - @testset "nested toggles and sibling sequences" begin - s = """<root xml:space="preserve"> - <x> a - <y xml:space="default"> b - <z xml:space="preserve"> c </z> - </y> - <y2 xml:space="default"> d </y2> - <w> e </w> - </x> - </root>""" - d = XML.parse(XML.Node, s) - @test XML.value(d[1][2][1]) == " a \n " - @test XML.value(d[1][2][2][1]) == "b" - @test XML.value(d[1][2][2][2][1]) == " c " - @test d[1][2][4].tag == "y2" - @test XML.value(d[1][2][4][1]) == "d" - @test d[1][2][6].tag == "w" - @test XML.value(d[1][2][6][1]) == " e " - end - - @testset "root/document boundaries" begin - s = "\n \n<root> a </root>\n \t " - d = XML.parse(XML.Node, s) - @test length(d) == 1 - @test XML.value(d[1][1]) == "a" - end - - @testset "entities expanding to whitespace" begin - chr1="\u0020" - chr2="\u000A" - chr3="\u00A0" - - s = """<root> - <a> $(chr1) a $(chr2) </a> - <b xml:space="preserve">$(chr1) b $(chr2)</b> - <c>$(chr3)c$(chr3)</c> - </root>""" - d = XML.parse(XML.Node, s) - @test XML.value(d[1][1][1]) == "a" - @test XML.value(d[1][2][1]) == " b \n" - @test XML.value(d[1][3][1]) == "$(chr3)c$(chr3)" - end - - @testset "invalid values and placement" begin - s_bad = 
"""<root><x xml:space="weird"> t </x></root>""" - @test_throws ErrorException XML.parse(XML.Node, s_bad) - - s_pi = """<?pi xml:space="preserve"?><root> t </root>""" - d = XML.parse(XML.Node, s_pi) - @test XML.value(d[end][1]) == "t" - - s_dup = """<root><x xml:space="preserve" xml:space="default"> t </x></root>""" -# @test_throws ErrorException XML.parse(XML.Node, s_dup) - end - - @testset "prev()/next() symmetry" begin - xml = """<root xml:space="preserve"> - <a> a <b xml:space="default"> b </b> <c> c </c> </a> - <d xml:space="default"> d <e xml:space="preserve"> e </e> f </d> - <g><h/><i xml:space="preserve"> i </i><j/></g> - </root>""" - r = XML.parse(XML.LazyNode, xml).raw - toks=XML.Raw[] - while true - n = XML.next(r) - n === nothing && break - push!(toks, n) - r=n - end - back = XML.Raw[] - r = toks[end] - while true - p = XML.prev(r) - p === nothing && break - push!(back, p) - r = p - end - @test reverse(back)[2:end] == toks[1:end-1] - end - - @testset "write/read roundtrip extremes" begin - xml = """<root> - <p xml:space="preserve"> </p> - <q> </q> - <r xml:space="default"> r </r> - <s xml:space="preserve"> pre <t/> post </s> - </root>""" - n = XML.parse(XML.Node, xml) - io = IOBuffer(); XML.write(io, n) - n2 = XML.parse(XML.Node, String(take!(io))) - @test n == n2 - @test XML.write(n2[1][1]) == "<p xml:space=\"preserve\"> </p>" - @test XML.write(n2[1][2]) == "<q/>" - @test XML.value(n2[1][3][1]) == "r" - @test XML.write(n2[1][4]) == "<s xml:space=\"preserve\"> pre <t/> post </s>" - end - - @testset "self-closing/empty/whitespace-only children" begin - s = """<root> - <a xml:space="default"> </a> - <b xml:space="preserve"></b> - <c xml:space="preserve"> </c> - <d><e/></d> - <f> x <g/> y </f> - </root>""" - d = XML.parse(XML.Node, s) - @test XML.write(d[1][1]) == "<a xml:space=\"default\"/>" - @test XML.write(d[1][2]) == "<b xml:space=\"preserve\"/>" - @test XML.value(d[1][3][1]) == " " - @test XML.value(d[1][5][1]) == "x" - @test XML.value(d[1][5][3]) == 
"y" - end - - @testset "allocation guard: small xml:space doc" begin - xml = "<root><a xml:space=\"default\"> x </a><b xml:space=\"preserve\"> y </b></root>" - f() = XML.parse(XML.Node, xml) - a = @allocated f() - @test a < 500_000 # tune for CI - end - -end - -#-----------------------------------------------------------------------------# roundtrip -@testset "read/write/read roundtrip" begin - for path in all_files - node = read(path, Node) - temp = tempname() * ".xml" - XML.write(temp, node) - node2 = read(temp, Node) - @test node == node2 + xml = "<root>$(nbsp) y $(nbsp)</root>" + doc = parse(xml, Node) + @test simple_value(doc[1]) == "$(nbsp) y $(nbsp)" + end +end + +#==============================================================================# +# XML 1.0 SPEC SECTION 3.1: Start-Tags, End-Tags, Empty-Element Tags # +#==============================================================================# +@testset "Spec 3.1: Start-Tags, End-Tags, Empty-Element Tags" begin + @testset "element with attributes (spec example)" begin + # <termdef id="dt-dog" term="dog"> + doc = parse("""<termdef id="dt-dog" term="dog">A dog.</termdef>""", Node) + el = doc[1] + @test tag(el) == "termdef" + @test el["id"] == "dt-dog" + @test el["term"] == "dog" + @test value(el[1]) == "A dog." 
+ end + + @testset "self-closing tag (spec example)" begin + # <IMG align="left" src="http://www.w3.org/Icons/WWW/w3c_home"/> + doc = parse("""<IMG align="left" src="http://www.w3.org/Icons/WWW/w3c_home"/>""", Node) + el = doc[1] + @test tag(el) == "IMG" + @test el["align"] == "left" + @test el["src"] == "http://www.w3.org/Icons/WWW/w3c_home" + @test length(children(el)) == 0 + end + + @testset "simple self-closing tag" begin + doc = parse("<br/>", Node) + @test tag(doc[1]) == "br" + @test length(children(doc[1])) == 0 + end + + @testset "self-closing tag with space before />" begin + doc = parse("<br />", Node) + @test tag(doc[1]) == "br" + end + + @testset "empty element with start and end tag" begin + doc = parse("<empty></empty>", Node) + el = doc[1] + @test tag(el) == "empty" + @test isnothing(el.children) + end + + @testset "nested elements" begin + doc = parse("<a><b><c/></b></a>", Node) + @test tag(doc[1]) == "a" + @test tag(doc[1][1]) == "b" + @test tag(doc[1][1][1]) == "c" + end + + @testset "sibling elements" begin + doc = parse("<root><a/><b/><c/></root>", Node) + @test length(doc[1]) == 3 + @test tag(doc[1][1]) == "a" + @test tag(doc[1][2]) == "b" + @test tag(doc[1][3]) == "c" + end + + @testset "attributes with single quotes" begin + doc = parse("<x a='val'/>", Node) + @test doc[1]["a"] == "val" + end + + @testset "attributes with double quotes" begin + doc = parse("""<x a="val"/>""", Node) + @test doc[1]["a"] == "val" + end + + @testset "mixed quote styles in attributes" begin + doc = parse("""<x a="1" b='2'/>""", Node) + @test doc[1]["a"] == "1" + @test doc[1]["b"] == "2" + end + + @testset "attribute with > in value" begin + doc = parse("""<x a="1>2"/>""", Node) + @test doc[1]["a"] == "1>2" + end + + @testset "attribute with entity reference" begin + doc = parse("""<x a="a&b"/>""", Node) + @test doc[1]["a"] == "a&b" + end + + @testset "multiple attributes accessible via attributes()" begin + doc = parse("""<x first="1" second="2" third="3"/>""", 
Node) + attrs = attributes(doc[1]) + @test attrs isa Attributes + @test attrs["first"] == "1" + @test attrs["second"] == "2" + @test attrs["third"] == "3" + end - #For debugging: - for (a,b) in zip(AbstractTrees.Leaves(node), AbstractTrees.Leaves(node2)) - if a != b - @info path - @info a - @info b - error() - end - end + @testset "whitespace around = in attributes" begin + doc = parse("""<x a = "1" />""", Node) + @test doc[1]["a"] == "1" end end -#-----------------------------------------------------------------------------# Node writing -@testset "Node writing" begin - doc = Document( - DTD("root_tag"), - Declaration(version=1.0), - Comment("comment"), - ProcessingInstruction("xml-stylesheet", href="mystyle.css", type="text/css"), - Element("root_tag", CData("cdata"), Text("text")) - ) - @test map(nodetype, children(doc)) == [DTD,Declaration,Comment,ProcessingInstruction,Element] - @test length(children(doc[end])) == 2 - @test nodetype(doc[end][1]) == XML.CData - @test nodetype(doc[end][2]) == XML.Text - @test value(doc[end][1]) == "cdata" - @test value(doc[end][2]) == "text" - - #set/get index for attributes - o = doc[end] - @test isempty(keys(o)) - o["id"] = 1 - @test o["id"] == "1" - @test keys(o) == keys(Dict("id" => "1")) -end - -#-----------------------------------------------------------------------------# Issues -@testset "Issues" begin - # https://github.com/JuliaComputing/XML.jl/issues/12: DTD content was cut short - s = """ - <!DOCTYPE note [ - <!ENTITY nbsp " "> - <!ENTITY writer "Writer: Donald Duck."> - <!ENTITY copyright "Copyright: W3Schools."> - ]> - """ - - doc = parse(Node, s) - @test value(only(doc)) == s[11:end-2] # note [...] - - # https://github.com/JuliaComputing/XML.jl/issues/14 (Sorted Attributes) - kw = NamedTuple(OrderedDict(Symbol(k) => Int(k) for k in 'a':'z')) - xyz = XML.Element("point"; kw...) 
-    @test collect(keys(attributes(xyz))) == string.(collect('a':'z'))
+#==============================================================================#
+#                 XML 1.0 SPEC SECTION 4.1: Entity References                  #
+#==============================================================================#
+# Each input below spells the predefined entities as references (&lt; &gt;
+# &amp; &apos; &quot;); the assertions expect the parsed value to hold the
+# corresponding literal character.
+@testset "Spec 4.1: Character and Entity References" begin
+    @testset "predefined entity references in text" begin
+        doc = parse("<root>&lt;</root>", Node)
+        @test simple_value(doc[1]) == "<"
+
+        doc = parse("<root>&gt;</root>", Node)
+        @test simple_value(doc[1]) == ">"
+
+        doc = parse("<root>&amp;</root>", Node)
+        @test simple_value(doc[1]) == "&"
+
+        doc = parse("<root>&apos;</root>", Node)
+        @test simple_value(doc[1]) == "'"
+
+        doc = parse("<root>&quot;</root>", Node)
+        @test simple_value(doc[1]) == "\""
+    end
+
+    @testset "predefined entities in attribute values" begin
+        # All five references in a single double-quoted attribute value.
+        doc = parse("""<x a="&lt;&gt;&amp;&apos;&quot;"/>""", Node)
+        @test doc[1]["a"] == "<>&'\""
+    end
+
+    @testset "multiple entity references in one text node" begin
+        # References interleaved with plain text in one text node.
+        doc = parse("<root>&lt;tag&gt; &amp; &quot;value&quot;</root>", Node)
+        @test simple_value(doc[1]) == "<tag> & \"value\""
+    end
+end
+
+#==============================================================================#
+#                NAMESPACES (Colon in Tag and Attribute Names)                 #
+#==============================================================================#
+# These tests check that prefixed names come back verbatim from tag() and
+# attribute lookup, and that xmlns declarations are readable as attributes.
+@testset "Namespaces" begin
+    @testset "namespaced element" begin
+        doc = parse("""<ns:root xmlns:ns="http://example.com"><ns:child/></ns:root>""", Node)
+        @test tag(doc[1]) == "ns:root"
+        @test doc[1]["xmlns:ns"] == "http://example.com"
+        @test tag(doc[1][1]) == "ns:child"
+    end
+
+    @testset "default namespace" begin
+        doc = parse("""<root xmlns="http://example.com"/>""", Node)
+        @test doc[1]["xmlns"] == "http://example.com"
+    end
+
+    @testset "multiple namespace prefixes" begin
+        xml = """<root xmlns:a="http://a.com" xmlns:b="http://b.com"><a:x/><b:y/></root>"""
+        doc = parse(xml, Node)
+        @test tag(doc[1][1]) == "a:x"
+        @test tag(doc[1][2]) == "b:y"
+    end
+end
+
+#==============================================================================# +# NODE CONSTRUCTORS # +#==============================================================================# +@testset "Node Constructors" begin + @testset "Text" begin + t = Text("hello") + @test nodetype(t) == Text + @test value(t) == "hello" + @test tag(t) === nothing + @test attributes(t) === nothing + end + + @testset "Comment" begin + c = Comment(" a comment ") + @test nodetype(c) == Comment + @test value(c) == " a comment " + end + + @testset "CData" begin + cd = CData("raw <data>") + @test nodetype(cd) == CData + @test value(cd) == "raw <data>" + end + + @testset "DTD" begin + d = DTD("html") + @test nodetype(d) == DTD + @test value(d) == "html" + end + + @testset "Declaration" begin + decl = Declaration(; version="1.0", encoding="UTF-8") + @test nodetype(decl) == Declaration + @test decl["version"] == "1.0" + @test decl["encoding"] == "UTF-8" + end + + @testset "Declaration with no attributes" begin + decl = Declaration() + @test nodetype(decl) == Declaration + @test attributes(decl) === nothing + end + + @testset "ProcessingInstruction with content" begin + pi = ProcessingInstruction("target", "data here") + @test nodetype(pi) == ProcessingInstruction + @test tag(pi) == "target" + @test value(pi) == "data here" + end + + @testset "ProcessingInstruction without content" begin + pi = ProcessingInstruction("target") + @test nodetype(pi) == ProcessingInstruction + @test tag(pi) == "target" + @test value(pi) === nothing + end + + @testset "Element with tag only" begin + el = Element("div") + @test nodetype(el) == Element + @test tag(el) == "div" + @test length(children(el)) == 0 + end + + @testset "Element with children" begin + el = Element("div", Text("hello"), Element("span")) + @test length(el) == 2 + @test nodetype(el[1]) == Text + @test nodetype(el[2]) == Element + end + + @testset "Element with attributes" begin + el = Element("div"; class="main", id="content") + @test 
el["class"] == "main" + @test el["id"] == "content" + end + + @testset "Element with children and attributes" begin + el = Element("a", "click here"; href="http://example.com") + @test tag(el) == "a" + @test el["href"] == "http://example.com" + @test value(el[1]) == "click here" + end + + @testset "Element auto-converts non-Node children to Text" begin + el = Element("p", 42) + @test nodetype(el[1]) == Text + @test value(el[1]) == "42" + end + + @testset "Document" begin + doc = Document( + Declaration(; version="1.0"), + Element("root") + ) + @test nodetype(doc) == Document + @test length(doc) == 2 + @test nodetype(doc[1]) == Declaration + @test nodetype(doc[2]) == Element + end + + @testset "Document with all node types" begin + doc = Document( + Declaration(; version="1.0"), + DTD("root"), + Comment("comment"), + ProcessingInstruction("pi", "data"), + Element("root", CData("cdata"), Text("text")) + ) + @test map(nodetype, children(doc)) == [Declaration, DTD, Comment, ProcessingInstruction, Element] + @test length(doc[end]) == 2 + @test nodetype(doc[end][1]) == CData + @test value(doc[end][1]) == "cdata" + @test nodetype(doc[end][2]) == Text + @test value(doc[end][2]) == "text" + end + + @testset "invalid constructions" begin + @test_throws Exception Text("a", "b") # too many args + @test_throws Exception Comment("a"; x="1") # no attrs + @test_throws Exception CData("a"; x="1") # no attrs + @test_throws Exception DTD("a"; x="1") # no attrs + @test_throws Exception Element() # need tag + @test_throws Exception Declaration("bad") # no positional args + @test_throws Exception Document(; x="1") # no attrs + @test_throws Exception ProcessingInstruction() # need target + @test_throws Exception ProcessingInstruction("a", "b", "c") # too many args + end +end + +#==============================================================================# +# h CONSTRUCTOR # +#==============================================================================# +@testset "h constructor" begin 
+ @testset "h(tag)" begin + el = h("div") + @test nodetype(el) == Element + @test tag(el) == "div" + end + + @testset "h(tag, children...)" begin + el = h("div", "hello") + @test simple_value(el) == "hello" + end + + @testset "h(tag; attrs...)" begin + el = h("div"; class="main") + @test el["class"] == "main" + end + + @testset "h(tag, children...; attrs...)" begin + el = h("div", "hello"; class="main") + @test el["class"] == "main" + @test value(el[1]) == "hello" + end + + @testset "h.tag syntax" begin + el = h.div("hello"; class="main") + @test tag(el) == "div" + @test el["class"] == "main" + @test value(el[1]) == "hello" + end + + @testset "h.tag with no args" begin + el = h.br() + @test tag(el) == "br" + @test length(children(el)) == 0 + end + + @testset "h.tag with only attrs" begin + el = h.img(; src="image.png") + @test tag(el) == "img" + @test el["src"] == "image.png" + end + + @testset "nested h constructors" begin + el = h.div( + h.h1("Title"), + h.p("Paragraph") + ) + @test tag(el) == "div" + @test length(el) == 2 + @test tag(el[1]) == "h1" + @test tag(el[2]) == "p" + end + + @testset "h with symbol tag" begin + el = h(:div) + @test tag(el) == "div" + end +end + +#==============================================================================# +# NODE INTERFACE # +#==============================================================================# +@testset "Node Interface" begin + doc = parse("""<?xml version="1.0"?><root attr="val"><child>text</child></root>""", Node) + + @testset "nodetype" begin + @test nodetype(doc) == Document + @test nodetype(doc[1]) == Declaration + @test nodetype(doc[2]) == Element + end + + @testset "tag" begin + @test tag(doc) === nothing + @test tag(doc[2]) == "root" + @test tag(doc[2][1]) == "child" + end + + @testset "attributes" begin + @test attributes(doc) === nothing + @test attributes(doc[2])["attr"] == "val" + end + + @testset "value" begin + @test value(doc) === nothing + @test value(doc[2][1][1]) == "text" + end + + 
@testset "children" begin + @test length(children(doc)) == 2 + @test length(children(doc[2])) == 1 + end + + @testset "is_simple" begin + @test is_simple(doc[2][1]) == true + @test is_simple(doc[2]) == false + end + + @testset "simple_value" begin + @test simple_value(doc[2][1]) == "text" + @test_throws ErrorException simple_value(doc[2]) + end + + @testset "simple_value for CData child" begin + el = Element("x", CData("data")) + @test is_simple(el) + @test simple_value(el) == "data" + end +end + +#==============================================================================# +# NODE INDEXING # +#==============================================================================# +@testset "Node Indexing" begin + doc = parse("<root><a/><b/><c/></root>", Node) + root = doc[1] + + @testset "integer indexing" begin + @test tag(root[1]) == "a" + @test tag(root[2]) == "b" + @test tag(root[3]) == "c" + end + + @testset "colon indexing" begin + all = root[:] + @test length(all) == 3 + end + + @testset "lastindex" begin + @test tag(root[end]) == "c" + end + + @testset "only" begin + single = parse("<root><only/></root>", Node) + @test tag(only(single[1])) == "only" + end + + @testset "length" begin + @test length(root) == 3 + end + + @testset "attribute indexing" begin + el = parse("""<x a="1" b="2"/>""", Node)[1] + @test el["a"] == "1" + @test el["b"] == "2" + @test_throws KeyError el["nonexistent"] + end + + @testset "haskey" begin + el = parse("""<x a="1"/>""", Node)[1] + @test haskey(el, "a") == true + @test haskey(el, "b") == false + end + + @testset "keys" begin + el = parse("""<x a="1" b="2"/>""", Node)[1] + @test collect(keys(el)) == ["a", "b"] + end + + @testset "keys on element with no attributes" begin + el = parse("<x/>", Node)[1] + @test isempty(keys(el)) + end +end + +#==============================================================================# +# NODE MUTATION # +#==============================================================================# +@testset "Node 
Mutation" begin + @testset "setindex! child" begin + el = Element("root", Element("old")) + el[1] = Element("new") + @test tag(el[1]) == "new" + end + + @testset "setindex! child with auto-conversion" begin + el = Element("root", Text("old")) + el[1] = "new text" + @test value(el[1]) == "new text" + end + + @testset "setindex! attribute" begin + el = Element("root"; a="1") + el["a"] = "2" + @test el["a"] == "2" + end + + @testset "setindex! new attribute" begin + el = Element("root"; a="1") + el["b"] = "2" + @test el["b"] == "2" + end + + @testset "push! child" begin + el = Element("root") + push!(el, Element("child")) + @test length(el) == 1 + @test tag(el[1]) == "child" + end + + @testset "push! with auto-conversion" begin + el = Element("root") + push!(el, "text") + @test nodetype(el[1]) == Text + @test value(el[1]) == "text" + end + + @testset "pushfirst! child" begin + el = Element("root", Element("second")) + pushfirst!(el, Element("first")) + @test tag(el[1]) == "first" + @test tag(el[2]) == "second" + end + + @testset "push! 
on non-container node errors" begin + t = Text("hello") + @test_throws ErrorException push!(t, "more") + end +end + +#==============================================================================# +# NODE EQUALITY # +#==============================================================================# +@testset "Node Equality" begin + @testset "identical elements are equal" begin + a = Element("div", Text("hello"); class="main") + b = Element("div", Text("hello"); class="main") + @test a == b + end + + @testset "different tag names are not equal" begin + @test Element("a") != Element("b") + end + + @testset "different attributes are not equal" begin + @test Element("a"; x="1") != Element("a"; x="2") + end + + @testset "different children are not equal" begin + @test Element("a", Text("x")) != Element("a", Text("y")) + end + + @testset "different node types are not equal" begin + @test Text("x") != Comment("x") + end + + @testset "empty attributes vs nothing" begin + a = Element("a") + b = Element("a") + @test a == b + end + + @testset "parse equality" begin + xml = "<root><child>text</child></root>" + @test parse(xml, Node) == parse(xml, Node) + end +end + +#==============================================================================# +# XML WRITING # +#==============================================================================# +@testset "XML Writing" begin + @testset "write Text" begin + el = Element("p", "hello & goodbye") + @test XML.write(el) == "<p>hello & goodbye</p>" + end + + @testset "write Element with attributes" begin + el = Element("div"; class="main", id="content") + s = XML.write(el) + @test contains(s, "<div") + @test contains(s, "class=\"main\"") + @test contains(s, "id=\"content\"") + @test contains(s, "/>") + end + + @testset "write self-closing element" begin + @test XML.write(Element("br")) == "<br/>" + end + + @testset "write element with single text child (inline)" begin + @test XML.write(Element("p", "hello")) == "<p>hello</p>" + end + + 
@testset "write element with multiple children (indented)" begin + el = Element("div", Element("a"), Element("b")) + s = XML.write(el) + @test contains(s, "<div>") + @test contains(s, " <a/>") + @test contains(s, " <b/>") + @test contains(s, "</div>") + end + + @testset "write Comment" begin + el = Element("root", Comment(" comment ")) + @test contains(XML.write(el), "<!-- comment -->") + end + + @testset "write CData" begin + el = Element("root", CData("raw <data>")) + @test contains(XML.write(el), "<![CDATA[raw <data>]]>") + end + + @testset "write ProcessingInstruction with content" begin + pi = ProcessingInstruction("target", "data") + @test XML.write(pi) == "<?target data?>" + end + + @testset "write ProcessingInstruction without content" begin + pi = ProcessingInstruction("target") + @test XML.write(pi) == "<?target?>" + end + + @testset "write Declaration" begin + decl = Declaration(; version="1.0", encoding="UTF-8") + s = XML.write(decl) + @test contains(s, "<?xml") + @test contains(s, "version=\"1.0\"") + @test contains(s, "encoding=\"UTF-8\"") + @test contains(s, "?>") + end + + @testset "write DTD" begin + dtd = DTD("html") + @test XML.write(dtd) == "<!DOCTYPE html>" + end + + @testset "write Document" begin + doc = Document(Declaration(; version="1.0"), Element("root")) + s = XML.write(doc) + @test startswith(s, "<?xml") + @test contains(s, "<root/>") + end + + @testset "write escapes special characters in text" begin + el = Element("p", "a < b & c > d") + @test XML.write(el) == "<p>a < b & c > d</p>" + end + + @testset "write escapes special characters in attribute values" begin + el = Element("x"; a="a\"b") + @test contains(XML.write(el), "a=\"a"b\"") + end + + @testset "indentsize parameter" begin + el = Element("root", Element("child")) + s2 = XML.write(el; indentsize=2) + s4 = XML.write(el; indentsize=4) + @test contains(s2, " <child/>") + @test contains(s4, " <child/>") + end + + @testset "write xml:space='preserve' respects whitespace" begin + el 
= Element("root", Element("p", Text(" hello "); var"xml:space"="preserve")) + s = XML.write(el) + @test contains(s, "> hello </p>") + end +end + +#==============================================================================# +# WRITE TO FILE / READ FROM FILE # +#==============================================================================# +@testset "File I/O" begin + @testset "write and read back" begin + doc = Document( + Declaration(; version="1.0"), + Element("root", Element("child", "text")) + ) + temp = tempname() * ".xml" + XML.write(temp, doc) + content = read(temp, String) + @test contains(content, "<?xml") + @test contains(content, "<root>") + @test contains(content, "<child>text</child>") + doc2 = read(temp, Node) + @test nodetype(doc2) == Document + # Find the root element + root = first(filter(x -> nodetype(x) == Element, children(doc2))) + child = first(filter(x -> nodetype(x) == Element, children(root))) + @test tag(child) == "child" + @test simple_value(child) == "text" + rm(temp) + end + + @testset "read from IO" begin + xml = """<?xml version="1.0"?><root>hello</root>""" + doc = read(IOBuffer(xml), Node) + @test nodetype(doc) == Document + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test simple_value(root) == "hello" + end +end + +#==============================================================================# +# PARSE → WRITE → PARSE ROUNDTRIP # +#==============================================================================# +@testset "Roundtrip: parse → write preserves semantics" begin + @testset "declaration and root" begin + xml = """<?xml version="1.0"?><root/>""" + doc = parse(xml, Node) + s = XML.write(doc) + doc2 = parse(s, Node) + decls = filter(x -> nodetype(x) == Declaration, children(doc2)) + @test length(decls) == 1 + @test decls[1]["version"] == "1.0" + els = filter(x -> nodetype(x) == Element, children(doc2)) + @test length(els) == 1 + @test tag(els[1]) == "root" + end + + @testset "element with 
attributes and text" begin + xml = """<root><child attr="val">text</child></root>""" + doc = parse(xml, Node) + s = XML.write(doc) + doc2 = parse(s, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc2))) + child = first(filter(x -> nodetype(x) == Element, children(root))) + @test tag(child) == "child" + @test child["attr"] == "val" + text_children = filter(x -> nodetype(x) == Text, children(child)) + @test any(t -> value(t) == "text", text_children) + end + + @testset "all special node types survive roundtrip" begin + xml = """<root><!-- comment --><![CDATA[data]]><?pi content?></root>""" + doc = parse(xml, Node) + s = XML.write(doc) + doc2 = parse(s, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc2))) + types = map(nodetype, filter(x -> nodetype(x) != Text, children(root))) + @test Comment in types + @test CData in types + @test ProcessingInstruction in types + end + + @testset "DOCTYPE survives roundtrip" begin + xml = """<!DOCTYPE html><html><body/></html>""" + doc = parse(xml, Node) + s = XML.write(doc) + doc2 = parse(s, Node) + dtds = filter(x -> nodetype(x) == DTD, children(doc2)) + @test length(dtds) == 1 + @test value(dtds[1]) == "html" + end + + @testset "namespace attributes survive roundtrip" begin + xml = """<root xmlns:ns="http://example.com"><ns:child/></root>""" + doc = parse(xml, Node) + s = XML.write(doc) + doc2 = parse(s, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc2))) + @test root["xmlns:ns"] == "http://example.com" + child = first(filter(x -> nodetype(x) == Element, children(root))) + @test tag(child) == "ns:child" + end + + @testset "mixed content survives roundtrip" begin + xml = """<p>Hello <b>world</b>!</p>""" + doc = parse(xml, Node) + s = XML.write(doc) + doc2 = parse(s, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc2))) + non_ws = filter(x -> !(nodetype(x) == Text && isempty(strip(value(x)))), children(root)) + texts = [value(x) for x in non_ws if 
nodetype(x) == Text] + @test any(t -> contains(t, "Hello"), texts) + @test any(t -> contains(t, "!"), texts) + bolds = filter(x -> nodetype(x) == Element && tag(x) == "b", non_ws) + @test length(bolds) == 1 + @test simple_value(bolds[1]) == "world" + end +end + +@testset "Roundtrip: file-based semantic preservation" begin + all_files = filter(isfile, [ + joinpath(@__DIR__, "data", "xml.xsd"), + joinpath(@__DIR__, "data", "kml.xsd"), + joinpath(@__DIR__, "data", "books.xml"), + # example.kml uses invalid <![CData[...]]> (lowercase), skip roundtrip + joinpath(@__DIR__, "data", "simple_dtd.xml"), + joinpath(@__DIR__, "data", "preserve.xml"), + ]) + + for path in all_files + node = read(path, Node) + temp = tempname() * ".xml" + XML.write(temp, node) + node2 = read(temp, Node) + # Verify structural properties are preserved + @test nodetype(node) == nodetype(node2) + # Count non-whitespace elements + count_elements(n) = sum(1 for c in children(n) if nodetype(c) == Element; init=0) + @test count_elements(node) == count_elements(node2) + rm(temp) + end +end + +#==============================================================================# +# PARSE Node{SubString{String}} # +#==============================================================================# +@testset "Parse with SubString{String}" begin + xml = """<?xml version="1.0"?><root attr="val"><child>text</child></root>""" + doc = parse(xml, Node{SubString{String}}) + @test nodetype(doc) == Document + @test tag(doc[2]) == "root" + @test doc[2]["attr"] == "val" + # SubString values + @test value(doc[2][1][1]) isa SubString{String} +end + +#==============================================================================# +# COMPLEX DOCUMENT PARSING # +#==============================================================================# +@testset "Complex Document Parsing" begin + @testset "books.xml" begin + path = joinpath(@__DIR__, "data", "books.xml") + isfile(path) || return + doc = read(path, Node) + @test nodetype(doc) 
== Document + + # Should have declaration + catalog + decl_nodes = filter(x -> nodetype(x) == Declaration, children(doc)) + @test length(decl_nodes) == 1 + @test decl_nodes[1]["version"] == "1.0" + + el_nodes = filter(x -> nodetype(x) == Element, children(doc)) + @test length(el_nodes) == 1 + catalog = el_nodes[1] + @test tag(catalog) == "catalog" + + # Catalog has 12 books + books = filter(x -> nodetype(x) == Element, children(catalog)) + @test length(books) == 12 + + # First book + book1 = books[1] + @test book1["id"] == "bk101" + + # Each book has: author, title, genre, price, publish_date, description + book_children = filter(x -> nodetype(x) == Element, children(book1)) + book_tags = map(tag, book_children) + @test "author" in book_tags + @test "title" in book_tags + @test "genre" in book_tags + @test "price" in book_tags + @test "publish_date" in book_tags + @test "description" in book_tags + + author = first(filter(x -> tag(x) == "author", book_children)) + @test simple_value(author) == "Gambardella, Matthew" + end + + @testset "simple_dtd.xml" begin + path = joinpath(@__DIR__, "data", "simple_dtd.xml") + isfile(path) || return + doc = read(path, Node) + @test nodetype(doc) == Document + + dtd_nodes = filter(x -> nodetype(x) == DTD, children(doc)) + @test length(dtd_nodes) == 1 + @test contains(value(dtd_nodes[1]), "ENTITY") + end + + @testset "preserve.xml" begin + path = joinpath(@__DIR__, "data", "preserve.xml") + isfile(path) || return + doc = read(path, Node) + @test nodetype(doc) == Document + + root = filter(x -> nodetype(x) == Element, children(doc))[1] + @test tag(root) == "root" + @test root["xml:space"] == "preserve" + + child_els = filter(x -> nodetype(x) == Element, children(root)) + @test length(child_els) == 1 + @test tag(child_els[1]) == "child" + @test child_els[1]["xml:space"] == "default" + end + + @testset "example.kml" begin + # example.kml uses invalid <![CData[...]]> (lowercase 'd') which is not valid XML + path = joinpath(@__DIR__, 
"data", "example.kml") + isfile(path) || return + @test_throws ArgumentError read(path, Node) + end + + @testset "tv.dtd" begin + path = joinpath(@__DIR__, "data", "tv.dtd") + isfile(path) || return + dtd_text = read(path, String) + pd = parse_dtd("TVSCHEDULE [\n" * dtd_text * "\n]") + @test pd.root == "TVSCHEDULE" + + @test length(pd.elements) == 10 + elem_names = map(e -> e.name, pd.elements) + @test "TVSCHEDULE" in elem_names + @test "CHANNEL" in elem_names + @test "PROGRAMSLOT" in elem_names + @test "TITLE" in elem_names + + @test length(pd.attributes) == 5 + attr_elements = map(a -> a.element, pd.attributes) + @test "TVSCHEDULE" in attr_elements + @test "CHANNEL" in attr_elements + @test "TITLE" in attr_elements + end +end + +#==============================================================================# +# DTD PARSING (parse_dtd) # +#==============================================================================# +@testset "DTD Parsing (parse_dtd)" begin + @testset "simple DTD with entities" begin + path = joinpath(@__DIR__, "data", "simple_dtd.xml") + isfile(path) || return + doc = read(path, Node) + dtd_node = first(filter(x -> nodetype(x) == DTD, children(doc))) + pd = parse_dtd(dtd_node) + @test pd.root == "note" + @test length(pd.entities) == 3 + @test pd.entities[1].name == "nbsp" + @test pd.entities[2].name == "writer" + @test pd.entities[3].name == "copyright" + @test pd.entities[2].value == "Writer: Donald Duck." 
+ end + + @testset "DTD with SYSTEM external ID" begin + pd = parse_dtd("""root SYSTEM "root.dtd\"""") + @test pd.root == "root" + @test pd.system_id == "root.dtd" + @test pd.public_id === nothing + end + + @testset "DTD with PUBLIC external ID" begin + pd = parse_dtd("""root PUBLIC "-//W3C//DTD XHTML 1.0//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"""") + @test pd.root == "root" + @test pd.public_id == "-//W3C//DTD XHTML 1.0//EN" + @test pd.system_id == "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd" + end + + @testset "DTD with ELEMENT declarations" begin + pd = parse_dtd("""root [ +<!ELEMENT root (child)> +<!ELEMENT child (#PCDATA)> +<!ELEMENT empty EMPTY> +<!ELEMENT any ANY> +]""") + @test pd.root == "root" + @test length(pd.elements) == 4 + @test pd.elements[1].name == "root" + @test pd.elements[1].content == "(child)" + @test pd.elements[2].name == "child" + @test pd.elements[2].content == "(#PCDATA)" + @test pd.elements[3].name == "empty" + @test pd.elements[3].content == "EMPTY" + @test pd.elements[4].name == "any" + @test pd.elements[4].content == "ANY" + end + + @testset "DTD with ATTLIST declarations (spec examples)" begin + pd = parse_dtd("""root [ +<!ATTLIST termdef id ID #REQUIRED name CDATA #IMPLIED> +<!ATTLIST list type (bullets|ordered|glossary) "ordered"> +<!ATTLIST form method CDATA #FIXED "POST"> +]""") + @test length(pd.attributes) == 4 + @test pd.attributes[1].element == "termdef" + @test pd.attributes[1].name == "id" + @test pd.attributes[1].type == "ID" + @test pd.attributes[1].default == "#REQUIRED" + @test pd.attributes[2].name == "name" + @test pd.attributes[2].type == "CDATA" + @test pd.attributes[2].default == "#IMPLIED" + @test pd.attributes[3].element == "list" + @test pd.attributes[3].name == "type" + @test pd.attributes[3].default == "\"ordered\"" + @test pd.attributes[4].element == "form" + @test pd.attributes[4].name == "method" + @test pd.attributes[4].default == "#FIXED \"POST\"" + end + + @testset "DTD with 
ENTITY declarations (spec examples)" begin + pd = parse_dtd("""root [ +<!ENTITY Pub-Status "This is a pre-release of the specification."> +<!ENTITY open-hatch SYSTEM "http://www.textuality.com/boilerplate/OpenHatch.xml"> +<!ENTITY open-hatch2 PUBLIC "-//Textuality//TEXT Standard open-hatch boilerplate//EN" "http://www.textuality.com/boilerplate/OpenHatch.xml"> +<!ENTITY % YN '"Yes"'> +]""") + @test length(pd.entities) == 4 + @test pd.entities[1].name == "Pub-Status" + @test pd.entities[1].value == "This is a pre-release of the specification." + @test pd.entities[1].parameter == false + + @test pd.entities[2].name == "open-hatch" + @test pd.entities[2].value === nothing + @test contains(pd.entities[2].external_id, "SYSTEM") + + @test pd.entities[3].name == "open-hatch2" + @test contains(pd.entities[3].external_id, "PUBLIC") + + @test pd.entities[4].name == "YN" + @test pd.entities[4].parameter == true + end + + @testset "DTD with NOTATION declarations (spec example)" begin + pd = parse_dtd("""root [ +<!NOTATION vrml PUBLIC "VRML 1.0"> +<!NOTATION jpeg SYSTEM "image/jpeg"> +]""") + @test length(pd.notations) == 2 + @test pd.notations[1].name == "vrml" + @test contains(pd.notations[1].external_id, "PUBLIC") + @test pd.notations[2].name == "jpeg" + @test contains(pd.notations[2].external_id, "SYSTEM") + end + + @testset "parse_dtd from Node" begin + dtd = DTD("root [<!ELEMENT root (#PCDATA)>]") + pd = parse_dtd(dtd) + @test pd.root == "root" + @test length(pd.elements) == 1 + end + + @testset "parse_dtd errors on non-DTD node" begin + @test_throws ErrorException parse_dtd(Element("x")) + end + + @testset "complex DTD file (structure test)" begin + # complex_dtd.xml uses parameter entity references (%text;) which parse_dtd + # does not expand, so we just verify parsing the XML document itself works + path = joinpath(@__DIR__, "data", "complex_dtd.xml") + isfile(path) || return + doc = read(path, Node) + dtd_node = first(filter(x -> nodetype(x) == DTD, children(doc))) + 
@test nodetype(dtd_node) == DTD + @test contains(value(dtd_node), "test") + @test contains(value(dtd_node), "ELEMENT") + @test contains(value(dtd_node), "ATTLIST") + @test contains(value(dtd_node), "NOTATION") + @test contains(value(dtd_node), "ENTITY") + end +end + +#==============================================================================# +# XML 1.0 SPEC: ELEMENT TYPE DECLARATIONS (Section 3.2) # +#==============================================================================# +@testset "Spec 3.2: Element Type Declarations" begin + @testset "EMPTY content model" begin + pd = parse_dtd("root [<!ELEMENT br EMPTY>]") + @test pd.elements[1].content == "EMPTY" + end + + @testset "ANY content model" begin + pd = parse_dtd("root [<!ELEMENT container ANY>]") + @test pd.elements[1].content == "ANY" + end + + @testset "#PCDATA content model" begin + pd = parse_dtd("root [<!ELEMENT text (#PCDATA)>]") + @test pd.elements[1].content == "(#PCDATA)" + end + + @testset "mixed content model" begin + pd = parse_dtd("root [<!ELEMENT p (#PCDATA|emph)*>]") + @test pd.elements[1].content == "(#PCDATA|emph)*" + end + + @testset "sequence content model" begin + pd = parse_dtd("root [<!ELEMENT spec (front, body, back?)>]") + @test pd.elements[1].content == "(front, body, back?)" + end + + @testset "choice content model" begin + pd = parse_dtd("root [<!ELEMENT div1 (head, (p | list | note)*, div2*)>]") + @test pd.elements[1].content == "(head, (p | list | note)*, div2*)" + end +end + +#==============================================================================# +# XML 1.0 SPEC: ATTRIBUTE-LIST DECLARATIONS (Section 3.3) # +#==============================================================================# +@testset "Spec 3.3: Attribute-List Declarations" begin + @testset "ID attribute" begin + pd = parse_dtd("root [<!ATTLIST el id ID #REQUIRED>]") + @test pd.attributes[1].type == "ID" + @test pd.attributes[1].default == "#REQUIRED" + end + + @testset "CDATA attribute with default" 
begin + pd = parse_dtd("""root [<!ATTLIST el name CDATA "default">]""") + @test pd.attributes[1].type == "CDATA" + @test pd.attributes[1].default == "\"default\"" + end + + @testset "enumerated attribute" begin + pd = parse_dtd("""root [<!ATTLIST list type (bullets|ordered|glossary) "ordered">]""") + @test contains(pd.attributes[1].type, "bullets") + @test pd.attributes[1].default == "\"ordered\"" + end + + @testset "#IMPLIED attribute" begin + pd = parse_dtd("root [<!ATTLIST el opt CDATA #IMPLIED>]") + @test pd.attributes[1].default == "#IMPLIED" + end + + @testset "#FIXED attribute" begin + pd = parse_dtd("""root [<!ATTLIST el method CDATA #FIXED "POST">]""") + @test pd.attributes[1].default == "#FIXED \"POST\"" + end + + @testset "NOTATION attribute type" begin + pd = parse_dtd("root [<!ATTLIST fig notation NOTATION (jpeg|png) #IMPLIED>]") + @test contains(pd.attributes[1].type, "NOTATION") + end + + @testset "multiple attributes in one ATTLIST" begin + pd = parse_dtd("""root [<!ATTLIST book + id ID #REQUIRED + isbn CDATA #IMPLIED + format (hardcover|paperback|ebook) "paperback">]""") + @test length(pd.attributes) == 3 + @test pd.attributes[1].name == "id" + @test pd.attributes[2].name == "isbn" + @test pd.attributes[3].name == "format" + end +end + +#==============================================================================# +# XML 1.0 SPEC: ENTITY DECLARATIONS (Section 4.2) # +#==============================================================================# +@testset "Spec 4.2: Entity Declarations" begin + @testset "internal general entity (spec example)" begin + pd = parse_dtd("""root [<!ENTITY Pub-Status "This is a pre-release of the specification.">]""") + @test pd.entities[1].name == "Pub-Status" + @test pd.entities[1].value == "This is a pre-release of the specification." 
+ @test pd.entities[1].external_id === nothing + @test pd.entities[1].parameter == false + end + + @testset "external entity with SYSTEM (spec example)" begin + pd = parse_dtd("""root [<!ENTITY open-hatch SYSTEM "http://www.textuality.com/boilerplate/OpenHatch.xml">]""") + @test pd.entities[1].name == "open-hatch" + @test pd.entities[1].value === nothing + @test contains(pd.entities[1].external_id, "SYSTEM") + @test contains(pd.entities[1].external_id, "http://www.textuality.com/boilerplate/OpenHatch.xml") + end + + @testset "external entity with PUBLIC (spec example)" begin + pd = parse_dtd("""root [<!ENTITY open-hatch PUBLIC "-//Textuality//TEXT Standard open-hatch boilerplate//EN" "http://www.textuality.com/boilerplate/OpenHatch.xml">]""") + @test pd.entities[1].name == "open-hatch" + @test contains(pd.entities[1].external_id, "PUBLIC") + end + + @testset "parameter entity" begin + pd = parse_dtd("""root [<!ENTITY % YN '"Yes"'>]""") + @test pd.entities[1].name == "YN" + @test pd.entities[1].parameter == true + end +end + +#==============================================================================# +# XML 1.0 SPEC: NOTATION DECLARATIONS (Section 4.7) # +#==============================================================================# +@testset "Spec 4.7: Notation Declarations" begin + @testset "NOTATION with PUBLIC (spec example)" begin + pd = parse_dtd("""root [<!NOTATION vrml PUBLIC "VRML 1.0">]""") + @test pd.notations[1].name == "vrml" + @test contains(pd.notations[1].external_id, "PUBLIC") + @test contains(pd.notations[1].external_id, "VRML 1.0") + end + + @testset "NOTATION with SYSTEM" begin + pd = parse_dtd("""root [<!NOTATION jpeg SYSTEM "image/jpeg">]""") + @test pd.notations[1].name == "jpeg" + @test contains(pd.notations[1].external_id, "SYSTEM") + end +end + +#==============================================================================# +# ERROR HANDLING # +#==============================================================================# 
+@testset "Error Handling" begin + @testset "mismatched tags" begin + @test_throws ErrorException parse("<a></b>", Node) + end + + @testset "unclosed tag" begin + @test_throws ErrorException parse("<a><b></a>", Node) + end + + @testset "closing tag with no open tag" begin + @test_throws ErrorException parse("</a>", Node) + end + + @testset "unclosed root element" begin + @test_throws ErrorException parse("<root>", Node) + end + + @testset "unterminated comment" begin + @test_throws Exception parse("<root><!-- no end", Node) + end + + @testset "unterminated CDATA" begin + @test_throws Exception parse("<root><![CDATA[no end", Node) + end + + @testset "unterminated PI" begin + @test_throws Exception parse("<?pi no end", Node) + end + + @testset "unterminated attribute value" begin + @test_throws Exception parse("""<a b="no end""", Node) + end +end + +#==============================================================================# +# ILL-FORMED XML (must error) # +#==============================================================================# +@testset "Ill-Formed XML" begin + # ---- Tag structure ---- + @testset "mismatched close tag" begin + @test_throws Exception parse("<a></b>", Node) + end + + @testset "overlapping elements" begin + @test_throws Exception parse("<a><b></a></b>", Node) + end + + @testset "deeply mismatched nesting" begin + @test_throws Exception parse("<a><b><c></b></c></a>", Node) + end + + @testset "multiple unclosed tags" begin + @test_throws Exception parse("<a><b><c>", Node) + end + + @testset "close tag without open" begin + @test_throws Exception parse("</a>", Node) + end + + @testset "close tag after self-closing" begin + @test_throws Exception parse("<a/></a>", Node) + end + + @testset "nested close tag without open" begin + @test_throws Exception parse("<root></inner></root>", Node) + end + + # ---- Unterminated constructs ---- + @testset "unterminated open tag at EOF" begin + @test_throws Exception parse("<root><unclosed", Node) + end + 
+ @testset "unterminated attribute value (double quote)" begin + @test_throws Exception parse("""<a x="no end""", Node) + end + + @testset "unterminated attribute value (single quote)" begin + @test_throws Exception parse("<a x='no end", Node) + end + + @testset "unterminated comment" begin + @test_throws Exception parse("<!-- no end", Node) + end + + @testset "unterminated CDATA" begin + @test_throws Exception parse("<![CDATA[no end", Node) + end + + @testset "unterminated processing instruction" begin + @test_throws Exception parse("<?pi no end", Node) + end + + @testset "unterminated DOCTYPE" begin + @test_throws Exception parse("<!DOCTYPE x", Node) + end + + # ---- Attribute errors ---- + @testset "duplicate attribute on element" begin + @test_throws Exception parse("""<a x="1" x="2"/>""", Node) + end + + @testset "duplicate attribute (different values)" begin + @test_throws Exception parse("""<root attr="a" attr="b"></root>""", Node) + end + + @testset "duplicate attribute in declaration" begin + @test_throws Exception parse("""<?xml version="1.0" version="1.1"?><a/>""", Node) + end + + @testset "attribute without value" begin + @test_throws Exception parse("<a disabled/>", Node) + end + + @testset "attribute with unquoted value" begin + @test_throws Exception parse("<a x=hello/>", Node) + end + + # ---- Tokenizer-level errors ---- + @testset "lone <" begin + @test_throws Exception parse("<", Node) + end + + @testset "lone < in text content" begin + @test_throws Exception parse("<root>a < b</root>", Node) + end + + @testset "tag with space before name" begin + @test_throws Exception parse("< root/>", Node) + end +end + +#==============================================================================# +# UNICODE SUPPORT # +#==============================================================================# +@testset "Unicode Support" begin + @testset "Unicode in text content" begin + doc = parse("<root>caf\u00e9 \u00f1 \u65e5\u672c\u8a9e</root>", Node) + @test 
simple_value(doc[1]) == "caf\u00e9 \u00f1 \u65e5\u672c\u8a9e" + end + + @testset "Unicode in attribute values" begin + doc = parse("<root name=\"\u00fcber\"/>", Node) + @test doc[1]["name"] == "\u00fcber" + end + + @testset "Unicode in comments" begin + doc = parse("<root><!-- h\u00e9llo --></root>", Node) + @test value(doc[1][1]) == " h\u00e9llo " + end + + @testset "CJK characters" begin + doc = parse("<root>\u4e2d\u6587</root>", Node) + @test simple_value(doc[1]) == "\u4e2d\u6587" + end + + @testset "emoji in text" begin + doc = parse("<root>\U0001f600\U0001f680</root>", Node) + @test simple_value(doc[1]) == "\U0001f600\U0001f680" + end + + @testset "Cyrillic characters" begin + doc = parse("<root>\u041f\u0440\u0438\u0432\u0435\u0442</root>", Node) + @test simple_value(doc[1]) == "\u041f\u0440\u0438\u0432\u0435\u0442" + end + + @testset "Arabic characters" begin + doc = parse("<root>\u0645\u0631\u062d\u0628\u0627</root>", Node) + @test simple_value(doc[1]) == "\u0645\u0631\u062d\u0628\u0627" + end +end + +#==============================================================================# +# EDGE CASES # +#==============================================================================# +@testset "Edge Cases" begin + @testset "document with only whitespace around root" begin + doc = parse(" \n <root/>\n ", Node) + # Parser preserves whitespace as Text nodes + els = filter(x -> nodetype(x) == Element, children(doc)) + @test length(els) == 1 + @test tag(els[1]) == "root" + end + + @testset "deeply nested elements" begin + xml = "<a><b><c><d><e><f>deep</f></e></d></c></b></a>" + doc = parse(xml, Node) + @test simple_value(doc[1][1][1][1][1][1]) == "deep" + end + + @testset "many siblings" begin + items = join(["<item>$i</item>" for i in 1:100]) + xml = "<root>$items</root>" + doc = parse(xml, Node) + @test length(doc[1]) == 100 + @test simple_value(doc[1][1]) == "1" + @test simple_value(doc[1][100]) == "100" + end + + @testset "element with hyphens and dots in name" 
begin + doc = parse("<my-element.name/>", Node) + @test tag(doc[1]) == "my-element.name" + end + + @testset "element with underscore in name" begin + doc = parse("<_private/>", Node) + @test tag(doc[1]) == "_private" + end + + @testset "attribute with numeric value" begin + doc = parse("""<x count="42"/>""", Node) + @test doc[1]["count"] == "42" + end + + @testset "empty text content" begin + doc = parse("<root></root>", Node) + @test isnothing(doc[1].children) + end + + @testset "adjacent CDATA and text" begin + doc = parse("<root>text<![CDATA[cdata]]>more</root>", Node) + @test length(doc[1]) == 3 + @test value(doc[1][1]) == "text" + @test value(doc[1][2]) == "cdata" + @test value(doc[1][3]) == "more" + end + + @testset "multiple CDATA sections" begin + doc = parse("<root><![CDATA[a]]><![CDATA[b]]></root>", Node) + @test length(doc[1]) == 2 + @test value(doc[1][1]) == "a" + @test value(doc[1][2]) == "b" + end + + @testset "comment between elements" begin + doc = parse("<root><a/><!-- between --><b/></root>", Node) + @test length(doc[1]) == 3 + @test nodetype(doc[1][2]) == Comment + end + + @testset "PI between elements" begin + doc = parse("<root><a/><?pi data?><b/></root>", Node) + @test length(doc[1]) == 3 + @test nodetype(doc[1][2]) == ProcessingInstruction + end + + @testset "all node types in one document" begin + xml = """<?xml version="1.0"?> +<!DOCTYPE root SYSTEM "root.dtd"> +<!-- comment --> +<?pi data?> +<root> + text + <child attr="val"/> + <!-- inner comment --> + <![CDATA[cdata]]> + <?inner-pi inner data?> +</root>""" + doc = parse(xml, Node) + types = map(nodetype, children(doc)) + @test Declaration in types + @test DTD in types + @test Comment in types + @test ProcessingInstruction in types + @test Element in types + end + + @testset "very long attribute value" begin + long_val = repeat("a", 10000) + doc = parse("""<x attr="$(long_val)"/>""", Node) + @test doc[1]["attr"] == long_val + end + + @testset "very long text content" begin + long_text = 
repeat("hello ", 10000) + doc = parse("<root>$(long_text)</root>", Node) + @test simple_value(doc[1]) == long_text + end + + @testset "CDATA with ]] but not followed by >" begin + doc = parse("<root><![CDATA[a]]b]]></root>", Node) + @test value(doc[1][1]) == "a]]b" + end +end + +#==============================================================================# +# SPEC EXAMPLES: FULL DOCUMENTS # +#==============================================================================# +@testset "Full Spec-Like Documents" begin + @testset "spec section 2.1: minimal document" begin + xml = """<?xml version="1.0"?> +<greeting>Hello, world!</greeting>""" + doc = parse(xml, Node) + @test nodetype(doc) == Document + @test simple_value(doc[end]) == "Hello, world!" + end + + @testset "spec section 2.8: document with external DTD" begin + xml = """<?xml version="1.0"?> +<!DOCTYPE greeting SYSTEM "hello.dtd"> +<greeting>Hello, world!</greeting>""" + doc = parse(xml, Node) + # Filter out whitespace text nodes to check structure + typed = filter(x -> nodetype(x) != Text, children(doc)) + @test length(typed) == 3 + @test nodetype(typed[1]) == Declaration + @test nodetype(typed[2]) == DTD + @test nodetype(typed[3]) == Element + end + + @testset "spec: document with internal subset" begin + xml = """<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE greeting [ + <!ELEMENT greeting (#PCDATA)> +]> +<greeting>Hello, world!</greeting>""" + doc = parse(xml, Node) + typed = filter(x -> nodetype(x) != Text, children(doc)) + @test typed[1]["encoding"] == "UTF-8" + @test nodetype(typed[2]) == DTD + pd = parse_dtd(typed[2]) + @test pd.root == "greeting" + @test length(pd.elements) == 1 + @test pd.elements[1].name == "greeting" + @test pd.elements[1].content == "(#PCDATA)" + @test simple_value(typed[3]) == "Hello, world!" + end + + @testset "typical HTML5-like doctype" begin + xml = """<!DOCTYPE html><html><head><title>Test

Content

""" + doc = parse(xml, Node) + @test nodetype(doc[1]) == DTD + @test value(doc[1]) == "html" + @test tag(doc[2]) == "html" + end + + @testset "SVG document" begin + xml = """ + + + Hello SVG +""" + doc = parse(xml, Node) + svg = doc[end] + @test tag(svg) == "svg" + @test svg["xmlns"] == "http://www.w3.org/2000/svg" + @test svg["width"] == "100" + + elements = filter(x -> nodetype(x) == Element, children(svg)) + @test length(elements) == 2 + @test tag(elements[1]) == "circle" + @test elements[1]["fill"] == "red" + @test tag(elements[2]) == "text" + @test value(elements[2][1]) == "Hello SVG" + end + + @testset "SOAP-like envelope" begin + xml = """ + + + + + IBM + + +""" + doc = parse(xml, Node) + env = doc[end] + @test tag(env) == "soap:Envelope" + elements = filter(x -> nodetype(x) == Element, children(env)) + @test tag(elements[1]) == "soap:Header" + @test tag(elements[2]) == "soap:Body" + end + + @testset "RSS-like feed" begin + xml = """ + + + Example Feed + http://example.com + An example RSS feed + + Item 1 + http://example.com/1 + + + Item 2 + http://example.com/2 + + +""" + doc = parse(xml, Node) + rss = doc[end] + @test tag(rss) == "rss" + @test rss["version"] == "2.0" + channel = first(filter(x -> nodetype(x) == Element, children(rss))) + @test tag(channel) == "channel" + items = filter(x -> nodetype(x) == Element && tag(x) == "item", children(channel)) + @test length(items) == 2 + end + + @testset "Atom-like feed" begin + xml = """ + + Example Feed + + Atom-Powered Robots Run Amok + + urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a + 2003-12-13T18:30:02Z + Some text. 
+ +""" + doc = parse(xml, Node) + feed = doc[end] + @test tag(feed) == "feed" + @test feed["xmlns"] == "http://www.w3.org/2005/Atom" + entries = filter(x -> nodetype(x) == Element && tag(x) == "entry", children(feed)) + @test length(entries) == 1 + end + + @testset "MathML-like document" begin + xml = """ + + + x + 2 + + + + 1 + +""" + doc = parse(xml, Node) + math = doc[1] + @test tag(math) == "math" + @test math["xmlns"] == "http://www.w3.org/1998/Math/MathML" + end + + @testset "document with processing instructions and comments mixed" begin + xml = """ + + + + + + + +""" + doc = parse(xml, Node) + types = map(nodetype, children(doc)) + @test count(==(Comment), types) == 2 + @test count(==(ProcessingInstruction), types) >= 1 + @test count(==(Element), types) == 1 + end +end + +#==============================================================================# +# SHOW / DISPLAY # +#==============================================================================# +@testset "Show (REPL display)" begin + @testset "show Text" begin + t = Text("hello") + s = sprint(show, t) + @test contains(s, "Text") + @test contains(s, "hello") + end + + @testset "show Element" begin + el = Element("div"; class="main") + s = sprint(show, el) + @test contains(s, "Element") + @test contains(s, "hello

" + end +end + +#==============================================================================# +# SHOW (text/xml MIME) ROUNDTRIP # +#==============================================================================# +@testset "text/xml MIME output" begin + doc = Document( + Declaration(; version="1.0"), + Element("root", Element("child", "text")) + ) + xml_str = sprint(show, MIME("text/xml"), doc) + @test contains(xml_str, "") + @test contains(xml_str, "text") + # Verify it's parseable + doc2 = parse(xml_str, Node) + @test nodetype(doc2) == Document + root = first(filter(x -> nodetype(x) == Element, children(doc2))) + @test tag(root) == "root" + child = first(filter(x -> nodetype(x) == Element, children(root))) + @test simple_value(child) == "text" +end + +#==============================================================================# +# CONSTRUCTION → WRITE → PARSE ROUNDTRIP # +#==============================================================================# +@testset "Construction → Write → Parse" begin + @testset "simple element: write then parse preserves semantics" begin + el = Element("greeting", "Hello, world!") + xml = XML.write(Document(el)) + doc2 = parse(xml, Node) + @test simple_value(doc2[1]) == "Hello, world!" 
+ end + + @testset "element with attributes: write then parse preserves attributes" begin + el = Element("item"; id="1", class="active") + xml = XML.write(Document(el)) + doc2 = parse(xml, Node) + @test doc2[1]["id"] == "1" + @test doc2[1]["class"] == "active" + end + + @testset "single-child text elements roundtrip" begin + doc = Document(Element("root", "text")) + xml = XML.write(doc) + doc2 = parse(xml, Node) + @test doc == doc2 + end + + @testset "self-closing elements roundtrip" begin + doc = Document(Element("root")) + xml = XML.write(doc) + doc2 = parse(xml, Node) + @test doc == doc2 + end + + @testset "all node types survive write → parse" begin + doc = Document( + Declaration(; version="1.0"), + Comment(" header "), + Element("root", + Element("child", "text"), + CData("raw "), + Comment(" inner "), + ProcessingInstruction("pi", "content") + ) + ) + xml = XML.write(doc) + doc2 = parse(xml, Node) + typed = filter(x -> nodetype(x) != Text, children(doc2)) + @test count(==(Declaration), map(nodetype, typed)) == 1 + @test count(==(Comment), map(nodetype, typed)) == 1 + @test count(==(Element), map(nodetype, typed)) == 1 + root = first(filter(x -> nodetype(x) == Element, typed)) + inner = filter(x -> nodetype(x) != Text, children(root)) + inner_types = map(nodetype, inner) + @test Element in inner_types + @test CData in inner_types + @test Comment in inner_types + @test ProcessingInstruction in inner_types + end + + @testset "special characters in text roundtrip" begin + el = Element("p", "a < b & c > d ' e \" f") + xml = XML.write(Document(el)) + doc2 = parse(xml, Node) + @test simple_value(doc2[1]) == "a < b & c > d ' e \" f" + end + + @testset "special characters in attributes roundtrip" begin + el = Element("x"; data="a&bd'e\"f") + xml = XML.write(Document(el)) + doc2 = parse(xml, Node) + @test doc2[1]["data"] == "a&bd'e\"f" + end +end + +#==============================================================================# +# KML-LIKE DOCUMENT # 
+#==============================================================================# +@testset "KML-like Document" begin + xml = """ + + + KML Sample + + Simple placemark + Attached to the ground. + + -122.0822035,37.4220033612141,0 + + + +""" + doc = parse(xml, Node) + kml = doc[end] + @test tag(kml) == "kml" + @test kml["xmlns"] == "http://www.opengis.net/kml/2.2" + + document = first(filter(x -> nodetype(x) == Element, children(kml))) + @test tag(document) == "Document" + + name = first(filter(x -> nodetype(x) == Element && tag(x) == "name", children(document))) + @test simple_value(name) == "KML Sample" + + pm = first(filter(x -> nodetype(x) == Element && tag(x) == "Placemark", children(document))) + pm_name = first(filter(x -> nodetype(x) == Element && tag(x) == "name", children(pm))) + @test simple_value(pm_name) == "Simple placemark" +end + +#==============================================================================# +# XHTML-LIKE DOCUMENT # +#==============================================================================# +@testset "XHTML-like Document" begin + xml = """ + + + + XHTML Test + + + +

Hello World

+

This is a test of XHTML.

+
+ An image + +""" + doc = parse(xml, Node) + typed = filter(x -> nodetype(x) != Text, children(doc)) + @test nodetype(typed[1]) == Declaration + @test nodetype(typed[2]) == DTD + @test contains(value(typed[2]), "PUBLIC") + + html = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(html) == "html" + @test html["xmlns"] == "http://www.w3.org/1999/xhtml" + + head_el = first(filter(x -> nodetype(x) == Element && tag(x) == "head", children(html))) + title_el = first(filter(x -> nodetype(x) == Element && tag(x) == "title", children(head_el))) + @test simple_value(title_el) == "XHTML Test" + + body_el = first(filter(x -> nodetype(x) == Element && tag(x) == "body", children(html))) + h1_el = first(filter(x -> nodetype(x) == Element && tag(x) == "h1", children(body_el))) + @test simple_value(h1_el) == "Hello World" + + # Verify write produces valid XML that can be re-parsed + xml2 = XML.write(doc) + doc2 = parse(xml2, Node) + @test nodetype(doc2) == Document +end + +#==============================================================================# +# PLIST-LIKE DOCUMENT # +#==============================================================================# +@testset "plist-like Document" begin + xml = """ + + + + CFBundleName + MyApp + CFBundleVersion + 1.0 + LSRequiresIPhoneOS + + +""" + doc = parse(xml, Node) + plist = doc[end] + @test tag(plist) == "plist" + @test plist["version"] == "1.0" + + dict = first(filter(x -> nodetype(x) == Element, children(plist))) + @test tag(dict) == "dict" + + elements = filter(x -> nodetype(x) == Element, children(dict)) + keys_found = [simple_value(e) for e in elements if tag(e) == "key"] + @test "CFBundleName" in keys_found + @test "CFBundleVersion" in keys_found +end + +#==============================================================================# +# MAVEN POM-LIKE DOCUMENT # +#==============================================================================# +@testset "Maven POM-like Document" begin + xml = """ + + 4.0.0 + 
com.example + my-app + 1.0-SNAPSHOT + + + junit + junit + 4.13.2 + test + + +""" + doc = parse(xml, Node) + project = doc[end] + @test tag(project) == "project" + + elements = filter(x -> nodetype(x) == Element, children(project)) + version = first(filter(x -> tag(x) == "version", elements)) + @test simple_value(version) == "1.0-SNAPSHOT" + + deps = first(filter(x -> tag(x) == "dependencies", elements)) + dep_list = filter(x -> nodetype(x) == Element, children(deps)) + @test length(dep_list) == 1 + @test tag(dep_list[1]) == "dependency" +end + +#==============================================================================# +# GITHUB ISSUES REGRESSION TESTS # +#==============================================================================# +@testset "GitHub Issues" begin + + #--- Issue #7: attribute order should not affect equality --- + @testset "#7: attribute-order-insensitive ==" begin + a = Element("x"; first="1", second="2") + b = Element("x"; second="2", first="1") + @test a == b + + # Same attrs same order still works + c = Element("x"; a="1", b="2") + d = Element("x"; a="1", b="2") + @test c == d + + # Different values are still not equal + @test Element("x"; a="1") != Element("x"; a="2") + + # Different attr names are not equal + @test Element("x"; a="1") != Element("x"; b="1") + + # Different number of attrs + @test Element("x"; a="1") != Element("x"; a="1", b="2") + + # Parsed elements with same attrs in different order + doc1 = parse("""""", Node) + doc2 = parse("""""", Node) + @test doc1[1] == doc2[1] + + # No attrs vs empty attrs (both are "no attributes") + @test Element("x") == Element("x") + end + + #--- Issue #17: numeric character references --- + @testset "#17: numeric character references (&#decimal; and &#xHex;)" begin + # Decimal character references + @test unescape("<") == "<" + @test unescape(">") == ">" + @test unescape("&") == "&" + @test unescape("'") == "'" + @test unescape(""") == "\"" + + # Hex character references (lowercase x) + 
@test unescape("<") == "<" + @test unescape("<") == "<" + @test unescape(">") == ">" + @test unescape("&") == "&" + @test unescape("'") == "'" + @test unescape(""") == "\"" + + # Uppercase X also works + @test unescape("A") == "A" + + # Unicode character references + @test unescape("A") == "A" + @test unescape("A") == "A" + @test unescape("é") == "\u00e9" # é + @test unescape("é") == "\u00e9" # é + @test unescape("中") == "\u4e2d" # 中 + @test unescape("😀") == "\U0001f600" # 😀 + + # Mixed with named entities + @test unescape("&<<") == "&<<" + @test unescape("<tag>") == "" + + # In parsed XML text + doc = parse("<hello>", Node) + @test simple_value(doc[1]) == "" + + # In parsed XML attributes + doc = parse("""""", Node) + @test doc[1]["a"] == "<>" + + # Non-breaking space + @test unescape(" ") == "\u00a0" + @test unescape(" ") == "\u00a0" + + # Invalid numeric reference preserved verbatim + @test unescape("&#xZZZ;") == "&#xZZZ;" + + # Named entity references that aren't predefined are preserved verbatim + @test unescape("&foo;") == "&foo;" + + # Ampersand without semicolon is preserved + @test unescape("a & b") == "a & b" + end + + #--- Issue #33: empty attributes consistency --- + @testset "#33: empty attributes [] vs nothing" begin + # Constructed elements have empty Vector for attrs + a = Element("x") + # Parsed elements with no attrs have nothing + b = parse("", Node)[1] + # They should compare equal via _eq / _attrs_eq + @test a == b + end + + #--- Issue #35: write → parse preserves structure --- + @testset "#35: write then parse preserves structure" begin + doc = Document( + Declaration(; version="1.0"), + Element("root", + Element("child", "text"), + Element("empty") + ) + ) + xml = XML.write(doc) + doc2 = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc2))) + child_elements = filter(x -> nodetype(x) == Element, children(root)) + @test length(child_elements) == 2 + @test tag(child_elements[1]) == "child" + @test 
tag(child_elements[2]) == "empty" + end + + #--- Issue #50: Base.get with default --- + @testset "#50: Base.get(node, key, default)" begin + el = parse("""""", Node)[1] + + # Existing keys return their values + @test get(el, "a", "default") == "1" + @test get(el, "b", "default") == "2" + + # Non-existing key returns default + @test get(el, "c", "default") == "default" + @test get(el, "c", nothing) === nothing + + # Works on elements with no attributes + el2 = parse("", Node)[1] + @test get(el2, "a", "nope") == "nope" + + # Works on constructed elements + el3 = Element("x"; foo="bar") + @test get(el3, "foo", "default") == "bar" + @test get(el3, "baz", "default") == "default" + end + + #--- Issue #52: escape double-escapes (expected behavior) --- + @testset "#52: escape is not idempotent (by design)" begin + @test escape("&") == "&" + @test escape("&") == "&amp;" # double-escaping is correct + end + + #--- Issue #53: unescape works correctly --- + @testset "#53: unescape works correctly on parsed content" begin + doc = parse("&", Node) + @test simple_value(doc[1]) == "&" + doc = parse("<tag>", Node) + @test simple_value(doc[1]) == "" + end +end + +#==============================================================================# +# TREE NAVIGATION: parent, depth, siblings # +#==============================================================================# +@testset "Tree Navigation" begin + doc = parse("", Node) + root = doc[1] + a = root[1] + a1 = a[1] + a2 = a[2] + b = root[2] + c = root[3] + c1 = c[1] + c1a = c1[1] + + @testset "parent" begin + @test parent(root, doc) === doc + @test parent(a, doc) === root + @test parent(a1, doc) === a + @test parent(c1a, doc) === c1 + @test parent(b, root) === root + @test_throws ErrorException parent(doc, doc) # root has no parent + @test_throws ErrorException parent(Element("x"), doc) # not in tree + end + + @testset "depth" begin + @test depth(doc, doc) == 0 + @test depth(root, doc) == 1 + @test depth(a, doc) == 2 + @test 
depth(a1, doc) == 3 + @test depth(c1a, doc) == 4 + @test depth(b, root) == 1 + @test_throws ErrorException depth(Element("x"), doc) + end + + @testset "siblings" begin + @test siblings(a, doc) == [b, c] + @test siblings(b, doc) == [a, c] + @test siblings(a1, doc) == [a2] + @test siblings(a2, doc) == [a1] + @test isempty(siblings(c1, doc)) + @test_throws ErrorException siblings(doc, doc) # root has no parent + end + + @testset "1-arg parent/depth errors" begin + @test_throws ErrorException parent(a) + @test_throws ErrorException depth(a) + end +end + +#==============================================================================# +# DEPRECATIONS / REMOVED API # +#==============================================================================# +@testset "Deprecations and Removed API" begin + node = Element("test") + node2 = Element("other") + + @testset "XML.next errors" begin + @test_throws ErrorException XML.next(node) + end + + @testset "XML.prev errors" begin + @test_throws ErrorException XML.prev(node) + end + + @testset "XML.nodes_equal errors" begin + @test_throws ErrorException XML.nodes_equal(node, node2) + end + + @testset "XML.escape! errors" begin + @test_throws ErrorException XML.escape!(node) + @test_throws ErrorException XML.escape!(node, false) + end + + @testset "XML.unescape! 
errors" begin + @test_throws ErrorException XML.unescape!(node) + @test_throws ErrorException XML.unescape!(node, false) + end + + @testset "XML.Raw errors" begin + @test_throws ErrorException XML.Raw() + @test_throws ErrorException XML.Raw("arg") + end + + @testset "simplevalue binding redirects to simple_value" begin + el = Element("x", "val") + @test XML.simplevalue(el) == simple_value(el) + end +end + +#==============================================================================# +# XPATH # +#==============================================================================# +@testset "XPath" begin + doc = parse(""" + + Alice + Bob + Carol + + dark + """, Node) + + @testset "absolute path" begin + results = xpath(doc, "/root/users/user") + @test length(results) == 3 + @test all(n -> tag(n) == "user", results) + end + + @testset "single child" begin + results = xpath(doc, "/root/settings/theme") + @test length(results) == 1 + @test tag(results[1]) == "theme" + end + + @testset "positional predicate [n]" begin + results = xpath(doc, "/root/users/user[1]") + @test length(results) == 1 + @test results[1]["id"] == "1" + + results = xpath(doc, "/root/users/user[3]") + @test length(results) == 1 + @test results[1]["id"] == "3" + end + + @testset "[last()]" begin + results = xpath(doc, "/root/users/user[last()]") + @test length(results) == 1 + @test results[1]["id"] == "3" + end + + @testset "out of bounds predicate" begin + results = xpath(doc, "/root/users/user[99]") + @test isempty(results) + end + + @testset "has-attribute predicate [@attr]" begin + results = xpath(doc, "/root/users/user[@role]") + @test length(results) == 3 + end + + @testset "attribute-value predicate [@attr='v']" begin + results = xpath(doc, "/root/users/user[@role='admin']") + @test length(results) == 2 + ids = sort([n["id"] for n in results]) + @test ids == ["1", "3"] + end + + @testset "attribute-value with double quotes" begin + results = xpath(doc, """/root/users/user[@id="2"]""") + @test 
length(results) == 1 + @test results[1]["id"] == "2" + end + + @testset "descendant //" begin + results = xpath(doc, "//name") + @test length(results) == 3 + @test all(n -> tag(n) == "name", results) + end + + @testset "// with predicate" begin + results = xpath(doc, "//user[@role='admin']/name") + @test length(results) == 2 + end + + @testset "wildcard *" begin + results = xpath(doc, "/root/*") + @test length(results) == 2 + @test Set(tag.(results)) == Set(["users", "settings"]) + end + + @testset "text()" begin + results = xpath(doc, "/root/settings/theme/text()") + @test length(results) == 1 + @test value(results[1]) == "dark" + end + + @testset "node()" begin + results = xpath(doc, "/root/users/user[1]/node()") + @test length(results) >= 1 + end + + @testset "attribute selection @attr" begin + results = xpath(doc, "//user/@id") + @test length(results) == 3 + vals = sort([value(n) for n in results]) + @test vals == ["1", "2", "3"] + end + + @testset "self ." begin + results = xpath(doc, ".") + @test length(results) == 1 + @test results[1] === doc + end + + @testset "no match returns empty" begin + @test isempty(xpath(doc, "/root/nonexistent")) + @test isempty(xpath(doc, "//nonexistent")) + end + + @testset "empty expression" begin + @test isempty(xpath(doc, "")) + end + + @testset "deep // with path" begin + results = xpath(doc, "//theme/text()") + @test length(results) == 1 + @test value(results[1]) == "dark" + end + + @testset "error: unterminated predicate" begin + @test_throws ErrorException xpath(doc, "/root/user[1") + end + + @testset "error: unsupported predicate" begin + @test_throws ErrorException xpath(doc, "/root/user[position()>1]") + end + + @testset "self-closing elements" begin + doc2 = parse("", Node) + @test length(xpath(doc2, "/root/*")) == 3 + end + + @testset "relative path" begin + root = xpath(doc, "/root")[1] + results = xpath(root, "users/user") + @test length(results) == 3 + end + + @testset ".. 
parent navigation" begin + # /root/users/user[1]/.. goes back to + results = xpath(doc, "/root/users/user[1]/..") + @test length(results) == 1 + @test tag(results[1]) == "users" + end + + @testset ".. in mid-path" begin + # /root/users/.. should go back to root + results = xpath(doc, "/root/users/..") + @test length(results) == 1 + @test tag(results[1]) == "root" + end + + @testset "// mid-path" begin + # /root//name finds all elements anywhere under root + results = xpath(doc, "/root//name") + @test length(results) == 3 + @test all(n -> tag(n) == "name", results) + end + + @testset "// with wildcard //*" begin + doc2 = parse("", Node) + results = xpath(doc2, "//*") + tags = [tag(n) for n in results if nodetype(n) === Element] + @test "r" in tags + @test "a" in tags + @test "b" in tags + @test "c" in tags + end + + @testset "// with text()" begin + results = xpath(doc, "//text()") + @test length(results) >= 3 # at least Alice, Bob, Carol + vals = [value(n) for n in results] + @test "Alice" in vals + @test "Bob" in vals + @test "dark" in vals + end + + @testset "multiple // segments" begin + results = xpath(doc, "//users//name") + @test length(results) == 3 + @test all(n -> tag(n) == "name", results) + end + + @testset "chained predicates" begin + results = xpath(doc, "/root/users/user[@role='admin'][1]") + @test length(results) == 1 + @test results[1]["id"] == "1" + end + + @testset "@attr with no match" begin + results = xpath(doc, "//user/@nonexistent") + @test isempty(results) + end + + @testset "namespaced tag" begin + doc2 = parse("""val""", Node) + results = xpath(doc2, "/root/ns:item") + @test length(results) == 1 + @test tag(results[1]) == "ns:item" + end + + @testset "whitespace in expression" begin + results = xpath(doc, " / root / users / user ") + @test length(results) == 3 + end + + @testset "error: empty @" begin + @test_throws ErrorException xpath(doc, "/root/@") + end + + @testset "error: unknown function" begin + @test_throws ErrorException 
xpath(doc, "/root/foo()") + end + + @testset "error: unexpected character" begin + @test_throws ErrorException xpath(doc, "/root/!bad") + end + + @testset "deep nesting" begin + doc2 = parse("deep", Node) + results = xpath(doc2, "//e/text()") + @test length(results) == 1 + @test value(results[1]) == "deep" + end + + @testset "wildcard with predicate" begin + doc2 = parse("""""", Node) + results = xpath(doc2, "/r/*[@x]") + @test length(results) == 2 + end + + @testset "// from non-document node" begin + root = xpath(doc, "/root")[1] + results = xpath(root, "//name") + @test length(results) == 3 + end +end + +#==============================================================================# +# LAZYNODE # +#==============================================================================# +@testset "LazyNode" begin + @testset "parse and nodetype" begin + doc = parse("", LazyNode) + @test nodetype(doc) == Document + + doc2 = parse(LazyNode, "") + @test nodetype(doc2) == Document + end + + @testset "read from IO" begin + xml = """hello""" + doc = read(IOBuffer(xml), LazyNode) + @test nodetype(doc) == Document + end + + @testset "read from file" begin + path = joinpath(@__DIR__, "data", "books.xml") + isfile(path) || return + doc = read(path, LazyNode) + @test nodetype(doc) == Document + @test length(children(doc)) > 0 + end + + @testset "Document children" begin + xml = """""" + doc = parse(xml, LazyNode) + ch = children(doc) + @test length(ch) == 2 + @test nodetype(ch[1]) == Declaration + @test nodetype(ch[2]) == Element + end + + @testset "Document with all prolog node types" begin + xml = """""" + doc = parse(xml, LazyNode) + ch = children(doc) + types = map(nodetype, ch) + @test Declaration in types + @test DTD in types + @test Comment in types + @test ProcessingInstruction in types + @test Element in types + end + + @testset "Element tag" begin + doc = parse("", LazyNode) + @test tag(doc[1]) == "root" + end + + @testset "tag returns nothing for non-element/PI" begin + 
doc = parse("text", LazyNode) + text_node = children(doc[1])[1] + @test nodetype(text_node) == Text + @test tag(text_node) === nothing + end + + @testset "Element attributes" begin + doc = parse("""""", LazyNode) + attrs = attributes(doc[1]) + @test attrs isa Attributes + @test attrs["a"] == "1" + @test attrs["b"] == "2" + end + + @testset "Element with no attributes" begin + doc = parse("", LazyNode) + @test attributes(doc[1]) === nothing + end + + @testset "attributes returns nothing for non-element" begin + doc = parse("text", LazyNode) + @test attributes(children(doc[1])[1]) === nothing + end + + @testset "attributes unescape entity references" begin + doc = parse("""""", LazyNode) + @test doc[1]["a"] == "a&b" + end + + @testset "Declaration attributes" begin + doc = parse("""""", LazyNode) + decl = doc[1] + @test nodetype(decl) == Declaration + attrs = attributes(decl) + @test attrs["version"] == "1.0" + @test attrs["encoding"] == "UTF-8" + end + + @testset "get with default" begin + doc = parse("""""", LazyNode) + el = doc[1] + @test get(el, "a", "nope") == "1" + @test get(el, "b", "nope") == "nope" + end + + @testset "get on non-element returns default" begin + doc = parse("text", LazyNode) + text_node = children(doc[1])[1] + @test get(text_node, "a", "default") == "default" + end + + @testset "getindex with string key" begin + doc = parse("""""", LazyNode) + @test doc[1]["a"] == "1" + @test_throws KeyError doc[1]["nonexistent"] + end + + @testset "haskey" begin + doc = parse("""""", LazyNode) + @test haskey(doc[1], "a") == true + @test haskey(doc[1], "b") == false + end + + @testset "keys" begin + doc = parse("""""", LazyNode) + @test keys(doc[1]) == ["a", "b"] + end + + @testset "keys on element with no attributes" begin + doc = parse("", LazyNode) + @test isempty(keys(doc[1])) + end + + @testset "keys on non-element" begin + doc = parse("text", LazyNode) + @test keys(children(doc[1])[1]) == () + end + + @testset "Text value" begin + doc = parse("hello", 
LazyNode) + ch = children(doc[1]) + @test nodetype(ch[1]) == Text + @test value(ch[1]) == "hello" + end + + @testset "Text value unescapes entities" begin + doc = parse("& < >", LazyNode) + @test value(children(doc[1])[1]) == "& < >" + end + + @testset "Comment value" begin + doc = parse("", LazyNode) + c = children(doc[1])[1] + @test nodetype(c) == Comment + @test value(c) == " a comment " + end + + @testset "CData value" begin + doc = parse("]]>", LazyNode) + cd = children(doc[1])[1] + @test nodetype(cd) == CData + @test value(cd) == "raw " + end + + @testset "DTD value" begin + doc = parse("""""", LazyNode) + dtd = doc[1] + @test nodetype(dtd) == DTD + @test contains(value(dtd), "greeting") + end + + @testset "ProcessingInstruction tag and value" begin + doc = parse("", LazyNode) + pi = doc[1] + @test nodetype(pi) == ProcessingInstruction + @test tag(pi) == "mypi" + @test value(pi) == "some data" + end + + @testset "ProcessingInstruction with no content" begin + doc = parse("", LazyNode) + pi = doc[1] + @test tag(pi) == "target" + @test value(pi) === nothing + end + + @testset "value returns nothing for Element/Document" begin + doc = parse("", LazyNode) + @test value(doc) === nothing + @test value(doc[1]) === nothing + end + + @testset "Element children" begin + doc = parse("", LazyNode) + root = doc[1] + @test length(children(root)) == 3 + @test tag(children(root)[1]) == "a" + @test tag(children(root)[2]) == "b" + @test tag(children(root)[3]) == "c" + end + + @testset "self-closing element has no children" begin + doc = parse("
", LazyNode) + br = children(doc[1])[1] + @test isempty(children(br)) + end + + @testset "non-element children returns empty tuple" begin + doc = parse("text", LazyNode) + text_node = children(doc[1])[1] + @test children(text_node) == () + end + + @testset "nested elements" begin + doc = parse("
deep", LazyNode) + @test tag(doc[1]) == "a" + @test tag(doc[1][1]) == "b" + @test tag(doc[1][1][1]) == "c" + @test simple_value(doc[1][1][1]) == "deep" + end + + @testset "mixed content children" begin + xml = "text" + doc = parse(xml, LazyNode) + ch = children(doc[1]) + types = map(nodetype, ch) + @test Text in types + @test Comment in types + @test CData in types + @test ProcessingInstruction in types + @test Element in types + end + + @testset "integer indexing" begin + doc = parse("", LazyNode) + @test tag(doc[1][1]) == "a" + @test tag(doc[1][2]) == "b" + @test tag(doc[1][3]) == "c" + end + + @testset "colon indexing" begin + doc = parse("", LazyNode) + all = doc[1][:] + @test length(all) == 2 + end + + @testset "lastindex" begin + doc = parse("", LazyNode) + @test tag(doc[1][end]) == "c" + end + + @testset "only" begin + doc = parse("", LazyNode) + @test tag(only(doc[1])) == "only" + end + + @testset "length" begin + doc = parse("", LazyNode) + @test length(doc[1]) == 3 + end + + @testset "is_simple" begin + doc = parse("text", LazyNode) + simple = children(doc[1])[1] + complex = children(doc[1])[2] + @test is_simple(simple) + @test !is_simple(complex) + end + + @testset "is_simple with attributes" begin + doc = parse("""text""", LazyNode) + @test !is_simple(children(doc[1])[1]) + end + + @testset "is_simple with CData child" begin + doc = parse("", LazyNode) + @test is_simple(children(doc[1])[1]) + end + + @testset "is_simple returns false for non-element" begin + doc = parse("text", LazyNode) + @test !is_simple(children(doc[1])[1]) + end + + @testset "simple_value" begin + doc = parse("hello", LazyNode) + @test simple_value(children(doc[1])[1]) == "hello" + end + + @testset "simple_value errors on non-simple" begin + doc = parse("", LazyNode) + @test_throws ErrorException simple_value(children(doc[1])[1]) + end + + @testset "simple_value errors on non-element" begin + doc = parse("text", LazyNode) + @test_throws ErrorException 
simple_value(children(doc[1])[1]) + end + + @testset "show Document" begin + doc = parse("", LazyNode) + s = sprint(show, doc) + @test contains(s, "Lazy") + @test contains(s, "Document") + @test contains(s, "1 child") + end + + @testset "show Document multiple children" begin + doc = parse("", LazyNode) + s = sprint(show, doc) + @test contains(s, "2 children") + end + + @testset "show Element" begin + doc = parse("""""", LazyNode) + s = sprint(show, doc[1]) + @test contains(s, "Lazy Element") + @test contains(s, "hello", LazyNode) + s = sprint(show, children(doc[1])[1]) + @test contains(s, "Lazy Text") + @test contains(s, "hello") + end + + @testset "show Comment" begin + doc = parse("", LazyNode) + s = sprint(show, children(doc[1])[1]) + @test contains(s, "Lazy Comment") + @test contains(s, " + + + text content + inner + + + + +""" + doc = parse(xml, LazyNode) + @test nodetype(doc) == Document + + typed = filter(x -> nodetype(x) != Text, children(doc)) + @test nodetype(typed[1]) == Declaration + @test nodetype(typed[2]) == DTD + @test nodetype(typed[3]) == Comment + @test nodetype(typed[4]) == ProcessingInstruction + @test nodetype(typed[5]) == Element + + root = typed[5] + @test tag(root) == "root" + @test root["attr"] == "val" + + inner = children(root) + inner_types = map(nodetype, inner) + @test Text in inner_types + @test Element in inner_types + @test CData in inner_types + @test Comment in inner_types + @test ProcessingInstruction in inner_types + + child_els = filter(x -> nodetype(x) == Element, inner) + @test length(child_els) == 2 + @test tag(child_els[1]) == "child" + @test simple_value(child_els[1]) == "inner" + @test tag(child_els[2]) == "empty" + end end +include("test_pugixml.jl") +include("test_libexpat.jl") +include("test_w3c.jl") diff --git a/test/test_libexpat.jl b/test/test_libexpat.jl new file mode 100644 index 0000000..9ac8955 --- /dev/null +++ b/test/test_libexpat.jl @@ -0,0 +1,389 @@ +# Test cases inspired by libexpat 
(https://github.com/libexpat/libexpat, MIT license) +# Translated from expat/tests/basic_tests.c + +using XML +using XML: Node, nodetype, Document, Element, Comment, CData, ProcessingInstruction, Text, Declaration, DTD +using XML: tag, value, children, attributes, simple_value +using Test + +@testset "libexpat-inspired" begin + + #==========================================================================# + # Character References # + #==========================================================================# + @testset "Decimal character references" begin + doc = parse("éè", Node) + @test simple_value(children(doc)[1]) == "éè" + end + + @testset "Hex character references" begin + doc = parse("éè", Node) + @test simple_value(children(doc)[1]) == "éè" + end + + @testset "Mixed char refs and text" begin + doc = parse("abcdef", Node) + @test simple_value(children(doc)[1]) == "abcdef" + end + + @testset "Large Unicode code points" begin + # CJK Unified Ideograph + doc = parse("世界", Node) + @test simple_value(children(doc)[1]) == "世界" + end + + #==========================================================================# + # UTF-8 Content # + #==========================================================================# + @testset "UTF-8 BOM" begin + bom = "\xef\xbb\xbf" + doc = parse(bom * "", Node) + @test nodetype(doc) == Document + end + + @testset "UTF-8 element content" begin + doc = parse("Ünïcödé", Node) + @test simple_value(children(doc)[1]) == "Ünïcödé" + end + + @testset "UTF-8 in attribute values" begin + doc = parse("", Node) + @test children(doc)[1]["attr"] == "café" + end + + @testset "UTF-8 element names" begin + # XML.jl tokenizer does not yet support non-ASCII characters in element names + @test_broken try + parse("", Node) + true + catch + false + end + end + + @testset "Multi-byte UTF-8 sequences" begin + # 2-byte: ñ (U+00F1) + doc = parse("ñ", Node) + @test simple_value(children(doc)[1]) == "ñ" + + # 3-byte: 世 (U+4E16) + doc = parse("", Node) + @test 
simple_value(children(doc)[1]) == "世" + + # 4-byte: 𤭢 (U+24B62) + doc = parse("𤭢", Node) + @test simple_value(children(doc)[1]) == "𤭢" + end + + #==========================================================================# + # CDATA # + #==========================================================================# + @testset "Basic CDATA" begin + doc = parse("Hello!]]>", Node) + root = children(doc)[1] + cdata = filter(x -> nodetype(x) == CData, children(root)) + @test length(cdata) == 1 + @test value(cdata[1]) == "Hello!" + end + + @testset "CDATA with special characters" begin + doc = parse("\"']]>", Node) + root = children(doc)[1] + cdata = filter(x -> nodetype(x) == CData, children(root)) + @test value(cdata[1]) == "&<>\"'" + end + + @testset "Multiple CDATA sections" begin + doc = parse("", Node) + root = children(doc)[1] + cdata = filter(x -> nodetype(x) == CData, children(root)) + @test length(cdata) == 2 + @test value(cdata[1]) == "first" + @test value(cdata[2]) == "second" + end + + @testset "CDATA containing ]]" begin + # ]] without > is valid inside CDATA + doc = parse("", Node) + root = children(doc)[1] + cdata = filter(x -> nodetype(x) == CData, children(root)) + @test value(cdata[1]) == "data]]with]]brackets" + end + + @testset "CDATA errors" begin + @test_throws Exception parse("", Node) + decls = filter(x -> nodetype(x) == Declaration, children(doc)) + @test length(decls) == 1 + @test decls[1]["version"] == "1.0" + end + + @testset "XML declaration with encoding" begin + doc = parse("", Node) + decls = filter(x -> nodetype(x) == Declaration, children(doc)) + @test decls[1]["encoding"] == "UTF-8" + end + + @testset "XML declaration with standalone" begin + doc = parse("", Node) + decls = filter(x -> nodetype(x) == Declaration, children(doc)) + @test decls[1]["standalone"] == "yes" + end + + @testset "Full XML declaration" begin + doc = parse("", Node) + decls = filter(x -> nodetype(x) == Declaration, children(doc)) + @test decls[1]["version"] == "1.0" + 
@test decls[1]["encoding"] == "UTF-8" + @test decls[1]["standalone"] == "no" + end + + #==========================================================================# + # Processing Instructions # + #==========================================================================# + @testset "Processing instructions" begin + doc = parse("", Node) + pis = filter(x -> nodetype(x) == ProcessingInstruction, children(doc)) + @test length(pis) == 1 + + doc = parse("", Node) + root = children(doc)[1] + pis = filter(x -> nodetype(x) == ProcessingInstruction, children(root)) + @test length(pis) == 1 + end + + @testset "PI with no data" begin + doc = parse("", Node) + pis = filter(x -> nodetype(x) == ProcessingInstruction, children(doc)) + @test length(pis) == 1 + end + + #==========================================================================# + # Comments # + #==========================================================================# + @testset "Comments in various positions" begin + # In prolog + doc = parse("", Node) + comments = filter(x -> nodetype(x) == Comment, children(doc)) + @test length(comments) == 1 + + # Inside element + doc = parse("", Node) + root = children(doc)[1] + comments = filter(x -> nodetype(x) == Comment, children(root)) + @test length(comments) == 1 + + # After root element + doc = parse("", Node) + comments = filter(x -> nodetype(x) == Comment, children(doc)) + @test length(comments) == 1 + end + + @testset "Comment with special content" begin + doc = parse("", Node) + root = children(doc)[1] + comments = filter(x -> nodetype(x) == Comment, children(root)) + @test contains(value(comments[1]), "") + @test contains(value(comments[1]), "¬-entity;") + end + + #==========================================================================# + # DTD / DOCTYPE # + #==========================================================================# + @testset "DOCTYPE with internal subset" begin + xml = """ + +]> +text""" + doc = parse(xml, Node) + @test nodetype(doc) == 
Document + dtd_nodes = filter(x -> nodetype(x) == DTD, children(doc)) + @test length(dtd_nodes) == 1 + root = filter(x -> nodetype(x) == Element, children(doc))[1] + @test tag(root) == "doc" + @test root["attr"] == "value" + text_nodes = filter(x -> nodetype(x) == Text, children(root)) + @test length(text_nodes) == 1 + @test value(text_nodes[1]) == "text" + end + + @testset "DOCTYPE with SYSTEM" begin + doc = parse("", Node) + dtd_nodes = filter(x -> nodetype(x) == DTD, children(doc)) + @test length(dtd_nodes) == 1 + end + + @testset "DOCTYPE with PUBLIC" begin + doc = parse("""""", Node) + dtd_nodes = filter(x -> nodetype(x) == DTD, children(doc)) + @test length(dtd_nodes) == 1 + end + + #==========================================================================# + # Entity Handling # + #==========================================================================# + @testset "Predefined entities" begin + doc = parse("<>&'"", Node) + @test simple_value(children(doc)[1]) == "<>&'\"" + end + + @testset "Entities in attribute values" begin + doc = parse("", Node) + @test children(doc)[1]["attr"] == "" + end + + @testset "Mixed entities and text" begin + doc = parse("Hello & welcome <user>", Node) + @test simple_value(children(doc)[1]) == "Hello & welcome " + end + + #==========================================================================# + # Attribute Edge Cases # + #==========================================================================# + @testset "Empty attribute value" begin + doc = parse("", Node) + @test children(doc)[1]["attr"] == "" + + doc = parse("""""", Node) + @test children(doc)[1]["attr"] == "" + end + + @testset "Attribute with entities" begin + doc = parse("", Node) + @test children(doc)[1]["attr"] == "a&b" + end + + @testset "Multiple attributes" begin + doc = parse("""""", Node) + el = children(doc)[1] + @test el["a"] == "1" + @test el["b"] == "2" + @test el["c"] == "3" + @test el["d"] == "4" + @test el["e"] == "5" + end + + @testset "Attribute 
error: duplicate" begin + @test_throws Exception parse("""""", Node) + end + + #==========================================================================# + # Nesting & Structure # + #==========================================================================# + @testset "Deeply nested elements" begin + xml = "deep" + doc = parse(xml, Node) + @test nodetype(doc) == Document + end + + @testset "Many sibling elements" begin + items = join(["$i" for i in 1:100]) + xml = "$items" + doc = parse(xml, Node) + root = children(doc)[1] + els = filter(x -> nodetype(x) == Element, children(root)) + @test length(els) == 100 + @test simple_value(els[1]) == "1" + @test simple_value(els[100]) == "100" + end + + @testset "Mismatched tags" begin + @test_throws Exception parse("", Node) + @test_throws Exception parse("", Node) + @test_throws Exception parse("
", Node) + end + + @testset "Unclosed elements" begin + @test_throws Exception parse("", Node) + @test_throws Exception parse("text", Node) + end + + #==========================================================================# + # Line Endings # + #==========================================================================# + @testset "Various line endings in content" begin + # CR, LF, CRLF should all work + doc = parse("line1\nline2", Node) + @test nodetype(doc) == Document + + doc = parse("line1\rline2", Node) + @test nodetype(doc) == Document + + doc = parse("line1\r\nline2", Node) + @test nodetype(doc) == Document + end + + #==========================================================================# + # Empty Document Parts # + #==========================================================================# + @testset "Empty root element" begin + doc = parse("", Node) + root = children(doc)[1] + @test tag(root) == "doc" + @test isempty(filter(x -> nodetype(x) == Element, children(root))) + end + + @testset "Element with only whitespace" begin + doc = parse(" \n\t ", Node) + @test nodetype(doc) == Document + end + + @testset "Element with only comments" begin + doc = parse("", Node) + root = children(doc)[1] + els = filter(x -> nodetype(x) == Element, children(root)) + @test isempty(els) + comments = filter(x -> nodetype(x) == Comment, children(root)) + @test length(comments) == 2 + end + + #==========================================================================# + # Namespace-like Attributes # + #==========================================================================# + @testset "xmlns declarations" begin + doc = parse("""""", Node) + root = children(doc)[1] + @test root["xmlns"] == "http://example.com" + @test root["xmlns:ns"] == "http://example.com/ns" + els = filter(x -> nodetype(x) == Element, children(root)) + @test tag(els[1]) == "ns:child" + end + + @testset "Namespaced attributes" begin + doc = parse("""""", Node) + root = children(doc)[1] + @test 
root["xml:lang"] == "en" + @test root["xml:space"] == "preserve" + end + + #==========================================================================# + # Large Content # + #==========================================================================# + @testset "Long attribute value" begin + long_val = repeat("x", 10_000) + doc = parse("", Node) + @test children(doc)[1]["attr"] == long_val + end + + @testset "Long text content" begin + long_text = repeat("Hello World! ", 1000) + doc = parse("$long_text", Node) + @test simple_value(children(doc)[1]) == long_text + end + + @testset "Long CDATA" begin + long_cdata = repeat("data<>& ", 1000) + doc = parse("", Node) + root = children(doc)[1] + cdata = filter(x -> nodetype(x) == CData, children(root)) + @test value(cdata[1]) == long_cdata + end +end diff --git a/test/test_libxml2_testcases.jl b/test/test_libxml2_testcases.jl new file mode 100644 index 0000000..0b8a89a --- /dev/null +++ b/test/test_libxml2_testcases.jl @@ -0,0 +1,1578 @@ +# Test cases borrowed from the libxml2 test suite (https://github.com/GNOME/libxml2). +# +# libxml2 is Copyright (C) the GNOME Project and contributors, licensed under the MIT License. +# These test cases are adapted for the XML.jl Julia package. 
+# +# Categories mirror the libxml2 test/ directory structure: +# - CDATA handling +# - Comments +# - Processing instructions +# - Attributes (normalization, entities, quoting) +# - Namespaces +# - DTD / internal subset +# - Entity references (character refs, predefined, internal general) +# - Whitespace / blank handling +# - Well-formedness (boundaries, big names, mixed content) +# - Error cases (must fail to parse) + +using XML +using XML: Document, Element, Declaration, Comment, CData, DTD, ProcessingInstruction, Text +using XML: escape, unescape +using Test + +@testset "libxml2 test cases" begin + +#==============================================================================# +# CDATA SECTIONS # +# From: test/cdata, test/cdata2, test/adjacent-cdata.xml, # +# test/emptycdata.xml, test/cdata-*-byte-UTF-8.xml # +#==============================================================================# +@testset "CDATA" begin + @testset "cdata: basic CDATA with markup characters" begin + # libxml2 test/cdata + xml = """\nHello, world!]]>\n""" + doc = parse(xml, Node) + root = doc[1] + cdata_nodes = filter(x -> nodetype(x) == CData, children(root)) + @test length(cdata_nodes) >= 1 + @test value(cdata_nodes[1]) == "Hello, world!" 
+ end + + @testset "cdata2: nested CDATA-like content" begin + # libxml2 test/cdata2 - tests ]]> escaping pattern + xml = """ + + ]> +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "collection" + end + + @testset "adjacent-cdata: three adjacent CDATA sections" begin + # libxml2 test/adjacent-cdata.xml + xml = "" + doc = parse(xml, Node) + root = doc[1] + cdata_nodes = filter(x -> nodetype(x) == CData, children(root)) + @test length(cdata_nodes) == 3 + @test value(cdata_nodes[1]) == "abc" + @test value(cdata_nodes[2]) == "def" + @test value(cdata_nodes[3]) == "ghi" + end + + @testset "emptycdata: empty CDATA section in namespaced doc" begin + # libxml2 test/emptycdata.xml + xml = """ + + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "html" + cdata_nodes = filter(x -> nodetype(x) == CData, children(root)) + @test length(cdata_nodes) >= 1 + @test value(cdata_nodes[1]) == "" + end + + @testset "cdata-2-byte-UTF-8: two-byte chars across buffer boundary" begin + # libxml2 test/cdata-2-byte-UTF-8.xml - tests Č (U+010C, 2 bytes in UTF-8) + long_c = repeat("Č", 400) + xml = """\n\n

\n
""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + p = first(filter(x -> nodetype(x) == Element, children(root))) + cdata = first(filter(x -> nodetype(x) == CData, children(p))) + @test value(cdata) == long_c + end + + @testset "cdata-3-byte-UTF-8: three-byte chars across buffer boundary" begin + # libxml2 test/cdata-3-byte-UTF-8.xml - tests 牛 (U+725B, 3 bytes in UTF-8) + long_cow = repeat("牛", 400) + xml = """\n\n

\n
""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + p = first(filter(x -> nodetype(x) == Element, children(root))) + cdata = first(filter(x -> nodetype(x) == CData, children(p))) + @test value(cdata) == long_cow + end + + @testset "cdata-4-byte-UTF-8: four-byte chars across buffer boundary" begin + # libxml2 test/cdata-4-byte-UTF-8.xml - tests 🍦 (U+1F366, 4 bytes in UTF-8) + long_ice = repeat("🍦", 334) + xml = """\n\n

\n
""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + p = first(filter(x -> nodetype(x) == Element, children(root))) + cdata = first(filter(x -> nodetype(x) == CData, children(p))) + @test value(cdata) == long_ice + end +end + +#==============================================================================# +# COMMENTS # +# From: test/comment.xml through test/comment6.xml, test/badcomment.xml # +#==============================================================================# +@testset "Comments" begin + @testset "comment: comments inside element" begin + # libxml2 test/comment.xml + xml = """ + + + + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + comments = filter(x -> nodetype(x) == Comment, children(root)) + @test length(comments) == 2 + @test contains(value(comments[1]), "document start") + @test contains(value(comments[2]), "document end") + end + + @testset "comment2: comments outside root element" begin + # libxml2 test/comment2.xml + xml = """ + + + + +""" + doc = parse(xml, Node) + top_comments = filter(x -> nodetype(x) == Comment, children(doc)) + @test length(top_comments) == 2 + @test contains(value(top_comments[1]), "document start") + @test contains(value(top_comments[2]), "document end") + end + + @testset "comment3: very long comment (buffer boundary test)" begin + # libxml2 test/comment3.xml - 150+ lines of repeated digits + lines = join([repeat("01234567890123456789012345678901234567890123456789", 1) for _ in 1:150], "\n") + comment_text = " test of very very long comments and buffer limits\n" * lines * "\n" + xml = """\n\n""" + doc = parse(xml, Node) + comments = filter(x -> nodetype(x) == Comment, children(doc)) + @test length(comments) >= 1 + @test length(value(comments[1])) > 7000 + end + + @testset "comment5: hyphens and line breaks in comments" begin + # libxml2 test/comment5.xml + xml = """ + +""" + doc = parse(xml, Node) + comments = filter(x -> 
nodetype(x) == Comment, children(doc)) + @test length(comments) == 1 + @test contains(value(comments[1]), "hyphen") + @test contains(value(comments[1]), "- - -") + end + + @testset "comment6: comment before DOCTYPE" begin + # libxml2 test/comment6.xml + xml = """ + +]> +
""" + doc = parse(xml, Node) + typed = filter(x -> nodetype(x) != Text, children(doc)) + @test nodetype(typed[1]) == Comment + @test nodetype(typed[2]) == DTD + @test nodetype(typed[3]) == Element + end + + @testset "badcomment: comment with markup-like content" begin + # libxml2 test/badcomment.xml - note: libxml2 considers this valid XML + xml = """ + + + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "foo" + comments = filter(x -> nodetype(x) == Comment, children(root)) + @test length(comments) >= 1 + end + + @testset "comment4: non-ASCII characters in comments" begin + # libxml2 test/comment4.xml (adapted from ISO-8859-1 to UTF-8) + xml = """ + + + +""" + doc = parse(xml, Node) + comments = filter(x -> nodetype(x) == Comment, children(doc)) + @test length(comments) == 3 + @test contains(value(comments[1]), "là") + @test contains(value(comments[2]), "à") + end +end + +#==============================================================================# +# PROCESSING INSTRUCTIONS # +# From: test/pi.xml, test/pi2.xml # +#==============================================================================# +@testset "Processing Instructions" begin + @testset "pi: PIs inside root element" begin + # libxml2 test/pi.xml + xml = """ + + + + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + pis = filter(x -> nodetype(x) == ProcessingInstruction, children(root)) + @test length(pis) == 2 + @test tag(pis[1]) == "document-start" + @test value(pis[1]) == "doc" + @test tag(pis[2]) == "document-end" + @test value(pis[2]) == "doc" + end + + @testset "pi2: PIs outside root element" begin + # libxml2 test/pi2.xml + xml = """ + + + + +""" + doc = parse(xml, Node) + top_pis = filter(x -> nodetype(x) == ProcessingInstruction, children(doc)) + @test length(top_pis) == 2 + @test tag(top_pis[1]) == "document-start" + @test tag(top_pis[2]) == "document-end" + end +end + 
+#==============================================================================# +# ATTRIBUTES # +# From: test/att1 through test/att11, test/attrib.xml, # +# test/def-xml-attr.xml, test/defattr.xml # +#==============================================================================# +@testset "Attributes" begin + @testset "att1: attribute with newlines (whitespace normalization)" begin + # libxml2 test/att1 + xml = "" + doc = parse(xml, Node) + @test tag(doc[1]) == "doc" + @test haskey(doc[1], "attr") + end + + @testset "att2: attribute with multiple spaces" begin + # libxml2 test/att2 + xml = """""" + doc = parse(xml, Node) + @test doc[1]["attr"] == "to normalize with a space" + end + + @testset "att3: attribute with character references" begin + # libxml2 test/att3 + xml = """""" + doc = parse(xml, Node) + @test tag(doc[1]) == "select" + @test haskey(doc[1], "onclick") + end + + @testset "att4: complex document with many attributes" begin + # Adapted from libxml2 test/att4 (electroxml document) + xml = """ + + + + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "electroxml" + @test root["modified"] == "20021216T072726" + end + + @testset "attrib: attribute with entities and char refs" begin + # libxml2 test/attrib.xml + xml = """""" + doc = parse(xml, Node) + @test tag(doc[1]) == "item" + @test doc[1]["url"] == "http://example.com/" + @test doc[1]["visits"] == "1" + end + + @testset "att5: attribute with empty value" begin + # Adapted from libxml2 test/att5 + xml = """ +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test root["a"] == "" + @test root["b"] == "val" + end + + @testset "att9: attribute with single quotes in double-quoted value" begin + # libxml2 test/att9 pattern + xml = """""" + doc = parse(xml, Node) + @test doc[1]["attr"] == "it's a test" + end + + @testset "att10: attribute with double quotes in single-quoted value" begin + xml 
= """""" + doc = parse(xml, Node) + @test doc[1]["attr"] == "he said \"hello\"" + end + + @testset "att11: attribute values with entity refs" begin + xml = """""" + doc = parse(xml, Node) + @test doc[1]["a"] == "" + @test doc[1]["b"] == "a&b" + end + + @testset "def-xml-attr: xml:lang default attribute in DTD" begin + # libxml2 test/def-xml-attr.xml (just verify parsing doesn't fail) + xml = """ + + +]> + + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "root" + end +end + +#==============================================================================# +# NAMESPACES # +# From: test/ns through test/ns7, test/namespaces/err_*.xml, # +# test/nsclean.xml, test/entity-in-ns-uri.xml # +#==============================================================================# +@testset "Namespaces" begin + @testset "ns: namespace with prefix on element and attribute" begin + # libxml2 test/ns + xml = """ + + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "dia:diagram" + @test root["xmlns:dia"] == "http://www.lysator.liu.se/~alla/dia/" + child = first(filter(x -> nodetype(x) == Element, children(root))) + @test tag(child) == "dia:diagramdata" + @test child["dia:testattr"] == "test" + end + + @testset "ns2: namespace on self-closing element" begin + # libxml2 test/ns2 + xml = """ +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "dia:diagram" + @test root["dia:testattr"] == "test" + end + + @testset "ns3: xmlns declared after prefixed attribute" begin + # libxml2 test/ns3 + xml = """ +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test root["dia:testattr"] == "test" + @test root["xmlns:dia"] == "http://www.lysator.liu.se/~alla/dia/" + end + + @testset "ns4: xml:lang, xml:link, xml:space built-in attributes" begin + # libxml2 
test/ns4 + xml = """ +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test root["xml:lang"] == "en" + @test root["xml:space"] == "preserve" + end + + @testset "ns5: default namespace on element with prefix on another" begin + # libxml2 test/ns5 + xml = """ + +""" + doc = parse(xml, Node) + root = doc[1] + @test root["xmlns"] == "http://example.org/ns/1" + @test root["xmlns:rng"] == "http://example.org/ns/1" + @test root["name"] == "foo" + end + + @testset "ns6: default namespace on child, not on sibling" begin + # libxml2 test/ns6 + xml = """ + + +""" + doc = parse(xml, Node) + root = doc[1] + elements = filter(x -> nodetype(x) == Element, children(root)) + @test tag(elements[1]) == "foo" + @test elements[1]["xmlns"] == "http://abc" + @test tag(elements[2]) == "bar" + end + + @testset "ns7: xml: prefix element (built-in)" begin + # libxml2 test/ns7 + xml = "" + doc = parse(xml, Node) + @test tag(doc[1]) == "xml:test" + end + + @testset "multiple namespace prefixes" begin + xml = """ + + +""" + doc = parse(xml, Node) + root = doc[1] + elements = filter(x -> nodetype(x) == Element, children(root)) + @test tag(elements[1]) == "a:child" + @test elements[1]["a:attr"] == "1" + @test tag(elements[2]) == "b:child" + @test elements[2]["b:attr"] == "2" + end + + @testset "namespace redeclaration on nested element" begin + xml = """ + + + +""" + doc = parse(xml, Node) + root = doc[1] + child = first(filter(x -> nodetype(x) == Element, children(root))) + @test child["xmlns:a"] == "http://second.com" + end +end + +#==============================================================================# +# DTD / INTERNAL SUBSET # +# From: test/dtd1 through test/dtd13, test/intsubset.xml, # +# test/intsubset2.xml # +#==============================================================================# +@testset "DTD / Internal Subset" begin + @testset "dtd1: DOCTYPE with PUBLIC id" begin + # libxml2 test/dtd1 + xml = """ + + +""" + doc = 
parse(xml, Node) + dtd = first(filter(x -> nodetype(x) == DTD, children(doc))) + @test contains(value(dtd), "MEMO") + @test contains(value(dtd), "PUBLIC") + end + + @testset "dtd2: simple internal subset with ELEMENT declaration" begin + # libxml2 test/dtd2 + xml = """ +]> +This is a valid document !""" + doc = parse(xml, Node) + dtd = first(filter(x -> nodetype(x) == DTD, children(doc))) + @test contains(value(dtd), "ELEMENT") + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test simple_value(root) == "This is a valid document !" + end + + @testset "dtd3: ANY content model" begin + # libxml2 test/dtd3 + xml = """ +]> +This is a valid document !""" + doc = parse(xml, Node) + dtd = first(filter(x -> nodetype(x) == DTD, children(doc))) + @test contains(value(dtd), "ANY") + end + + @testset "dtd4: EMPTY content model" begin + # libxml2 test/dtd4 + xml = """ +]> +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "doc" + @test length(children(root)) == 0 + end + + @testset "dtd5: mixed content model" begin + # libxml2 test/dtd5 + xml = """ + + +]> +This is a valid document
""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "doc" + elements = filter(x -> nodetype(x) == Element, children(root)) + @test length(elements) == 2 + @test tag(elements[1]) == "a" + @test tag(elements[2]) == "b" + end + + @testset "dtd6: choice content model" begin + # libxml2 test/dtd6 + xml = """ + + +]> +This is a valid document""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + elements = filter(x -> nodetype(x) == Element, children(root)) + @test length(elements) == 3 + end + + @testset "dtd7: sequence content model" begin + # libxml2 test/dtd7 + xml = """ + + +]> +This is a valid document""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + elements = filter(x -> nodetype(x) == Element, children(root)) + @test length(elements) == 2 + @test tag(elements[1]) == "a" + @test tag(elements[2]) == "b" + end + + @testset "dtd8: nested choice and sequence" begin + # libxml2 test/dtd8 + xml = """ + + + + +]> +This is a valid document""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + elements = filter(x -> nodetype(x) == Element, children(root)) + @test tag(elements[1]) == "b" + @test tag(elements[2]) == "c" + end + + @testset "dtd9: optional content model" begin + # libxml2 test/dtd9 + xml = """ + + + + +]> +This is a valid document""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + elements = filter(x -> nodetype(x) == Element, children(root)) + @test length(elements) == 2 + end + + @testset "dtd10: mixed repetition content model" begin + # libxml2 test/dtd10 + xml = """ + + + + +]> +This is a valid document""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + elements = filter(x -> nodetype(x) == Element, children(root)) + @test length(elements) == 3 + end + + @testset 
"dtd11: ATTLIST with CDATA #IMPLIED" begin + # libxml2 test/dtd11 + xml = """ + +]> +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test root["val"] == "v1" + end + + @testset "dtd12: nested entity references" begin + # libxml2 test/dtd12 - entity referencing another entity + xml = """ + +]> +&WhatHeSaid;""" + # This may or may not expand depending on XML.jl's entity handling + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "doc" + end + + @testset "dtd13: comments before and after DOCTYPE" begin + # libxml2 test/dtd13 + xml = """ + +]> + +""" + doc = parse(xml, Node) + typed = filter(x -> nodetype(x) != Text, children(doc)) + @test nodetype(typed[1]) == Comment + @test nodetype(typed[2]) == DTD + @test nodetype(typed[3]) == Comment + @test nodetype(typed[4]) == Element + end + + @testset "intsubset: internal subset with comment containing quote" begin + # libxml2 test/intsubset.xml + xml = """ + + +]> +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "root" + dtd = first(filter(x -> nodetype(x) == DTD, children(doc))) + @test contains(value(dtd), "ELEMENT") + end +end + +#==============================================================================# +# ENTITY REFERENCES # +# From: test/ent1 through test/ent11, test/ent6hex # +#==============================================================================# +@testset "Entity References" begin + @testset "ent1: internal general entity declaration and use" begin + # libxml2 test/ent1 + xml = """ + +]> + + &xml; +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "EXAMPLE" + end + + @testset "ent3: entity refs in attribute values" begin + # libxml2 test/ent3 + xml = """ + +]> + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) 
+ @test root["prop1"] == "a&b" + @test root["prop2"] == "c + + This is an inverted exclamation sign ¡ + This is a space +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + text = join([value(c) for c in children(root) if nodetype(c) == Text]) + @test contains(text, "\u00A1") # ¡ + @test contains(text, " ") # space ( ) + end + + @testset "ent6: predefined entities with double-escaping" begin + # libxml2 test/ent6 + xml = """ + + + + +]> +<""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "doc" + end + + @testset "ent8: multiple entities in one document" begin + # libxml2 test/ent8 + xml = """ + +]> + +&test1;&test2; +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "doc" + end + + @testset "predefined entities in text content" begin + xml = "& < > ' "" + doc = parse(xml, Node) + @test simple_value(doc[1]) == "& < > ' \"" + end + + @testset "predefined entities in attributes" begin + xml = """""" + doc = parse(xml, Node) + @test doc[1]["a"] == "&" + @test doc[1]["b"] == "<" + @test doc[1]["c"] == ">" + @test doc[1]["d"] == "'" + @test doc[1]["e"] == "\"" + end + + @testset "decimal character references" begin + xml = "ABC" + doc = parse(xml, Node) + @test simple_value(doc[1]) == "ABC" + end + + @testset "hexadecimal character references" begin + xml = "ABC" + doc = parse(xml, Node) + @test simple_value(doc[1]) == "ABC" + end + + @testset "mixed hex and decimal char refs" begin + xml = "Hello" + doc = parse(xml, Node) + @test simple_value(doc[1]) == "Hello" + end + + @testset "char ref for non-ASCII: inverted exclamation" begin + xml = "¡" + doc = parse(xml, Node) + @test simple_value(doc[1]) == "\u00A1" + end + + @testset "char ref for CJK character" begin + xml = "" + doc = parse(xml, Node) + @test simple_value(doc[1]) == "中" + end + + @testset "char ref for emoji" begin + xml = "😀" + 
doc = parse(xml, Node) + @test simple_value(doc[1]) == "\U0001F600" + end +end + +#==============================================================================# +# WHITESPACE / BLANK HANDLING # +# From: test/tstblanks.xml, test/title.xml # +#==============================================================================# +@testset "Whitespace / Blank Handling" begin + @testset "title: simple document with encoding" begin + # libxml2 test/title.xml + xml = """ +my title""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "title" + @test simple_value(root) == "my title" + end + + @testset "whitespace preservation in text content" begin + xml = " hello world " + doc = parse(xml, Node) + @test simple_value(doc[1]) == " hello world " + end + + @testset "tab and newline preservation" begin + xml = "\t\n\ttabbed\n" + doc = parse(xml, Node) + @test simple_value(doc[1]) == "\t\n\ttabbed\n" + end + + @testset "whitespace-only text node" begin + xml = " " + doc = parse(xml, Node) + @test simple_value(doc[1]) == " " + end + + @testset "inter-element whitespace preserved" begin + xml = "\n \n \n" + doc = parse(xml, Node) + root = doc[1] + text_nodes = filter(x -> nodetype(x) == Text, children(root)) + @test length(text_nodes) >= 1 + end +end + +#==============================================================================# +# WELL-FORMED DOCUMENTS # +# From: test/boundaries1.xml, test/bigname.xml, test/bigname2.xml, # +# test/slashdot.xml, test/eve.xml, test/wap.xml, etc. 
# +#==============================================================================# +@testset "Well-Formed Documents" begin + @testset "boundaries1: boundary conditions with entities and CDATA" begin + # libxml2 test/boundaries1.xml (simplified - without DTD entity expansion) + xml = """ +"> + '> +]> + + +text + + + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "d" + @test root["a"] == ">" + @test root["b"] == ">" + cdata_nodes = filter(x -> nodetype(x) == CData, children(root)) + @test length(cdata_nodes) == 1 + @test value(cdata_nodes[1]) == "cdata" + end + + @testset "bigname: very long element name" begin + # libxml2 test/bigname.xml - element name with >10000 characters + longname = "this_is_a_very_large_name_" * repeat("0123456789", 500) * "_end" + xml = "<$(longname)/>" + doc = parse(xml, Node) + @test tag(doc[1]) == longname + end + + @testset "slashdot: real-world XML (ultramode feed)" begin + # libxml2 test/slashdot.xml (simplified) + xml = """ + + 100 Mbit/s on Fibre to the home + http://slashdot.org/articles/99/06/06/1440211.shtml + + CmdrTaco + wouldn't-it-be-nice + internet + 20 +
articles
+ topicinternet.jpg +
+ + Gimp 1.2 Preview + http://slashdot.org/articles/99/06/06/1438246.shtml + + CmdrTaco + stuff-to-read + gimp + 12 +
articles
+ topicgimp.gif +
+
""" + doc = parse(xml, Node) + root = doc[1] + @test tag(root) == "ultramode" + stories = filter(x -> nodetype(x) == Element && tag(x) == "story", children(root)) + @test length(stories) == 2 + title1 = first(filter(x -> nodetype(x) == Element && tag(x) == "title", + children(stories[1]))) + @test simple_value(title1) == "100 Mbit/s on Fibre to the home" + end + + @testset "eve: document with external DTD reference and internal entity" begin + # libxml2 test/eve.xml + xml = """ + +]> + +""" + doc = parse(xml, Node) + dtd = first(filter(x -> nodetype(x) == DTD, children(doc))) + @test contains(value(dtd), "PUBLIC") + @test contains(value(dtd), "ENTITY") + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "spec" + end + + @testset "deeply nested document" begin + xml = "
deep" + doc = parse(xml, Node) + @test simple_value(doc[1][1][1][1][1][1][1][1][1][1]) == "deep" + end + + @testset "many sibling elements" begin + items = join(["Item $i" for i in 1:200]) + xml = "$items" + doc = parse(xml, Node) + elements = filter(x -> nodetype(x) == Element, children(doc[1])) + @test length(elements) == 200 + @test elements[1]["n"] == "1" + @test elements[200]["n"] == "200" + end + + @testset "mixed content: text, elements, CDATA, comments, PIs" begin + xml = """ + text before + child text + + + + text after +""" + doc = parse(xml, Node) + root = doc[1] + types = Set(nodetype(c) for c in children(root)) + @test Text in types + @test Element in types + @test Comment in types + @test CData in types + @test ProcessingInstruction in types + end + + @testset "self-closing elements" begin + xml = "

" + doc = parse(xml, Node) + elements = filter(x -> nodetype(x) == Element, children(doc[1])) + @test length(elements) == 3 + @test tag(elements[1]) == "br" + @test tag(elements[2]) == "hr" + @test tag(elements[3]) == "img" + @test all(x -> length(children(x)) == 0, elements) + end + + @testset "empty element: start-tag and end-tag" begin + xml = "" + doc = parse(xml, Node) + el = first(filter(x -> nodetype(x) == Element, children(doc[1]))) + @test tag(el) == "empty" + end + + @testset "element names with hyphens, dots, underscores" begin + xml = "<_private/>" + doc = parse(xml, Node) + @test tag(doc[1]) == "my-root" + elements = filter(x -> nodetype(x) == Element, children(doc[1])) + @test tag(elements[1]) == "sub.element" + @test tag(elements[2]) == "_private" + end + + @testset "element names starting with underscore" begin + xml = "<_root><__child/>" + doc = parse(xml, Node) + @test tag(doc[1]) == "_root" + end + + @testset "numeric element names (with letter prefix)" begin + xml = "

heading

" + doc = parse(xml, Node) + @test tag(doc[1]) == "h1" + @test simple_value(doc[1]) == "heading" + end +end + +#==============================================================================# +# ROUNDTRIP: PARSE → WRITE → PARSE # +# Tests that libxml2-style documents survive roundtrip processing # +#==============================================================================# +@testset "Roundtrip" begin + @testset "roundtrip: namespaced document" begin + xml = """ + + +""" + doc = parse(xml, Node) + s = XML.write(doc) + doc2 = parse(s, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc2))) + @test root["xmlns:dia"] == "http://www.lysator.liu.se/~alla/dia/" + end + + @testset "roundtrip: DTD with internal subset" begin + xml = """ +]> +text""" + doc = parse(xml, Node) + s = XML.write(doc) + doc2 = parse(s, Node) + dtd = first(filter(x -> nodetype(x) == DTD, children(doc2))) + @test contains(value(dtd), "ELEMENT") + end + + @testset "roundtrip: adjacent CDATA sections" begin + xml = "" + doc = parse(xml, Node) + s = XML.write(doc) + doc2 = parse(s, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc2))) + cdata_nodes = filter(x -> nodetype(x) == CData, children(root)) + @test length(cdata_nodes) == 2 + end + + @testset "roundtrip: processing instructions" begin + xml = """ + + +""" + doc = parse(xml, Node) + s = XML.write(doc) + doc2 = parse(s, Node) + pis = filter(x -> nodetype(x) == ProcessingInstruction, children(doc2)) + @test length(pis) == 2 + end + + @testset "roundtrip: comments with special characters" begin + xml = "" + doc = parse(xml, Node) + s = XML.write(doc) + doc2 = parse(s, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc2))) + comments = filter(x -> nodetype(x) == Comment, children(root)) + @test length(comments) == 1 + end + + @testset "roundtrip: entities in attributes" begin + xml = """""" + doc = parse(xml, Node) + s = XML.write(doc) + doc2 = parse(s, Node) + root = first(filter(x -> 
nodetype(x) == Element, children(doc2))) + @test root["a"] == "a&b" + @test root["b"] == "cx", Node) + @test nodetype(doc) == Document + end + + @testset "errors/invalid-start-tag-1: text-only document" begin + # libxml2 test/errors/invalid-start-tag-1.xml + # XML.jl is lenient: treats bare text as a Text node + doc = parse("x", Node) + @test nodetype(doc) == Document + end + + @testset "errors/invalid-start-tag-2: lone <" begin + # libxml2 test/errors/invalid-start-tag-2.xml + @test_throws Exception parse("<", Node) + end + + @testset "errors/doctype1: malformed DOCTYPE" begin + # libxml2 test/errors/doctype1.xml - "[]>" + # XML.jl is lenient: parses the DOCTYPE and treats []> as text + doc = parse("[]>\n", Node) + @test nodetype(doc) == Document + end + + @testset "errors/dup-xml-attr: duplicate xml: attribute" begin + # libxml2 test/errors/dup-xml-attr.xml + @test_throws Exception parse("""""", Node) + end + + @testset "errors/attr5: duplicate attribute" begin + # libxml2 test/errors/attr5.xml + @test_throws Exception parse(""" + +""", Node) + end + + @testset "mismatched tags" begin + @test_throws Exception parse("
", Node) + end + + @testset "overlapping elements" begin + @test_throws Exception parse("", Node) + end + + @testset "unclosed root element" begin + @test_throws Exception parse("", Node) + end + + @testset "close tag without open" begin + @test_throws Exception parse("", Node) + end + + @testset "unclosed comment" begin + @test_throws Exception parse("
" + doc = parse(xml, Node) + comments = filter(x -> nodetype(x) == Comment, children(doc[1])) + @test contains(value(comments[1]), "héllo") + end + + @testset "Unicode in CDATA" begin + xml = "" + doc = parse(xml, Node) + cdata = first(filter(x -> nodetype(x) == CData, children(doc[1]))) + @test value(cdata) == "日本語テスト" + end + + @testset "Unicode in PI content" begin + xml = "" + doc = parse(xml, Node) + pi = first(filter(x -> nodetype(x) == ProcessingInstruction, children(doc[1]))) + @test contains(value(pi), "données") + end + + @testset "UTF-8 BOM handling" begin + # libxml2 test/utf8bom.xml pattern + xml = "\xef\xbb\xbf\n" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "doc" + end +end + +#==============================================================================# +# REAL-WORLD DOCUMENT PATTERNS # +# Patterns commonly tested by libxml2 (DAV, RDF, SOAP, SVG, etc.) # +#==============================================================================# +@testset "Real-World Document Patterns" begin + @testset "WebDAV-like document" begin + # Inspired by libxml2 test/dav* series + xml = """ + + + /container/ + + + Example collection + + + HTTP/1.1 200 OK + + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "D:multistatus" + @test root["xmlns:D"] == "DAV:" + end + + @testset "RDF-like document" begin + # Inspired by libxml2 test/rdf1, test/rdf2 + xml = """ + + + Example Resource + John Doe + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "rdf:RDF" + desc = first(filter(x -> nodetype(x) == Element, children(root))) + @test desc["rdf:about"] == "http://example.org/resource" + end + + @testset "SVG-like document" begin + # Inspired by libxml2 test/svg1, test/svg2, test/svg3 + xml = """ + + + + + + + + + + Hello SVG +""" + doc = parse(xml, Node) + root = 
first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "svg" + @test root["xmlns"] == "http://www.w3.org/2000/svg" + @test root["width"] == "200" + end + + @testset "SOAP-like envelope" begin + xml = """ + + + + New York + US + + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "soap:Envelope" + end + + @testset "Atom feed" begin + xml = """ + + Example Feed + + 2003-12-13T18:30:02Z + + John Doe + + urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6 + + Atom-Powered Robots Run Amok + + urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a + 2003-12-13T18:30:02Z + Some text. + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "feed" + @test root["xmlns"] == "http://www.w3.org/2005/Atom" + end + + @testset "plist-like document" begin + xml = """ + + + + Name + Example + Version + 42 + Enabled + + Tags + + alpha + beta + + +""" + doc = parse(xml, Node) + plist = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(plist) == "plist" + @test plist["version"] == "1.0" + end + + @testset "XHTML with mixed content" begin + xml = """ + + Test + +

This is emphasized and strong text.

+

A link: click here.

+
+
  preformatted  text  
+ +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "html" + @test root["xmlns"] == "http://www.w3.org/1999/xhtml" + end + + @testset "MathML-like document" begin + xml = """ + + x2 + + + y2 + = + z2 + +""" + doc = parse(xml, Node) + root = doc[1] + @test tag(root) == "math" + @test root["xmlns"] == "http://www.w3.org/1998/Math/MathML" + end + + @testset "WML-like document (mobile)" begin + # Inspired by libxml2 test/wml.xml + xml = """ + + + +

Welcome to WML

+
+
""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "wml" + end +end + +#==============================================================================# +# EDGE CASES # +# Additional edge cases inspired by libxml2 test patterns # +#==============================================================================# +@testset "Edge Cases" begin + @testset "CDATA containing ]] not followed by >" begin + xml = "" + doc = parse(xml, Node) + cdata = first(filter(x -> nodetype(x) == CData, children(doc[1]))) + @test value(cdata) == "a]]b" + end + + @testset "comment containing --" begin + # Note: -- inside comments is technically not well-formed per spec, + # but many parsers tolerate single - characters + xml = "" + doc = parse(xml, Node) + comments = filter(x -> nodetype(x) == Comment, children(doc[1])) + @test length(comments) == 1 + end + + @testset "attribute value containing >" begin + xml = """""" + doc = parse(xml, Node) + @test doc[1]["attr"] == "a>b" + end + + @testset "attribute value containing single quote in double quotes" begin + xml = """""" + doc = parse(xml, Node) + @test doc[1]["attr"] == "it's" + end + + @testset "attribute value containing double quote in single quotes" begin + xml = "" + doc = parse(xml, Node) + @test doc[1]["attr"] == "say \"hello\"" + end + + @testset "very long attribute value" begin + long_val = repeat("x", 10000) + xml = """""" + doc = parse(xml, Node) + @test doc[1]["attr"] == long_val + end + + @testset "very long text content" begin + long_text = repeat("word ", 5000) + xml = "$(long_text)" + doc = parse(xml, Node) + @test simple_value(doc[1]) == long_text + end + + @testset "many attributes on one element" begin + attrs = join(["a$i=\"v$i\"" for i in 1:50], " ") + xml = "" + doc = parse(xml, Node) + @test doc[1]["a1"] == "v1" + @test doc[1]["a50"] == "v50" + end + + @testset "whitespace around = in attributes" begin + xml = """""" + doc = parse(xml, Node) + @test 
doc[1]["a"] == "1" + @test doc[1]["b"] == "2" + end + + @testset "tab and newline in tag whitespace" begin + xml = "" + doc = parse(xml, Node) + @test doc[1]["a"] == "1" + @test doc[1]["b"] == "2" + end + + @testset "empty element: self-closing vs open-close" begin + xml1 = "" + xml2 = "" + doc1 = parse(xml1, Node) + doc2 = parse(xml2, Node) + # Both should produce empty elements + el1 = first(filter(x -> nodetype(x) == Element, children(doc1[1]))) + el2 = first(filter(x -> nodetype(x) == Element, children(doc2[1]))) + @test tag(el1) == tag(el2) == "x" + end + + @testset "document with all prolog components" begin + xml = """ + + + + +]> + + +&greeting;""" + doc = parse(xml, Node) + typed = filter(x -> nodetype(x) != Text, children(doc)) + type_list = map(nodetype, typed) + @test Declaration in type_list + @test DTD in type_list + @test Comment in type_list + @test ProcessingInstruction in type_list + @test Element in type_list + end +end + +end # top-level @testset diff --git a/test/test_pugixml.jl b/test/test_pugixml.jl new file mode 100644 index 0000000..6e46d5a --- /dev/null +++ b/test/test_pugixml.jl @@ -0,0 +1,308 @@ +# Test cases inspired by pugixml (https://github.com/zeux/pugixml, MIT license) +# Translated from tests/test_parse.cpp and tests/test_xpath.cpp + +using XML +using XML: Node, nodetype, Document, Element, Comment, CData, ProcessingInstruction, Text, Declaration +using XML: tag, value, children, attributes, simple_value, xpath +using Test + +@testset "pugixml-inspired" begin + + #==========================================================================# + # Processing Instructions # + #==========================================================================# + @testset "PI parsing" begin + doc = parse("", Node) + pis = filter(x -> nodetype(x) == ProcessingInstruction, children(doc)) + @test length(pis) == 1 + + doc = parse("", Node) + pis = filter(x -> nodetype(x) == ProcessingInstruction, children(doc)) + @test length(pis) == 1 + + doc = 
parse("", Node) + pis = filter(x -> nodetype(x) == ProcessingInstruction, children(doc)) + @test length(pis) == 1 + end + + @testset "PI errors" begin + # XML.jl is lenient about incomplete PIs without a root element, + # but these should fail when embedded in a document + @test_throws Exception parse("", Node) + @test_throws Exception parse("", Node) + end + + #==========================================================================# + # Comments # + #==========================================================================# + @testset "Comment parsing" begin + doc = parse("", Node) + comments = filter(x -> nodetype(x) == Comment, children(doc)) + @test length(comments) == 1 + @test value(comments[1]) == "" + + doc = parse("", Node) + comments = filter(x -> nodetype(x) == Comment, children(doc)) + @test value(comments[1]) == "value" + + doc = parse("", Node) + root = filter(x -> nodetype(x) == Element, children(doc))[1] + comments = filter(x -> nodetype(x) == Comment, children(root)) + @test contains(value(comments[1]), "multi") + end + + @testset "Comment errors" begin + @test_throws Exception parse("", Node) + end + + #==========================================================================# + # CDATA # + #==========================================================================# + @testset "CDATA parsing" begin + doc = parse("", Node) + root = filter(x -> nodetype(x) == Element, children(doc))[1] + cdata = filter(x -> nodetype(x) == CData, children(root)) + @test length(cdata) == 1 + @test value(cdata[1]) == "" + + doc = parse("", Node) + root = filter(x -> nodetype(x) == Element, children(doc))[1] + cdata = filter(x -> nodetype(x) == CData, children(root)) + @test value(cdata[1]) == "value" + + # CDATA preserves markup characters + doc = parse("Hello!]]>", Node) + root = filter(x -> nodetype(x) == Element, children(doc))[1] + cdata = filter(x -> nodetype(x) == CData, children(root)) + @test value(cdata[1]) == "Hello!" 
+ end + + @testset "CDATA errors" begin + @test_throws Exception parse("", Node) + @test tag(children(doc)[1]) == "node" + + doc = parse("", Node) + @test tag(children(doc)[1]) == "node" + + doc = parse("", Node) + @test tag(children(doc)[1]) == "node" + end + + @testset "Tag hierarchy" begin + doc = parse("", Node) + root = children(doc)[1] + @test tag(root) == "node" + root_els = filter(x -> nodetype(x) == Element, children(root)) + @test length(root_els) == 2 + @test tag(root_els[1]) == "n1" + @test tag(root_els[2]) == "n3" + end + + @testset "Tag errors" begin + @test_throws Exception parse("<", Node) + @test_throws Exception parse("", Node) + @test_throws Exception parse("", Node) + @test_throws Exception parse("", Node) + end + + #==========================================================================# + # Attribute Parsing # + #==========================================================================# + @testset "Attribute quotes" begin + doc = parse("", Node) + el = children(doc)[1] + @test el["id1"] == "v1" + @test el["id2"] == "v2" + end + + @testset "Attribute spaces around =" begin + doc = parse("", Node) + el = children(doc)[1] + @test el["id1"] == "v1" + @test el["id2"] == "v2" + @test el["id3"] == "v3" + @test el["id4"] == "v4" + end + + @testset "Attribute errors" begin + @test_throws Exception parse("", Node) + @test children(doc)[1]["id"] == "<>&'\"" + end + + @testset "Predefined entities in text" begin + doc = parse("<>&'"", Node) + @test simple_value(children(doc)[1]) == "<>&'\"" + end + + @testset "Numeric character references" begin + doc = parse(" ", Node) + @test simple_value(children(doc)[1]) == " " + end + + @testset "Unicode character references" begin + # Greek gamma + doc = parse("γ", Node) + @test simple_value(children(doc)[1]) == "γ" + + # Same char, lowercase hex + doc = parse("γ", Node) + @test simple_value(children(doc)[1]) == "γ" + end + + #==========================================================================# + # 
Whitespace # + #==========================================================================# + @testset "Whitespace text nodes preserved" begin + doc = parse(" ", Node) + root = children(doc)[1] + # Should have text nodes with whitespace + text_nodes = filter(x -> nodetype(x) == Text, children(root)) + @test length(text_nodes) >= 1 + end + + @testset "PCDATA content" begin + doc = parse("text content", Node) + @test simple_value(children(doc)[1]) == "text content" + end + + #==========================================================================# + # Unicode / CJK Content # + #==========================================================================# + @testset "Unicode element names (CJK)" begin + # XML.jl tokenizer does not yet support CJK characters in element/attribute names + @test_broken try + parse("<汉语>世界", Node) + true + catch + false + end + end + + @testset "Unicode text content" begin + doc = parse("Ünïcödé café naïve", Node) + @test simple_value(children(doc)[1]) == "Ünïcödé café naïve" + end + + #==========================================================================# + # Mixed Content # + #==========================================================================# + @testset "Mixed text, CDATA, comments" begin + xml = "First textSecond textLast text" + doc = parse(xml, Node) + root = children(doc)[1] + child_types = map(nodetype, children(root)) + @test Text in child_types + @test Comment in child_types + @test CData in child_types + end + + #==========================================================================# + # Complex Document # + #==========================================================================# + @testset "Complex document with all node types" begin + xml = """ + + + + + + some text + + + + + + +""" + doc = parse(xml, Node) + @test nodetype(doc) == Document + + root_els = filter(x -> nodetype(x) == Element, children(doc)) + @test length(root_els) == 1 + mesh = root_els[1] + @test tag(mesh) == "mesh" + @test mesh["name"] 
== "mesh_root" + + # Check inner content types + inner = children(mesh) + @test any(x -> nodetype(x) == Comment, inner) + @test any(x -> nodetype(x) == Text, inner) + @test any(x -> nodetype(x) == CData, inner) + @test any(x -> nodetype(x) == ProcessingInstruction, inner) + + nodes = filter(x -> nodetype(x) == Element && tag(x) == "node", inner) + @test length(nodes) == 2 + @test nodes[1]["attr1"] == "value1" + @test nodes[1]["attr2"] == "value2" + end + + #==========================================================================# + # XPath # + #==========================================================================# + @testset "XPath" begin + @testset "descendant with attribute predicate" begin + doc = parse("", Node) + results = xpath(doc, "//c[@id='b']") + @test length(results) == 1 + @test results[1]["id"] == "b" + end + + @testset "child with attribute" begin + doc = parse("", Node) + results = xpath(doc, "/a/c[@id]") + @test length(results) == 1 + @test results[1]["id"] == "b" + end + + @testset "wildcard with attribute predicate" begin + doc = parse("""test""", Node) + results = xpath(doc, "/node/*[@attr1]") + @test length(results) == 2 + end + + @testset "descendant-or-self with text()" begin + doc = parse("deep", Node) + results = xpath(doc, "//e/text()") + @test length(results) == 1 + @test value(results[1]) == "deep" + end + + @testset "positional predicate" begin + doc = parse("", Node) + results = xpath(doc, "/root/*[1]") + @test length(results) == 1 + @test tag(results[1]) == "a" + + results = xpath(doc, "/root/*[last()]") + @test length(results) == 1 + @test tag(results[1]) == "c" + end + + @testset "nested predicates" begin + doc = parse("""""", Node) + results = xpath(doc, "//subchild[@id]") + @test length(results) == 2 + end + end +end diff --git a/test/test_remote_files.jl b/test/test_remote_files.jl new file mode 100644 index 0000000..ed2b3e3 --- /dev/null +++ b/test/test_remote_files.jl @@ -0,0 +1,77 @@ +using XML +using XML: Document, 
Element, Declaration, Comment, CData, DTD, ProcessingInstruction, Text +using Downloads: download +using Test + +#==============================================================================# +# REMOTE XML FILE PARSING TESTS # +#==============================================================================# +# These tests download publicly available XML files and verify that XML.jl can +# parse them without error. A failed download (network issues, CI without +# internet, URL gone) is silently skipped — only parsing failures count as test +# failures. +# +# Not included in runtests.jl — run standalone: julia --project test/test_remote_files.jl + +function _try_download(url::AbstractString)::Union{String, Nothing} + try + path = download(url) + return read(path, String) + catch + return nothing + end +end + +const REMOTE_XML_URLS = [ + # ---- W3Schools example files ---- + ("W3Schools note.xml", "https://www.w3schools.com/xml/note.xml"), + ("W3Schools cd_catalog.xml", "https://www.w3schools.com/xml/cd_catalog.xml"), + ("W3Schools plant_catalog.xml", "https://www.w3schools.com/xml/plant_catalog.xml"), + ("W3Schools simple.xml", "https://www.w3schools.com/xml/simple.xml"), + ("W3Schools books.xml", "https://www.w3schools.com/xml/books.xml"), + + # ---- W3C SVG samples ---- + ("W3C SVG helloworld.svg", "https://dev.w3.org/SVG/tools/svgweb/samples/svg-files/helloworld.svg"), + ("W3C SVG tiger.svg", "https://dev.w3.org/SVG/tools/svgweb/samples/svg-files/tiger.svg"), + ("W3C SVG w3c.svg", "https://dev.w3.org/SVG/tools/svgweb/samples/svg-files/w3c.svg"), + ("W3C SVG lineargradient2.svg", "https://dev.w3.org/SVG/tools/svgweb/samples/svg-files/lineargradient2.svg"), + ("W3C SVG heart.svg", "https://dev.w3.org/SVG/tools/svgweb/samples/svg-files/heart.svg"), + + # ---- GitHub-hosted XML files ---- + ("JUnit XML complete example", "https://raw.githubusercontent.com/testmoapp/junitxml/main/examples/junit-complete.xml"), + ("JUnit XML basic example", 
"https://raw.githubusercontent.com/testmoapp/junitxml/main/examples/junit-basic.xml"), + ("PEPPOL invoice base example", "https://raw.githubusercontent.com/OpenPEPPOL/peppol-bis-invoice-3/master/rules/examples/base-example.xml"), + + # ---- Maven Central POM (real-world XML with namespaces) ---- + ("Maven JUnit 4.13.2 POM", "https://repo1.maven.org/maven2/junit/junit/4.13.2/junit-4.13.2.pom"), + ("Maven Guava 33.0 POM", "https://repo1.maven.org/maven2/com/google/guava/guava/33.0.0-jre/guava-33.0.0-jre.pom"), + + # ---- NASA RSS feed (live XML) ---- + ("NASA news RSS feed", "https://www.nasa.gov/news-release/feed/"), +] + +@testset "Remote XML Parsing" begin + for (label, url) in REMOTE_XML_URLS + @testset "$label" begin + xml_str = _try_download(url) + if isnothing(xml_str) + @info "Skipping $label — download failed" url + @test_skip false + else + doc = parse(xml_str, Node) + @test nodetype(doc) == Document + @test length(children(doc)) > 0 + + # Verify at least one Element exists somewhere in the document + has_element = any(x -> nodetype(x) == Element, children(doc)) + @test has_element + + # Verify write produces output and can be re-parsed + xml_out = XML.write(doc) + @test length(xml_out) > 0 + doc2 = parse(xml_out, Node) + @test nodetype(doc2) == Document + end + end + end +end diff --git a/test/test_tokenizer.jl b/test/test_tokenizer.jl new file mode 100644 index 0000000..7c901fa --- /dev/null +++ b/test/test_tokenizer.jl @@ -0,0 +1,425 @@ +using Test, XML + +using XML.XMLTokenizer + +# Convenience: collect token kinds from a string +kinds(xml) = [t.kind for t in tokenize(xml)] +raws(xml) = [String(t.raw) for t in tokenize(xml)] + +@testset "XMLTokenizer" begin + +#-----------------------------------------------------------------------# Basic text +@testset "plain text" begin + toks = collect(tokenize("hello world")) + @test length(toks) == 1 + @test toks[1].kind == TOKEN_TEXT + @test toks[1].raw == "hello world" +end + +@testset "empty string" begin + 
@test isempty(collect(tokenize(""))) +end + +#-----------------------------------------------------------------------# Open tags +@testset "open tag without attributes" begin + @test kinds("
") == [TOKEN_OPEN_TAG, TOKEN_TAG_CLOSE] + @test raws("
") == [""] +end + +@testset "open tag with attributes" begin + xml = """""" + toks = collect(tokenize(xml)) + @test [t.kind for t in toks] == [ + TOKEN_OPEN_TAG, + TOKEN_ATTR_NAME, TOKEN_ATTR_VALUE, + TOKEN_ATTR_NAME, TOKEN_ATTR_VALUE, + TOKEN_TAG_CLOSE, + ] + @test tag_name(toks[1]) == "a" + @test toks[2].raw == "href" + @test attr_value(toks[3]) == "url" + @test toks[4].raw == "class" + @test attr_value(toks[5]) == "main" +end + +@testset "whitespace around =" begin + xml = """""" + toks = collect(tokenize(xml)) + @test [t.kind for t in toks] == [ + TOKEN_OPEN_TAG, TOKEN_ATTR_NAME, TOKEN_ATTR_VALUE, TOKEN_TAG_CLOSE, + ] + @test attr_value(toks[3]) == "1" +end + +#-----------------------------------------------------------------------# Self-closing tags +@testset "self-closing tag" begin + @test kinds("
") == [TOKEN_OPEN_TAG, TOKEN_SELF_CLOSE] + @test raws("
") == [""] +end + +@testset "self-closing tag with attributes" begin + xml = """""" + toks = collect(tokenize(xml)) + @test [t.kind for t in toks] == [ + TOKEN_OPEN_TAG, TOKEN_ATTR_NAME, TOKEN_ATTR_VALUE, TOKEN_SELF_CLOSE, + ] + @test tag_name(toks[1]) == "img" + @test attr_value(toks[3]) == "a.png" +end + +#-----------------------------------------------------------------------# Close tags +@testset "close tag" begin + toks = collect(tokenize("
")) + @test [t.kind for t in toks] == [TOKEN_CLOSE_TAG, TOKEN_TAG_CLOSE] + @test tag_name(toks[1]) == "div" + @test toks[2].raw == ">" +end + +@testset "close tag with whitespace" begin + toks = collect(tokenize("
")) + @test [t.kind for t in toks] == [TOKEN_CLOSE_TAG, TOKEN_TAG_CLOSE] + @test tag_name(toks[1]) == "div" +end + +#-----------------------------------------------------------------------# Open + close round-trip +@testset "element with text" begin + xml = "

hello

" + @test kinds(xml) == [ + TOKEN_OPEN_TAG, TOKEN_TAG_CLOSE, + TOKEN_TEXT, + TOKEN_CLOSE_TAG, TOKEN_TAG_CLOSE, + ] + toks = collect(tokenize(xml)) + @test tag_name(toks[1]) == "p" + @test toks[3].raw == "hello" + @test tag_name(toks[4]) == "p" +end + +#-----------------------------------------------------------------------# Namespaced tags +@testset "namespaced tag" begin + xml = """""" + toks = collect(tokenize(xml)) + @test tag_name(toks[1]) == "ns:el" + @test toks[2].raw == "xmlns:ns" +end + +#-----------------------------------------------------------------------# Comments +@testset "comment" begin + xml = "" + toks = collect(tokenize(xml)) + @test [t.kind for t in toks] == [TOKEN_COMMENT_OPEN, TOKEN_COMMENT_CONTENT, TOKEN_COMMENT_CLOSE] + @test toks[1].raw == "" +end + +@testset "empty comment" begin + toks = collect(tokenize("")) + @test [t.kind for t in toks] == [TOKEN_COMMENT_OPEN, TOKEN_COMMENT_CONTENT, TOKEN_COMMENT_CLOSE] + @test toks[2].raw == "" +end + +@testset "comment with markup-like content" begin + toks = collect(tokenize("")) + @test toks[2].raw == " not a tag " +end + +#-----------------------------------------------------------------------# CDATA +@testset "CDATA" begin + xml = "]]>" + toks = collect(tokenize(xml)) + @test [t.kind for t in toks] == [TOKEN_CDATA_OPEN, TOKEN_CDATA_CONTENT, TOKEN_CDATA_CLOSE] + @test toks[1].raw == "" + @test toks[3].raw == "]]>" +end + +@testset "empty CDATA" begin + toks = collect(tokenize("")) + @test [t.kind for t in toks] == [TOKEN_CDATA_OPEN, TOKEN_CDATA_CONTENT, TOKEN_CDATA_CLOSE] + @test toks[2].raw == "" +end + +#-----------------------------------------------------------------------# Processing instructions +@testset "processing instruction" begin + xml = """""" + toks = collect(tokenize(xml)) + @test [t.kind for t in toks] == [TOKEN_PI_OPEN, TOKEN_PI_CONTENT, TOKEN_PI_CLOSE] + @test toks[1].raw == "" +end + +@testset "PI with no content" begin + toks = collect(tokenize("")) + @test [t.kind for t in 
toks] == [TOKEN_PI_OPEN, TOKEN_PI_CONTENT, TOKEN_PI_CLOSE] + @test pi_target(toks[1]) == "target" + @test toks[2].raw == "" +end + +#-----------------------------------------------------------------------# XML declaration +@testset "XML declaration" begin + xml = """""" + toks = collect(tokenize(xml)) + @test [t.kind for t in toks] == [ + TOKEN_XML_DECL_OPEN, + TOKEN_ATTR_NAME, TOKEN_ATTR_VALUE, + TOKEN_ATTR_NAME, TOKEN_ATTR_VALUE, + TOKEN_XML_DECL_CLOSE, + ] + @test pi_target(toks[1]) == "xml" + @test toks[1].raw == "" +end + +@testset "XML declaration with single quotes" begin + xml = "" + toks = collect(tokenize(xml)) + @test toks[3].raw == "'1.0'" + @test attr_value(toks[3]) == "1.0" +end + +#-----------------------------------------------------------------------# DOCTYPE +@testset "DOCTYPE simple" begin + xml = """""" + toks = collect(tokenize(xml)) + @test [t.kind for t in toks] == [TOKEN_DOCTYPE_OPEN, TOKEN_DOCTYPE_CONTENT, TOKEN_DOCTYPE_CLOSE] + @test toks[1].raw == "" +end + +@testset "DOCTYPE with internal subset" begin + xml = """]>""" + toks = collect(tokenize(xml)) + @test [t.kind for t in toks] == [TOKEN_DOCTYPE_OPEN, TOKEN_DOCTYPE_CONTENT, TOKEN_DOCTYPE_CLOSE] + @test toks[2].raw == " note []" +end + +@testset "DOCTYPE with quoted > in internal subset" begin + xml = """b">]>""" + toks = collect(tokenize(xml)) + @test [t.kind for t in toks] == [TOKEN_DOCTYPE_OPEN, TOKEN_DOCTYPE_CONTENT, TOKEN_DOCTYPE_CLOSE] + @test occursin("a>b", toks[2].raw) +end + +#-----------------------------------------------------------------------# Full document +@testset "full document" begin + xml = """ + + + text + + + + +""" + toks = collect(tokenize(xml)) + tok_kinds = [t.kind for t in toks] + + # XML declaration + @test tok_kinds[1] == TOKEN_XML_DECL_OPEN + # DOCTYPE present + @test TOKEN_DOCTYPE_OPEN in tok_kinds + # All open tags have matching closes + open_names = [tag_name(t) for t in toks if t.kind == TOKEN_OPEN_TAG] + close_names = [tag_name(t) for t in toks if 
t.kind == TOKEN_CLOSE_TAG] + @test open_names == ["root", "child", "empty"] + @test close_names == ["child", "root"] + # CDATA is present + cdata_content = [t.raw for t in toks if t.kind == TOKEN_CDATA_CONTENT] + @test cdata_content == ["data"] + # Comment is present + comment_content = [t.raw for t in toks if t.kind == TOKEN_COMMENT_CONTENT] + @test comment_content == [" comment "] + # PI is present + pi_opens = [t for t in toks if t.kind == TOKEN_PI_OPEN] + @test length(pi_opens) == 1 + @test pi_target(pi_opens[1]) == "pi" +end + +#-----------------------------------------------------------------------# Raw round-trip +@testset "concatenated raw reproduces input" begin + # Round-trip works for inputs where no whitespace/= is consumed between tokens. + # Whitespace around `=` in attributes is consumed and not part of any token. + for xml in [ + """
""", + """""", + """""", + """]>""", + """

text

""", + ] + reconstructed = join(t.raw for t in tokenize(xml)) + @test reconstructed == xml + end +end + +@testset "attribute whitespace is not preserved" begin + # Whitespace around `=` and between attrs is consumed, not emitted as tokens. + xml = """
""" + toks = collect(tokenize(xml)) + @test [t.kind for t in toks] == [ + TOKEN_OPEN_TAG, TOKEN_ATTR_NAME, TOKEN_ATTR_VALUE, + TOKEN_ATTR_NAME, TOKEN_ATTR_VALUE, TOKEN_SELF_CLOSE, + ] +end + +#-----------------------------------------------------------------------# Iterator protocol +@testset "iterator protocol" begin + t = tokenize("") + @test Base.IteratorSize(typeof(t)) == Base.SizeUnknown() + @test Base.eltype(typeof(t)) == Token + toks = collect(t) + @test length(toks) == 2 +end + +#-----------------------------------------------------------------------# Utility error handling +@testset "tag_name errors on wrong kind" begin + tok = first(tokenize("hello")) + @test_throws ArgumentError tag_name(tok) +end + +@testset "attr_value errors on wrong kind" begin + tok = first(tokenize("")) + @test_throws ArgumentError attr_value(tok) +end + +@testset "pi_target errors on wrong kind" begin + tok = first(tokenize("")) + @test_throws ArgumentError pi_target(tok) +end + +#-----------------------------------------------------------------------# Error cases +@testset "error: unterminated comment" begin + @test_throws ArgumentError collect(tokenize("")) + @test toks[2].raw == " héllo " +end + +#-----------------------------------------------------------------------# Edge cases +@testset "adjacent tags" begin + xml = "" + toks = collect(tokenize(xml)) + open_names = [tag_name(t) for t in toks if t.kind == TOKEN_OPEN_TAG] + close_names = [tag_name(t) for t in toks if t.kind == TOKEN_CLOSE_TAG] + @test open_names == ["a", "b"] + @test close_names == ["a", "b"] + # No text tokens between them + @test !any(t -> t.kind == TOKEN_TEXT, toks) +end + +@testset "text between adjacent tags" begin + xml = "xy" + texts = [t.raw for t in tokenize(xml) if t.kind == TOKEN_TEXT] + @test texts == ["x", "y"] +end + +@testset "multiple attributes" begin + xml = """
""" + names = [String(t.raw) for t in tokenize(xml) if t.kind == TOKEN_ATTR_NAME] + vals = [String(attr_value(t)) for t in tokenize(xml) if t.kind == TOKEN_ATTR_VALUE] + @test names == ["a", "b", "c"] + @test vals == ["1", "2", "3"] +end + +@testset "attribute with > in value" begin + xml = """""" + toks = collect(tokenize(xml)) + @test attr_value(toks[3]) == "1>2" + @test toks[end].kind == TOKEN_TAG_CLOSE +end + +@testset "attribute with single quotes" begin + xml = "" + toks = collect(tokenize(xml)) + @test toks[3].raw == "'val'" + @test attr_value(toks[3]) == "val" +end + +@testset "mixed quote styles" begin + xml = """""" + vals = [attr_value(t) for t in tokenize(xml) if t.kind == TOKEN_ATTR_VALUE] + @test vals == ["1", "2"] +end + +@testset "whitespace-only text" begin + xml = " \n\t " + texts = [t for t in tokenize(xml) if t.kind == TOKEN_TEXT] + @test length(texts) == 1 + @test texts[1].raw == " \n\t " +end + +@testset "entities preserved verbatim" begin + xml = "

& < A

"
+    texts = [t.raw for t in tokenize(xml) if t.kind == TOKEN_TEXT]
+    @test texts == ["& < A"]
+end
+
+@testset "show method" begin
+    tok = first(tokenize("hello"))
+    buf = IOBuffer()
+    show(buf, tok)
+    s = String(take!(buf))
+    @test occursin("TOKEN_TEXT", s)
+    @test occursin("hello", s)
+end
+
+end # top-level testset
diff --git a/test/test_w3c.jl b/test/test_w3c.jl
new file mode 100644
index 0000000..16587ed
--- /dev/null
+++ b/test/test_w3c.jl
@@ -0,0 +1,169 @@
+# W3C XML Conformance Test Suite
+# https://www.w3.org/XML/Test/xmlts20130923.tar
+#
+# Test types:
+# - "valid": well-formed XML that is also valid (should parse successfully)
+# - "invalid": well-formed but not valid per DTD (should still parse — we're non-validating)
+# - "not-wf": not well-formed XML (should fail to parse)
+# - "error": optional errors (parser may or may not reject)
+#
+# We only run tests with ENTITIES="none" since XML.jl does not expand external entities.
+# We skip XML 1.1 tests (VERSION="1.1" or RECOMMENDATION="XML1.1").
+#
+# Outcomes that disagree with the catalog are tallied and logged via @warn/@info;
+# they are not recorded as test failures.
+
+using XML
+using XML: Node, nodetype, Document
+using Test
+using Downloads: download
+using Tar
+
+const W3C_URL = "https://www.w3.org/XML/Test/xmlts20130923.tar"
+const W3C_DIR = joinpath(@__DIR__, "data", "w3c")
+const W3C_TAR = joinpath(@__DIR__, "data", "xmlts20130923.tar")
+
+# Ensure the suite is unpacked under W3C_DIR. Returns immediately when
+# W3C_DIR/xmlconf already exists; otherwise downloads W3C_URL to W3C_TAR (unless that
+# file is already present) and extracts it into W3C_DIR.
+function ensure_w3c_suite()
+    isdir(joinpath(W3C_DIR, "xmlconf")) && return
+    mkpath(W3C_DIR)
+    if !isfile(W3C_TAR)
+        @info "Downloading W3C XML Conformance Test Suite..."
+        download(W3C_URL, W3C_TAR)
+    end
+    @info "Extracting W3C XML Conformance Test Suite..."
+    open(W3C_TAR) do io
+        Tar.extract(io, W3C_DIR)
+    end
+end
+
+# Parse a test catalog XML and extract TEST entries.
+# Returns a vector of named tuples (type, entities, id, uri, version, recommendation),
+# or an empty vector when catalog_path is not an existing file.
+function parse_catalog(catalog_path::String)
+    isfile(catalog_path) || return NamedTuple[]
+    doc = read(catalog_path, Node)
+    tests = NamedTuple[]
+    _collect_tests!(tests, doc, dirname(catalog_path))
+    return tests
+end
+
+# Recursively visit the Element children of node, pushing one named tuple per TEST
+# element that has a URI attribute onto tests. base_dir is the directory that URI
+# values are joined onto.
+function _collect_tests!(tests, node, base_dir)
+    for child in XML.children(node)
+        nodetype(child) !== XML.Element && continue
+        if XML.tag(child) == "TEST"
+            attrs = XML.attributes(child)
+            haskey(attrs, "URI") || continue
+            push!(tests, (
+                type = get(attrs, "TYPE", ""),
+                # An omitted ENTITIES attribute means "none": that is the default the
+                # suite's testcases.dtd declares (NOTE(review): confirm against the DTD).
+                entities = get(attrs, "ENTITIES", "none"),
+                id = get(attrs, "ID", ""),
+                uri = joinpath(base_dir, attrs["URI"]),
+                version = get(attrs, "VERSION", "1.0"),
+                recommendation = get(attrs, "RECOMMENDATION", ""),
+            ))
+        elseif XML.tag(child) == "TESTCASES"
+            # TESTCASES may have xml:base to adjust paths
+            sub_base = get(XML.attributes(child), "xml:base", "")
+            child_base = isempty(sub_base) ? base_dir : joinpath(base_dir, sub_base)
+            _collect_tests!(tests, child, child_base)
+        else
+            _collect_tests!(tests, child, base_dir)
+        end
+    end
+end
+
+# True when a catalog entry is an XML 1.1 test: VERSION is "1.1" or RECOMMENDATION
+# contains "XML1.1" (the containment check also covers an exact "XML1.1").
+function is_xml11(test)
+    test.version == "1.1" || contains(test.recommendation, "XML1.1")
+end
+
+ensure_w3c_suite()
+
+# Catalogs for XML 1.0 tests
+const XMLCONF_DIR = joinpath(W3C_DIR, "xmlconf")
+const CATALOGS = filter(isfile, [
+    joinpath(XMLCONF_DIR, "xmltest", "xmltest.xml"),
+    joinpath(XMLCONF_DIR, "sun", "sun-valid.xml"),
+    joinpath(XMLCONF_DIR, "sun", "sun-invalid.xml"),
+    joinpath(XMLCONF_DIR, "sun", "sun-not-wf.xml"),
+    joinpath(XMLCONF_DIR, "sun", "sun-error.xml"),
+    joinpath(XMLCONF_DIR, "oasis", "oasis.xml"),
+    joinpath(XMLCONF_DIR, "ibm", "ibm_oasis_not-wf.xml"),
+    joinpath(XMLCONF_DIR, "ibm", "ibm_oasis_valid.xml"),
+    joinpath(XMLCONF_DIR, "ibm", "ibm_oasis_invalid.xml"),
+    joinpath(XMLCONF_DIR, "eduni", "errata-2e", "errata2e.xml"),
+    joinpath(XMLCONF_DIR, "eduni", "errata-3e", "errata3e.xml"),
+    joinpath(XMLCONF_DIR, "eduni", "errata-4e", "errata4e.xml"),
+    joinpath(XMLCONF_DIR, "eduni", "namespaces", "1.0", "rmt-ns10.xml"),
+    joinpath(XMLCONF_DIR, "eduni", "misc", "ht-bh.xml"),
+    joinpath(XMLCONF_DIR, "japanese", "japanese.xml"),
+])
+
+# Collect all tests
+all_tests = NamedTuple[]
+for catalog in CATALOGS
+    append!(all_tests, parse_catalog(catalog))
+end
+
+# Filter: only ENTITIES="none", skip XML 1.1
+xml10_tests = filter(t -> t.entities == "none" && !is_xml11(t), all_tests)
+
+valid_tests = filter(t -> t.type in ("valid", "invalid"), xml10_tests)
+notwf_tests = filter(t -> t.type == "not-wf", xml10_tests)
+
+@info "W3C tests: $(length(valid_tests)) valid/invalid, $(length(notwf_tests)) not-wf (from $(length(all_tests)) total)"
+
+@testset "W3C Conformance" begin
+    @testset "Well-formed documents should parse" begin
+        n_pass = 0
+        n_fail = 0
+        failures = String[]
+        for test in valid_tests
+            isfile(test.uri) || continue
+            try
+                doc = read(test.uri, Node)
+                @test nodetype(doc) == Document
+                n_pass += 1
+            catch e
+                # A parse error is tallied and reported below, not recorded as a failure.
+                n_fail += 1
+                push!(failures, "$(test.id): $e")
+            end
+        end
+        if n_fail > 0
+            @warn "W3C well-formed: $n_pass passed, $n_fail failed" failures=first(failures, 20)
+        end
+        @info "W3C well-formed: $n_pass / $(n_pass + n_fail) passed"
+    end
+
+    @testset "Not-well-formed documents should fail to parse" begin
+        n_pass = 0
+        n_fail = 0
+        failures = String[]
+        for test in notwf_tests
+            isfile(test.uri) || continue
+            try
+                read(test.uri, Node)
+                # Reaching this line means the malformed document was accepted.
+                n_fail += 1
+                push!(failures, test.id)
+            catch
+                @test true
+                n_pass += 1
+            end
+        end
+        if n_fail > 0
+            @warn "W3C not-well-formed: $n_pass rejected, $n_fail incorrectly accepted" failures=first(failures, 20)
+        end
+        @info "W3C not-well-formed: $n_pass / $(n_pass + n_fail) correctly rejected"
+    end
+end