diff --git a/go.mod b/go.mod index b3ddc09c..59bac923 100644 --- a/go.mod +++ b/go.mod @@ -6,6 +6,8 @@ require ( cloud.google.com/go/storage v1.59.2 github.com/agext/regexp v1.3.0 github.com/andybalholm/cascadia v1.3.3 + github.com/antchfx/xmlquery v1.5.1 + github.com/antchfx/xpath v1.3.6 github.com/aws/aws-sdk-go-v2 v1.41.1 github.com/aws/aws-sdk-go-v2/service/s3 v1.96.0 github.com/aws/smithy-go v1.24.0 @@ -18,7 +20,6 @@ require ( github.com/kettek/apng v0.0.0-20220823221153-ff692776a607 github.com/pdfcpu/pdfcpu v0.11.1 github.com/pkg/errors v0.9.1 - github.com/readium/xmlquery v0.0.0-20230106230237-8f493145aef4 github.com/relvacode/iso8601 v1.7.0 github.com/stretchr/testify v1.11.1 github.com/trimmer-io/go-xmp v1.0.0 @@ -41,7 +42,6 @@ require ( github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.30.0 // indirect github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.54.0 // indirect github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.54.0 // indirect - github.com/antchfx/xpath v1.3.6 // indirect github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.4 // indirect github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.17 // indirect github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.17 // indirect diff --git a/go.sum b/go.sum index e1fa4f8c..0a908d03 100644 --- a/go.sum +++ b/go.sum @@ -51,7 +51,8 @@ github.com/agext/regexp v1.3.0 h1:6+9tp+S41TU48gFNV47bX+pp1q7WahGofw6JccmsCDs= github.com/agext/regexp v1.3.0/go.mod h1:6phv1gViOJXWcTfpxOi9VMS+MaSAo+SUDf7do3ur1HA= github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= -github.com/antchfx/xpath v1.2.1/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= +github.com/antchfx/xmlquery v1.5.1 h1:T9I4Ns1EXiWHy0IqKupGhnfTQtJwlGrpXtauYOoNv78= +github.com/antchfx/xmlquery v1.5.1/go.mod h1:bVqnl7TaDXSReKINrhZz+2E/PbCu2tUahb+wZ7WZNT8= github.com/antchfx/xpath v1.3.6 h1:s0y+ElRRtTQdfHP609qFu0+c6bglDv20pqOViQjjdPI= github.com/antchfx/xpath v1.3.6/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= github.com/aws/aws-sdk-go-v2 v1.41.1 h1:ABlyEARCDLN034NhxlRUSZr4l71mh+T5KAeGh6cerhU= @@ -127,6 +128,7 @@ github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfU github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 h1:f+oWsMOmNPc8JmEHVZIycC7hBoQxHH9pNKQORJNozsQ= github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8/go.mod h1:wcDNUvekVysuuOpQKo3191zZyTpiI6se1N1ULghS0sw= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= @@ -197,8 +199,6 @@ github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZN github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/readium/xmlquery v0.0.0-20230106230237-8f493145aef4 h1:iEQhT4jOppg7EK/r4/1e4ULIeCsugv35O+sDlvce5Bo= -github.com/readium/xmlquery v0.0.0-20230106230237-8f493145aef4/go.mod h1:S7gZ8KUgPbsdlF9/iomcwnU31iHMyFEO66+JFJE8uz8= github.com/relvacode/iso8601 v1.7.0 h1:BXy+V60stMP6cpswc+a93Mq3e65PfXCgDFfhvNNGrdo= github.com/relvacode/iso8601 v1.7.0/go.mod h1:FlNp+jz+TXpyRqgmM7tnzHHzBnz776kmAH2h3sZCn0I= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= @@ -301,7 +301,6 @@ golang.org/x/net v0.0.0-20190724013045-ca1201d0de80/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= @@ -347,7 +346,6 @@ golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20200212091648-12a6c2dcc1e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= diff --git a/pkg/fetcher/resource.go b/pkg/fetcher/resource.go index 58f59d9b..d75f948b 100644 --- a/pkg/fetcher/resource.go +++ b/pkg/fetcher/resource.go @@ -1,6 +1,7 @@ package fetcher import ( + "bytes" "context" "encoding/json" "encoding/xml" @@ -9,11 +10,10 @@ import ( "io" "net/http" "os" - "strings" + "github.com/antchfx/xmlquery" "github.com/readium/go-toolkit/pkg/archive" "github.com/readium/go-toolkit/pkg/manifest" - "github.com/readium/xmlquery" "golang.org/x/text/encoding" "golang.org/x/text/encoding/unicode" ) @@ -92,13 +92,12 @@ func ReadResourceAsJSON(ctx context.Context, r Resource) (map[string]interface{} return object, nil } -func ReadResourceAsXML(ctx context.Context, r Resource, prefixes map[string]string) (*xmlquery.Node, *ResourceError) { - bytes, ex := r.Read(ctx, 0, 0) +func ReadResourceAsXML(ctx context.Context, r Resource) (*xmlquery.Node, *ResourceError) { + bin, ex := r.Read(ctx, 0, 0) if ex != nil { return nil, ex } - node, err := xmlquery.ParseWithOptions(strings.NewReader(string(bytes)), xmlquery.ParserOptions{ - Prefixes: prefixes, + node, err := xmlquery.ParseWithOptions(bytes.NewReader(bin), xmlquery.ParserOptions{ Decoder: &xmlquery.DecoderOptions{ Strict: true, Entity: xml.HTMLEntity, diff --git a/pkg/parser/epub/media_overlay_service.go b/pkg/parser/epub/media_overlay_service.go index 857e02c9..a10266d8 100644 --- a/pkg/parser/epub/media_overlay_service.go +++ b/pkg/parser/epub/media_overlay_service.go @@ -85,11 +85,7 @@ func (s *MediaOverlayService) GuideForResource(ctx context.Context, href string) res := s.fetcher.Get(ctx, link) defer res.Close() - n, rerr := fetcher.ReadResourceAsXML(ctx, res, map[string]string{ - NamespaceOPS: "epub", - NamespaceSMIL: "smil", - NamespaceSMIL2: "smil2", - }) + n, rerr := fetcher.ReadResourceAsXML(ctx, res) if rerr != nil { return nil, rerr.Cause } diff --git a/pkg/parser/epub/metadata.go b/pkg/parser/epub/metadata.go index d2590fef..f0f9ea64 100644 --- a/pkg/parser/epub/metadata.go +++ b/pkg/parser/epub/metadata.go @@ -6,11 +6,16 @@ import ( "strings" "time" + "github.com/antchfx/xmlquery" "github.com/readium/go-toolkit/pkg/internal/extensions" "github.com/readium/go-toolkit/pkg/manifest" "github.com/readium/go-toolkit/pkg/mediatype" "github.com/readium/go-toolkit/pkg/util/url" - "github.com/readium/xmlquery" +) + +var ( + xpMetaLanguage = mustCompileNS("//opf:metadata/dc:language") + xpMetadata = mustCompileNS("//opf:metadata") ) type Title struct { @@ -52,22 +57,18 @@ func NewMetadataParser(epubVersion float64, prefixMap map[string]string) Metadat func (m MetadataParser) Parse(document *xmlquery.Node, filePath url.URL) *EPUBMetadata { // Init lang - if l := document.SelectElement("/" + NSSelect(NamespaceOPF, "package")); l != nil { + if l := xmlquery.QuerySelector(document, xpPackage); l != nil { for _, attr := range l.Attr { if attr.Name.Local == "lang" { m.packageLanguage = attr.Value } } } - if l := document.SelectElement( - "//" + NSSelect(NamespaceOPF, "metadata") + "/" + NSSelect(NamespaceDC, "language"), - ); l != nil { + if l := xmlquery.QuerySelector(document, xpMetaLanguage); l != nil { m.metaLanguage = strings.TrimSpace(l.InnerText()) } - metadata := document.SelectElement( - "//" + NSSelect(NamespaceOPF, "metadata"), - ) + metadata := xmlquery.QuerySelector(document, xpMetadata) if metadata == nil { return nil } @@ -1091,7 +1092,7 @@ func (m *PubMetadataAdapter) OtherMetadata() map[string]interface{} { if _, ok := usedProperties[k]; ok { continue } - values := make([]interface{}, len(v)) + values := make([]any, len(v)) for i, val := range v { values[i] = val.ToMap() } diff --git a/pkg/parser/epub/metadata_test.go b/pkg/parser/epub/metadata_test.go index fefc6506..9999c04f 100644 --- a/pkg/parser/epub/metadata_test.go +++ b/pkg/parser/epub/metadata_test.go @@ -14,12 +14,7 @@ import ( ) func loadMetadata(ctx context.Context, name string) (*manifest.Metadata, error) { - n, rerr := fetcher.ReadResourceAsXML(ctx, fetcher.NewFileResource(manifest.Link{}, "./testdata/package/"+name+".opf"), map[string]string{ - NamespaceOPF: "opf", - NamespaceDC: "dc", - VocabularyDCTerms: "dcterms", - "http://www.idpf.org/2013/rendition": "rendition", - }) + n, rerr := fetcher.ReadResourceAsXML(ctx, fetcher.NewFileResource(manifest.Link{}, "./testdata/package/"+name+".opf")) if rerr != nil { return nil, rerr.Cause } diff --git a/pkg/parser/epub/parser.go b/pkg/parser/epub/parser.go index 848d311e..93e5f668 100644 --- a/pkg/parser/epub/parser.go +++ b/pkg/parser/epub/parser.go @@ -41,12 +41,7 @@ func (p Parser) Parse(ctx context.Context, asset asset.PublicationAsset, f fetch // Detect DRM - opfXmlDocument, errx := fetcher.ReadResourceAsXML(ctx, f.Get(ctx, manifest.Link{Href: manifest.NewHREF(opfPath)}), map[string]string{ - NamespaceOPF: "opf", - NamespaceDC: "dc", - VocabularyDCTerms: "dcterms", - "http://www.idpf.org/2013/rendition": "rendition", - }) + opfXmlDocument, errx := fetcher.ReadResourceAsXML(ctx, f.Get(ctx, manifest.Link{Href: manifest.NewHREF(opfPath)})) if errx != nil { return nil, errx } @@ -80,11 +75,7 @@ func (p Parser) Parse(ctx context.Context, asset asset.PublicationAsset, f fetch } func parseEncryptionData(ctx context.Context, f fetcher.Fetcher) (ret map[url.URL]manifest.Encryption) { - n, err := fetcher.ReadResourceAsXML(ctx, f.Get(ctx, manifest.Link{Href: manifest.MustNewHREFFromString("META-INF/encryption.xml", false)}), map[string]string{ - NamespaceENC: "enc", - NamespaceSIG: "ds", - NamespaceCOMP: "comp", - }) + n, err := fetcher.ReadResourceAsXML(ctx, f.Get(ctx, manifest.Link{Href: manifest.MustNewHREFFromString("META-INF/encryption.xml", false)})) if err != nil { return } @@ -113,9 +104,7 @@ func parseNavigationData(ctx context.Context, packageDocument PackageDocument, f if ncxItem == nil { return } - n, nerr := fetcher.ReadResourceAsXML(ctx, f.Get(ctx, manifest.Link{Href: manifest.NewHREF(ncxItem.Href)}), map[string]string{ - NamespaceNCX: "ncx", - }) + n, nerr := fetcher.ReadResourceAsXML(ctx, f.Get(ctx, manifest.Link{Href: manifest.NewHREF(ncxItem.Href)})) if nerr != nil { return } @@ -136,10 +125,7 @@ func parseNavigationData(ctx context.Context, packageDocument PackageDocument, f if navItem == nil { return } - n, errx := fetcher.ReadResourceAsXML(ctx, f.Get(ctx, manifest.Link{Href: manifest.NewHREF(navItem.Href)}), map[string]string{ - NamespaceXHTML: "html", - NamespaceOPS: "epub", - }) + n, errx := fetcher.ReadResourceAsXML(ctx, f.Get(ctx, manifest.Link{Href: manifest.NewHREF(navItem.Href)})) if errx != nil { return } @@ -150,9 +136,9 @@ func parseNavigationData(ctx context.Context, packageDocument PackageDocument, f func parseDisplayOptions(ctx context.Context, f fetcher.Fetcher) (ret map[string]string) { ret = make(map[string]string) - displayOptionsXml, err := fetcher.ReadResourceAsXML(ctx, f.Get(ctx, manifest.Link{Href: manifest.MustNewHREFFromString("META-INF/com.apple.ibooks.display-options.xml", false)}), nil) + displayOptionsXml, err := fetcher.ReadResourceAsXML(ctx, f.Get(ctx, manifest.Link{Href: manifest.MustNewHREFFromString("META-INF/com.apple.ibooks.display-options.xml", false)})) if err != nil { - displayOptionsXml, err = fetcher.ReadResourceAsXML(ctx, f.Get(ctx, manifest.Link{Href: manifest.MustNewHREFFromString("META-INF/com.kobobooks.display-options.xml", false)}), nil) + displayOptionsXml, err = fetcher.ReadResourceAsXML(ctx, f.Get(ctx, manifest.Link{Href: manifest.MustNewHREFFromString("META-INF/com.kobobooks.display-options.xml", false)})) if err != nil { return } diff --git a/pkg/parser/epub/parser_encryption.go b/pkg/parser/epub/parser_encryption.go index 6587cbeb..9d74a148 100644 --- a/pkg/parser/epub/parser_encryption.go +++ b/pkg/parser/epub/parser_encryption.go @@ -3,14 +3,26 @@ package epub import ( "strconv" + "github.com/antchfx/xmlquery" "github.com/readium/go-toolkit/pkg/manifest" "github.com/readium/go-toolkit/pkg/protection" "github.com/readium/go-toolkit/pkg/util/url" - "github.com/readium/xmlquery" +) + +var ( + xpEncEncData = mustCompileNS("//enc:EncryptedData") + xpEncCipherData = mustCompileNS("enc:CipherData") + xpEncCipherRef = mustCompileNS("enc:CipherReference") + xpEncKeyInfo = mustCompileNS("ds:KeyInfo") + xpEncRetrieval = mustCompileNS("ds:RetrievalMethod") + xpEncMethod = mustCompileNS("enc:EncryptionMethod") + xpEncProps = mustCompileNS("enc:EncryptionProperties") + xpEncProp = mustCompileNS("enc:EncryptionProperty") + xpEncCompress = mustCompileNS("comp:Compression") ) func ParseEncryption(document *xmlquery.Node) (ret map[url.URL]manifest.Encryption) { - for _, node := range document.SelectElements("//" + NSSelect(NamespaceENC, "EncryptedData")) { + for _, node := range xmlquery.QuerySelectorAll(document, xpEncEncData) { u, e := parseEncryptedData(node) if e != nil { if ret == nil { @@ -23,19 +35,19 @@ func ParseEncryption(document *xmlquery.Node) (ret map[url.URL]manifest.Encrypti } func parseEncryptedData(node *xmlquery.Node) (url.URL, *manifest.Encryption) { - cdat := node.SelectElement(NSSelect(NamespaceENC, "CipherData")) + cdat := xmlquery.QuerySelector(node, xpEncCipherData) if cdat == nil { return nil, nil } - cipherref := cdat.SelectElement(NSSelect(NamespaceENC, "CipherReference")) + cipherref := xmlquery.QuerySelector(cdat, xpEncCipherRef) if cipherref == nil { return nil, nil } resourceURI := cipherref.SelectAttr("URI") retrievalMethod := "" - if keyinfo := node.SelectElement(NSSelect(NamespaceSIG, "KeyInfo")); keyinfo != nil { - if r := keyinfo.SelectElement(NSSelect(NamespaceSIG, "RetrievalMethod")); r != nil { + if keyinfo := xmlquery.QuerySelector(node, xpEncKeyInfo); keyinfo != nil { + if r := xmlquery.QuerySelector(keyinfo, xpEncRetrieval); r != nil { retrievalMethod = r.SelectAttr("URI") } } @@ -48,11 +60,11 @@ func parseEncryptedData(node *xmlquery.Node) (url.URL, *manifest.Encryption) { ret.Scheme = protection.SchemeLCP } - if encryptionmethod := node.SelectElement(NSSelect(NamespaceENC, "EncryptionMethod")); encryptionmethod != nil { + if encryptionmethod := xmlquery.QuerySelector(node, xpEncMethod); encryptionmethod != nil { ret.Algorithm = encryptionmethod.SelectAttr("Algorithm") } - if encryptionproperties := node.SelectElement(NSSelect(NamespaceENC, "EncryptionProperties")); encryptionproperties != nil { + if encryptionproperties := xmlquery.QuerySelector(node, xpEncProps); encryptionproperties != nil { originalLength, method := parseEncryptionProperties(encryptionproperties) if method != "" { ret.Compression = method @@ -69,8 +81,8 @@ func parseEncryptedData(node *xmlquery.Node) (url.URL, *manifest.Encryption) { } func parseEncryptionProperties(encryptionProperties *xmlquery.Node) (int64, string) { - for _, encryptionProperty := range encryptionProperties.SelectElements(NSSelect(NamespaceENC, "EncryptionProperty")) { - if compressionElement := encryptionProperty.SelectElement(NSSelect(NamespaceCOMP, "Compression")); compressionElement != nil { + for _, encryptionProperty := range xmlquery.QuerySelectorAll(encryptionProperties, xpEncProp) { + if compressionElement := xmlquery.QuerySelector(encryptionProperty, xpEncCompress); compressionElement != nil { if originalLength, method := parseCompressionElement(compressionElement); method != "" { return originalLength, method } diff --git a/pkg/parser/epub/parser_encryption_test.go b/pkg/parser/epub/parser_encryption_test.go index cf902485..6b32dd52 100644 --- a/pkg/parser/epub/parser_encryption_test.go +++ b/pkg/parser/epub/parser_encryption_test.go @@ -12,11 +12,7 @@ import ( ) func loadEncryption(ctx context.Context, name string) (map[string]manifest.Encryption, error) { - n, rerr := fetcher.ReadResourceAsXML(ctx, fetcher.NewFileResource(manifest.Link{}, "./testdata/encryption/encryption-"+name+".xml"), map[string]string{ - NamespaceENC: "enc", - NamespaceSIG: "ds", - NamespaceCOMP: "comp", - }) + n, rerr := fetcher.ReadResourceAsXML(ctx, fetcher.NewFileResource(manifest.Link{}, "./testdata/encryption/encryption-"+name+".xml")) if rerr != nil { return nil, rerr.Cause } diff --git a/pkg/parser/epub/parser_navdoc.go b/pkg/parser/epub/parser_navdoc.go index 21d9bc5e..a2c74339 100644 --- a/pkg/parser/epub/parser_navdoc.go +++ b/pkg/parser/epub/parser_navdoc.go @@ -3,9 +3,16 @@ package epub import ( "strings" + "github.com/antchfx/xmlquery" "github.com/readium/go-toolkit/pkg/manifest" "github.com/readium/go-toolkit/pkg/util/url" - "github.com/readium/xmlquery" +) + +var ( + xpNavBody = mustCompileNS("//html:body") + xpNavNav = mustCompileNS("//html:nav") + xpNavOL = mustCompileNS("html:ol") + xpNavLI = mustCompileNS("html:li") ) func ParseNavDoc(document *xmlquery.Node, filePath url.URL) map[string]manifest.LinkList { @@ -17,12 +24,12 @@ func ParseNavDoc(document *xmlquery.Node, filePath url.URL) map[string]manifest. } } - body := document.SelectElement("//" + NSSelect(NamespaceXHTML, "body")) + body := xmlquery.QuerySelector(document, xpNavBody) if body == nil { return ret } - for _, nav := range body.SelectElements("//" + NSSelect(NamespaceXHTML, "nav")) { + for _, nav := range xmlquery.QuerySelectorAll(body, xpNavNav) { types, links := parseNavElement(nav, filePath, docPrefixes) if types == nil && links == nil { continue @@ -53,7 +60,7 @@ func parseNavElement(nav *xmlquery.Node, filePath url.URL, prefixMap map[string] types = append(types, resolveProperty(prop, prefixMap, DefaultVocabType)) } - links := parseOlElement(nav.SelectElement(NSSelect(NamespaceXHTML, "ol")), filePath) + links := parseOlElement(xmlquery.QuerySelector(nav, xpNavOL), filePath) if len(links) > 0 && len(types) > 0 { return types, links } @@ -64,9 +71,9 @@ func parseOlElement(ol *xmlquery.Node, filePath url.URL) manifest.LinkList { if ol == nil { return nil } - ols := ol.SelectElements(NSSelect(NamespaceXHTML, "li")) - links := make(manifest.LinkList, 0, len(ols)) - for _, li := range ol.SelectElements(NSSelect(NamespaceXHTML, "li")) { + lis := xmlquery.QuerySelectorAll(ol, xpNavLI) + links := make(manifest.LinkList, 0, len(lis)) + for _, li := range lis { l := parseLiElement(li, filePath) if l != nil { links = append(links, *l) @@ -96,7 +103,7 @@ func parseLiElement(li *xmlquery.Node, filePath url.URL) (link *manifest.Link) { } } - children := parseOlElement(li.SelectElement(NSSelect(NamespaceXHTML, "ol")), filePath) + children := parseOlElement(xmlquery.QuerySelector(li, xpNavOL), filePath) if len(children) == 0 && (href.String() == "" || title == "") { return nil } diff --git a/pkg/parser/epub/parser_navdoc_test.go b/pkg/parser/epub/parser_navdoc_test.go index 38987d3a..a84b7ee8 100644 --- a/pkg/parser/epub/parser_navdoc_test.go +++ b/pkg/parser/epub/parser_navdoc_test.go @@ -12,10 +12,7 @@ import ( ) func loadNavDoc(ctx context.Context, name string) (map[string]manifest.LinkList, error) { - n, rerr := fetcher.ReadResourceAsXML(ctx, fetcher.NewFileResource(manifest.Link{}, "./testdata/navdoc/"+name+".xhtml"), map[string]string{ - NamespaceXHTML: "html", - NamespaceOPS: "epub", - }) + n, rerr := fetcher.ReadResourceAsXML(ctx, fetcher.NewFileResource(manifest.Link{}, "./testdata/navdoc/"+name+".xhtml")) if rerr != nil { return nil, rerr.Cause } diff --git a/pkg/parser/epub/parser_ncx.go b/pkg/parser/epub/parser_ncx.go index b21ac03d..d4b63e73 100644 --- a/pkg/parser/epub/parser_ncx.go +++ b/pkg/parser/epub/parser_ncx.go @@ -3,14 +3,23 @@ package epub import ( "strings" + "github.com/antchfx/xmlquery" "github.com/readium/go-toolkit/pkg/manifest" "github.com/readium/go-toolkit/pkg/util/url" - "github.com/readium/xmlquery" +) + +var ( + xpNCXNavMap = mustCompileNS("//ncx:navMap") + xpNCXPageList = mustCompileNS("//ncx:pageList") + xpNCXNavPoint = mustCompileNS("ncx:navPoint") + xpNCXPageTgt = mustCompileNS("ncx:pageTarget") + xpNCXContent = mustCompileNS("ncx:content") + xpNCXNavLblTxt = mustCompileNS("ncx:navLabel/ncx:text") ) func ParseNCX(document *xmlquery.Node, filePath url.URL) map[string]manifest.LinkList { - toc := document.SelectElement("//" + NSSelect(NamespaceNCX, "navMap")) - pageList := document.SelectElement("//" + NSSelect(NamespaceNCX, "pageList")) + toc := xmlquery.QuerySelector(document, xpNCXNavMap) + pageList := xmlquery.QuerySelector(document, xpNCXPageList) ret := make(map[string]manifest.LinkList) if toc != nil { @@ -31,7 +40,7 @@ func ParseNCX(document *xmlquery.Node, filePath url.URL) map[string]manifest.Lin func parseNavMapElement(element *xmlquery.Node, filePath url.URL) manifest.LinkList { var links manifest.LinkList - for _, el := range element.SelectElements(NSSelect(NamespaceNCX, "navPoint")) { + for _, el := range xmlquery.QuerySelectorAll(element, xpNCXNavPoint) { if p := parseNavPointElement(el, filePath); p != nil { links = append(links, *p) } @@ -40,7 +49,7 @@ func parseNavMapElement(element *xmlquery.Node, filePath url.URL) manifest.LinkL } func parsePageListElement(element *xmlquery.Node, filePath url.URL) manifest.LinkList { - selectedElements := element.SelectElements(NSSelect(NamespaceNCX, "pageTarget")) + selectedElements := xmlquery.QuerySelectorAll(element, xpNCXPageTgt) links := make([]manifest.Link, 0, len(selectedElements)) for _, el := range selectedElements { href := extractHref(el, filePath) @@ -60,7 +69,7 @@ func parseNavPointElement(element *xmlquery.Node, filePath url.URL) *manifest.Li title := extractTitle(element) href := extractHref(element, filePath) var children manifest.LinkList - for _, el := range element.SelectElements(NSSelect(NamespaceNCX, "navPoint")) { + for _, el := range xmlquery.QuerySelectorAll(element, xpNCXNavPoint) { if p := parseNavPointElement(el, filePath); p != nil { children = append(children, *p) } @@ -79,7 +88,7 @@ func parseNavPointElement(element *xmlquery.Node, filePath url.URL) *manifest.Li } func extractTitle(element *xmlquery.Node) string { - tel := element.SelectElement(NSSelect(NamespaceNCX, "navLabel") + "/" + NSSelect(NamespaceNCX, "text")) + tel := xmlquery.QuerySelector(element, xpNCXNavLblTxt) if tel == nil { return "" } @@ -87,7 +96,7 @@ func extractTitle(element *xmlquery.Node) string { } func extractHref(element *xmlquery.Node, filePath url.URL) url.URL { - el := element.SelectElement(NSSelect(NamespaceNCX, "content")) + el := xmlquery.QuerySelector(element, xpNCXContent) if el == nil { return nil } diff --git a/pkg/parser/epub/parser_ncx_test.go b/pkg/parser/epub/parser_ncx_test.go index 64ad9e9f..f021fe61 100644 --- a/pkg/parser/epub/parser_ncx_test.go +++ b/pkg/parser/epub/parser_ncx_test.go @@ -12,9 +12,7 @@ import ( ) func loadNcx(ctx context.Context, name string) (map[string]manifest.LinkList, error) { - n, rerr := fetcher.ReadResourceAsXML(ctx, fetcher.NewFileResource(manifest.Link{}, "./testdata/ncx/"+name+".ncx"), map[string]string{ - NamespaceNCX: "ncx", - }) + n, rerr := fetcher.ReadResourceAsXML(ctx, fetcher.NewFileResource(manifest.Link{}, "./testdata/ncx/"+name+".ncx")) if rerr != nil { return nil, rerr.Cause } diff --git a/pkg/parser/epub/parser_packagedoc.go b/pkg/parser/epub/parser_packagedoc.go index cf1b3f35..6c6414a5 100644 --- a/pkg/parser/epub/parser_packagedoc.go +++ b/pkg/parser/epub/parser_packagedoc.go @@ -3,11 +3,19 @@ package epub import ( "strconv" + "github.com/antchfx/xmlquery" "github.com/pkg/errors" "github.com/readium/go-toolkit/pkg/manifest" "github.com/readium/go-toolkit/pkg/mediatype" "github.com/readium/go-toolkit/pkg/util/url" - "github.com/readium/xmlquery" +) + +var ( + xpPackage = mustCompileNS("/opf:package") + xpManifest = mustCompileNS("/opf:manifest") + xpSpine = mustCompileNS("/opf:spine") + xpItems = mustCompileNS("/opf:item") + xpItemRefs = mustCompileNS("/opf:itemref") ) type PackageDocument struct { @@ -21,7 +29,7 @@ type PackageDocument struct { } func ParsePackageDocument(document *xmlquery.Node, filePath url.URL) (*PackageDocument, error) { - pkg := document.SelectElement("/" + NSSelect(NamespaceOPF, "package")) + pkg := xmlquery.QuerySelector(document, xpPackage) if pkg == nil { return nil, errors.New("package root element not found") } @@ -48,16 +56,16 @@ func ParsePackageDocument(document *xmlquery.Node, filePath url.URL) (*PackageDo if metadata == nil { return nil, errors.New("failed parsing package metadata") } - manifestElement := pkg.SelectElement("/" + NSSelect(NamespaceOPF, "manifest")) + manifestElement := xmlquery.QuerySelector(pkg, xpManifest) if manifestElement == nil { return nil, errors.New("package manifest not found") } - spineElement := pkg.SelectElement("/" + NSSelect(NamespaceOPF, "spine")) + spineElement := xmlquery.QuerySelector(pkg, xpSpine) if spineElement == nil { return nil, errors.New("package spine not found") } - mels := manifestElement.SelectElements("/" + NSSelect(NamespaceOPF, "item")) + mels := xmlquery.QuerySelectorAll(manifestElement, xpItems) manifest := make([]Item, 0, len(mels)) for _, mel := range mels { item := ParseItem(mel, filePath, prefixMap) @@ -126,9 +134,7 @@ type Spine struct { } func ParseSpine(element *xmlquery.Node, prefixMap map[string]string, epubVersion float64) Spine { - selectedElements := element.SelectElements( - "/" + NSSelect(NamespaceOPF, "itemref"), - ) + selectedElements := xmlquery.QuerySelectorAll(element, xpItemRefs) itemrefs := make([]ItemRef, 0, len(selectedElements)) for _, itemref := range selectedElements { itemref := ParseItemRef(itemref, prefixMap) diff --git a/pkg/parser/epub/parser_packagedoc_test.go b/pkg/parser/epub/parser_packagedoc_test.go index 661e1c89..ce84a460 100644 --- a/pkg/parser/epub/parser_packagedoc_test.go +++ b/pkg/parser/epub/parser_packagedoc_test.go @@ -13,12 +13,7 @@ import ( ) func loadPackageDoc(ctx context.Context, name string) (*manifest.Manifest, error) { - n, rerr := fetcher.ReadResourceAsXML(ctx, fetcher.NewFileResource(manifest.Link{}, "./testdata/package/"+name+".opf"), map[string]string{ - NamespaceOPF: "opf", - NamespaceDC: "dc", - VocabularyDCTerms: "dcterms", - "http://www.idpf.org/2013/rendition": "rendition", - }) + n, rerr := fetcher.ReadResourceAsXML(ctx, fetcher.NewFileResource(manifest.Link{}, "./testdata/package/"+name+".opf")) if rerr != nil { return nil, rerr.Cause } diff --git a/pkg/parser/epub/parser_smil.go b/pkg/parser/epub/parser_smil.go index 1a38aca3..f573af06 100644 --- a/pkg/parser/epub/parser_smil.go +++ b/pkg/parser/epub/parser_smil.go @@ -3,21 +3,29 @@ package epub import ( "strconv" + "github.com/antchfx/xmlquery" "github.com/pkg/errors" "github.com/readium/go-toolkit/pkg/manifest" "github.com/readium/go-toolkit/pkg/util/url" - "github.com/readium/xmlquery" +) + +var ( + xpSMILRoot = mustCompileNS("/smil:smil | /smil2:smil") + xpSMILBody = mustCompileNS("smil:body | smil2:body") + xpSMILParOrSeq = mustCompileNS("smil:par | smil:seq | smil2:par | smil2:seq") + xpSMILTextChild = mustCompileNS("smil:text | smil2:text") + xpSMILAudio = mustCompileNS("smil:audio | smil2:audio") ) func ParseSMILDocument(document *xmlquery.Node, filePath url.URL) (*manifest.GuidedNavigationDocument, error) { - smil := document.SelectElement("/" + DualNSSelect(NamespaceSMIL, NamespaceSMIL2, "smil")) + smil := xmlquery.QuerySelector(document, xpSMILRoot) if smil == nil { return nil, errors.New("SMIL root element not found") } // Ignore the , we don't need it with the current implementation - body := smil.SelectElement(DualNSSelect(NamespaceSMIL, NamespaceSMIL2, "body")) + body := xmlquery.QuerySelector(smil, xpSMILBody) if body == nil { return nil, errors.New("SMIL body not found") } @@ -32,7 +40,7 @@ func ParseSMILDocument(document *xmlquery.Node, filePath url.URL) (*manifest.Gui } func ParseSMILSeq(seq *xmlquery.Node, filePath url.URL) ([]manifest.GuidedNavigationObject, error) { - childElements := seq.SelectElements(ManyNSSelectMany([]string{NamespaceSMIL, NamespaceSMIL2}, []string{"par", "seq"})) + childElements := xmlquery.QuerySelectorAll(seq, xpSMILParOrSeq) if len(childElements) == 0 && seq.Data == "body" { return nil, errors.New("SMIL body is empty") } @@ -84,7 +92,7 @@ func ParseSMILSeq(seq *xmlquery.Node, filePath url.URL) ([]manifest.GuidedNaviga } func ParseSMILPar(par *xmlquery.Node, filePath url.URL) (*manifest.GuidedNavigationObject, error) { - text := par.SelectElement(DualNSSelect(NamespaceSMIL, NamespaceSMIL2, "text")) + text := xmlquery.QuerySelector(par, xpSMILTextChild) if text == nil { return nil, errors.New("SMIL par has no text element") } @@ -101,7 +109,7 @@ func ParseSMILPar(par *xmlquery.Node, filePath url.URL) (*manifest.GuidedNavigat o.TextRef = filePath.Resolve(u).String() // Audio is optional - if audio := par.SelectElement(DualNSSelect(NamespaceSMIL, NamespaceSMIL2, "audio")); audio != nil { + if audio := xmlquery.QuerySelector(par, xpSMILAudio); audio != nil { o.AudioRef = audio.SelectAttr("src") if o.AudioRef == "" { return nil, errors.New("SMIL par audio element has empty src attribute") diff --git a/pkg/parser/epub/parser_smil_test.go b/pkg/parser/epub/parser_smil_test.go index dbfe8e0d..9bb42786 100644 --- a/pkg/parser/epub/parser_smil_test.go +++ b/pkg/parser/epub/parser_smil_test.go @@ -12,11 +12,7 @@ import ( ) func loadSmil(ctx context.Context, name string) (*manifest.GuidedNavigationDocument, error) { - n, rerr := fetcher.ReadResourceAsXML(ctx, fetcher.NewFileResource(manifest.Link{}, "./testdata/smil/"+name+".smil"), map[string]string{ - NamespaceOPS: "epub", - NamespaceSMIL: "smil", - NamespaceSMIL2: "smil2", - }) + n, rerr := fetcher.ReadResourceAsXML(ctx, fetcher.NewFileResource(manifest.Link{}, "./testdata/smil/"+name+".smil")) if rerr != nil { return nil, rerr.Cause } diff --git a/pkg/parser/epub/utils.go b/pkg/parser/epub/utils.go index 9d6b453c..08215b59 100644 --- a/pkg/parser/epub/utils.go +++ b/pkg/parser/epub/utils.go @@ -3,26 +3,52 @@ package epub import ( "context" "strconv" - "strings" + "github.com/antchfx/xmlquery" + "github.com/antchfx/xpath" "github.com/pkg/errors" "github.com/readium/go-toolkit/pkg/fetcher" ftchr "github.com/readium/go-toolkit/pkg/fetcher" "github.com/readium/go-toolkit/pkg/manifest" "github.com/readium/go-toolkit/pkg/util/url" - "github.com/readium/xmlquery" ) +// xmlNS maps the XML namespace prefixes used by precompiled XPath expressions +// throughout this package to their canonical namespace URIs. Update this map +// when introducing queries against a new namespace. +var xmlNS = map[string]string{ + "opf": NamespaceOPF, + "dc": NamespaceDC, + "rendition": "http://www.idpf.org/2013/rendition", + "enc": NamespaceENC, + "ds": NamespaceSIG, + "comp": NamespaceCOMP, + "ncx": NamespaceNCX, + "html": NamespaceXHTML, + "epub": NamespaceOPS, + "smil": NamespaceSMIL, + "smil2": NamespaceSMIL2, + "ocf": NamespaceOPC, +} + +var xpRootfile = mustCompileNS("/ocf:container/ocf:rootfiles/ocf:rootfile") + +func mustCompileNS(expr string) *xpath.Expr { + e, err := xpath.CompileWithNS(expr, xmlNS) + if err != nil { + panic("epub: invalid xpath " + expr + ": " + err.Error()) + } + return e +} + func GetRootFilePath(ctx context.Context, fetcher fetcher.Fetcher) (url.URL, error) { res := fetcher.Get(ctx, manifest.Link{Href: manifest.MustNewHREFFromString("META-INF/container.xml", false)}) - xml, err := ftchr.ReadResourceAsXML(ctx, res, map[string]string{ - "urn:oasis:names:tc:opendocument:xmlns:container": "cn", - }) + xml, err := ftchr.ReadResourceAsXML(ctx, res) if err != nil { return nil, errors.Wrap(err, "failed loading container.xml") } - n := xml.SelectElement("/container/rootfiles/rootfile") + n := xmlquery.QuerySelector(xml, xpRootfile) if n == nil { return nil, errors.New("rootfile not found in container") } @@ -38,46 +64,6 @@ func GetRootFilePath(ctx context.Context, fetcher fetcher.Fetcher) (url.URL, err return u, nil } -// TODO: Use updated xpath/xmlquery functions -func NSSelect(namespace, localName string) string { - return "*[namespace-uri()='" + namespace + "' and local-name()='" + localName + "']" -} - -// TODO: Use updated xpath/xmlquery functions -func DualNSSelect(namespace1, namespace2, localName string) string { - return "*[(namespace-uri()='" + namespace1 + "' or namespace-uri()='" + namespace2 + "') and local-name()='" + localName + "']" -} - -// TODO: Use updated xpath/xmlquery functions -func ManyNSSelectMany(namespaces []string, localNames []string) string { - if len(namespaces) == 0 || len(localNames) == 0 { - panic("namespaces and localNames must not be empty") - } - - var sb strings.Builder - sb.WriteString("*[(") - for i, ns := range namespaces { - if i > 0 { - sb.WriteString(" or ") - } - sb.WriteString("namespace-uri()='") - sb.WriteString(ns) - sb.WriteString("'") - } - sb.WriteString(") and (") - for i, ln := range localNames { - if i > 0 { - sb.WriteString(" or ") - } - sb.WriteString("local-name()='") - sb.WriteString(ln) - sb.WriteString("'") - } - sb.WriteString(")]") - - return sb.String() -} - func SelectNodeAttrNs(n *xmlquery.Node, ns, name string) string { for _, a := range n.Attr { if a.NamespaceURI == ns && a.Name.Local == name {