diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index 59f5333..b04e5ad 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -48,6 +48,7 @@ import org.xml.sax.InputSource; import org.xml.sax.SAXException; +import javax.xml.XMLConstants; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; @@ -56,7 +57,9 @@ import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; import java.io.File; +import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStream; import java.io.StringReader; import java.util.*; import java.util.stream.Collectors; @@ -93,6 +96,55 @@ public class DatasetParser extends AbstractParser { private DatasetContextClassifier datasetContextClassifier; private DatasetDisambiguator disambiguator; + // Cached JAXP factories to avoid ServiceLoader churn and classloader-cache + // accumulation across TEI/XML requests. Factories are not thread-safe for + // new*() calls, hence the synchronized helpers below. The factory is also + // hardened against XXE/SSRF since it parses user-supplied XML/TEI. + private static final DocumentBuilderFactory DOC_BUILDER_FACTORY; + private static final XPathFactory XPATH_FACTORY = XPathFactory.newInstance(); + + static { + DOC_BUILDER_FACTORY = DocumentBuilderFactory.newInstance(); + DOC_BUILDER_FACTORY.setNamespaceAware(true); + hardenDocumentBuilderFactory(DOC_BUILDER_FACTORY); + } + + /** + * Apply a conservative XXE/SSRF hardening to a shared DocumentBuilderFactory. + * Each feature is set in its own try/catch so that unsupported features on a + * given JAXP implementation do not prevent class initialisation. + */ + private static void hardenDocumentBuilderFactory(DocumentBuilderFactory factory) { + trySetFeature(factory, XMLConstants.FEATURE_SECURE_PROCESSING, true); + trySetFeature(factory, "http://apache.org/xml/features/disallow-doctype-decl", true); + trySetFeature(factory, "http://xml.org/sax/features/external-general-entities", false); + trySetFeature(factory, "http://xml.org/sax/features/external-parameter-entities", false); + trySetFeature(factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + try { + factory.setXIncludeAware(false); + } catch (UnsupportedOperationException | AbstractMethodError ignore) { + // older JAXP impls + } + factory.setExpandEntityReferences(false); + } + + private static void trySetFeature(DocumentBuilderFactory factory, String feature, boolean value) { + try { + factory.setFeature(feature, value); + } catch (ParserConfigurationException e) { + LOGGER.debug("Unsupported DocumentBuilderFactory feature: {}", feature); + } + } + + private static synchronized DocumentBuilder newDocumentBuilder() + throws ParserConfigurationException { + return DOC_BUILDER_FACTORY.newDocumentBuilder(); + } + + private static synchronized XPath newXPath() { + return XPATH_FACTORY.newXPath(); + } + private static void warnGluttonNotConfiguredOnce() { if (gluttonWarningLogged.compareAndSet(false, true)) { LOGGER.warn("Glutton host not configured, bibliographical reference consolidation will be skipped"); @@ -1532,14 +1584,15 @@ public List> markDAS(List> entities, List>, List> processXML(File file, boolean segmentSentences, boolean disambiguate) throws IOException { Pair>, List> resultExtraction = null; + org.w3c.dom.Document document = null; try { String tei = processXML(file); - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); - factory.setNamespaceAware(true); - DocumentBuilder builder = factory.newDocumentBuilder(); + DocumentBuilder builder = newDocumentBuilder(); //tei = avoidDomParserAttributeBug(tei); - org.w3c.dom.Document document = builder.parse(new InputSource(new StringReader(tei))); + try (StringReader reader = new StringReader(tei)) { + document = builder.parse(new InputSource(reader)); + } //document.getDocumentElement().normalize(); // TODO: call pub2TEI with sentence segmentation @@ -1549,17 +1602,29 @@ public Pair>, List> processXML(File file, boolean } catch (final Exception exp) { LOGGER.error("An error occured while processing the following XML file: " + file.getPath(), exp); + } finally { + // Release the DOM tree eagerly so the (large) node graph is collectible + // as soon as this request returns. + document = null; } return resultExtraction; } public Pair>, List> processTEI(File file, boolean segmentSentences, boolean disambiguate) throws IOException { Pair>, List> resultExtraction = null; + org.w3c.dom.Document document = null; try { - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); - factory.setNamespaceAware(true); - DocumentBuilder builder = factory.newDocumentBuilder(); - org.w3c.dom.Document document = builder.parse(file); + DocumentBuilder builder = newDocumentBuilder(); + // Parse via an explicitly managed InputStream so the file handle is + // released deterministically (DocumentBuilder.parse(File) defers stream + // closure to Xerces internals, which accumulates FDs under load). + // Set systemId so relative references (DTD/entities/XInclude) and + // error locations behave as they did with parse(File). + try (InputStream is = new FileInputStream(file)) { + InputSource inputSource = new InputSource(is); + inputSource.setSystemId(file.toURI().toString()); + document = builder.parse(inputSource); + } org.w3c.dom.Element root = document.getDocumentElement(); boolean hasSegmentation = hasTEISentenceSegmentation(root); @@ -1568,11 +1633,15 @@ public Pair>, List> processTEI(File file, boolean } resultExtraction = processTEIDocument(document, disambiguate); - //tei = restoreDomParserAttributeBug(tei); + //tei = restoreDomParserAttributeBug(tei); } catch (final Exception exp) { LOGGER.error("An error occured while processing the following XML file: " + file.getPath(), exp); + } finally { + // Release the DOM tree eagerly so the (large) node graph is collectible + // as soon as this request returns. + document = null; } return resultExtraction; @@ -1596,8 +1665,6 @@ public String processXML(File file) throws Exception { this.datastetConfiguration.getDatastetConfiguration().getPub2TEIPath()); //System.out.println(newFilePath); - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); - factory.setNamespaceAware(true); tei = FileUtils.readFileToString(new File(newFilePath), UTF_8); } catch (final Exception exp) { @@ -1621,11 +1688,10 @@ public Pair>, List> processTEIDocument(String doc boolean disambiguate) { Pair>, List> tei = null; + org.w3c.dom.Document document = null; try (StringReader reader = new StringReader(documentAsString);){ - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); - factory.setNamespaceAware(true); - DocumentBuilder builder = factory.newDocumentBuilder(); - org.w3c.dom.Document document = builder.parse(new InputSource(reader)); + DocumentBuilder builder = newDocumentBuilder(); + document = builder.parse(new InputSource(reader)); //document.getDocumentElement().normalize(); org.w3c.dom.Element root = document.getDocumentElement(); @@ -1638,6 +1704,9 @@ public Pair>, List> processTEIDocument(String doc tei = processTEIDocument(document, disambiguate); } catch (ParserConfigurationException | IOException | SAXException e) { e.printStackTrace(); + } finally { + // Release the DOM tree eagerly. + document = null; } return tei; @@ -1656,7 +1725,7 @@ public Pair>, List> processTEIDocument(org.w3c.do //Extract relevant section from the TEI // Title, abstract, keywords - XPath xPath = XPathFactory.newInstance().newXPath(); + XPath xPath = newXPath(); try { org.w3c.dom.Node titleNode = (org.w3c.dom.Node) xPath.evaluate( diff --git a/src/main/java/org/grobid/core/utilities/XMLUtilities.java b/src/main/java/org/grobid/core/utilities/XMLUtilities.java index 634a231..2f96875 100644 --- a/src/main/java/org/grobid/core/utilities/XMLUtilities.java +++ b/src/main/java/org/grobid/core/utilities/XMLUtilities.java @@ -14,11 +14,15 @@ import org.w3c.dom.Text; import org.xml.sax.InputSource; +import javax.xml.XMLConstants; +import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; @@ -44,16 +48,112 @@ public class XMLUtilities { public static final String URL_TYPE = "url"; private static final String URI_TYPE = "uri"; + // Cached JAXP factories. JAXP factories are not guaranteed thread-safe for + // their new*() methods; synchronized accessors below produce per-call + // builders/parsers/transformers that are themselves single-thread only. + // Caching avoids repeated ServiceLoader discovery which, under sustained + // TEI load, left classloader-backed references accumulating on the heap. + // Each factory is also hardened against XXE/SSRF since callers parse + // user-supplied XML/TEI; features are set defensively so unsupported + // options on a given JAXP implementation do not break class init. + private static final DocumentBuilderFactory DBF; + private static final SAXParserFactory SPF; + private static final XPathFactory XPF = XPathFactory.newInstance(); + private static final TransformerFactory TF; + + static { + DBF = DocumentBuilderFactory.newInstance(); + DBF.setNamespaceAware(true); + hardenDocumentBuilderFactory(DBF); + + SPF = SAXParserFactory.newInstance(); + hardenSAXParserFactory(SPF); + + TF = TransformerFactory.newInstance(); + hardenTransformerFactory(TF); + } + + private static void hardenDocumentBuilderFactory(DocumentBuilderFactory factory) { + trySetDBFeature(factory, XMLConstants.FEATURE_SECURE_PROCESSING, true); + trySetDBFeature(factory, "http://apache.org/xml/features/disallow-doctype-decl", true); + trySetDBFeature(factory, "http://xml.org/sax/features/external-general-entities", false); + trySetDBFeature(factory, "http://xml.org/sax/features/external-parameter-entities", false); + trySetDBFeature(factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + try { + factory.setXIncludeAware(false); + } catch (UnsupportedOperationException | AbstractMethodError ignore) { + // older JAXP impls + } + factory.setExpandEntityReferences(false); + } + + private static void trySetDBFeature(DocumentBuilderFactory factory, String feature, boolean value) { + try { + factory.setFeature(feature, value); + } catch (ParserConfigurationException e) { + LOGGER.debug("Unsupported DocumentBuilderFactory feature: {}", feature); + } + } + + private static void hardenSAXParserFactory(SAXParserFactory factory) { + trySetSPFeature(factory, XMLConstants.FEATURE_SECURE_PROCESSING, true); + trySetSPFeature(factory, "http://apache.org/xml/features/disallow-doctype-decl", true); + trySetSPFeature(factory, "http://xml.org/sax/features/external-general-entities", false); + trySetSPFeature(factory, "http://xml.org/sax/features/external-parameter-entities", false); + trySetSPFeature(factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + } + + private static void trySetSPFeature(SAXParserFactory factory, String feature, boolean value) { + try { + factory.setFeature(feature, value); + } catch (Exception e) { + LOGGER.debug("Unsupported SAXParserFactory feature: {}", feature); + } + } + + private static void hardenTransformerFactory(TransformerFactory factory) { + try { + factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); + } catch (TransformerConfigurationException e) { + LOGGER.debug("Unsupported TransformerFactory feature: FEATURE_SECURE_PROCESSING"); + } + trySetTFAttribute(factory, XMLConstants.ACCESS_EXTERNAL_DTD, ""); + trySetTFAttribute(factory, XMLConstants.ACCESS_EXTERNAL_STYLESHEET, ""); + } + + private static void trySetTFAttribute(TransformerFactory factory, String attribute, Object value) { + try { + factory.setAttribute(attribute, value); + } catch (IllegalArgumentException e) { + LOGGER.debug("Unsupported TransformerFactory attribute: {}", attribute); + } + } + + private static synchronized DocumentBuilder newBuilder() throws ParserConfigurationException { + return DBF.newDocumentBuilder(); + } + + private static synchronized SAXParser newSAXParser() throws Exception { + return SPF.newSAXParser(); + } + + private static synchronized XPath newXPath() { + return XPF.newXPath(); + } + + private static synchronized Transformer newTransformer() throws TransformerConfigurationException { + return TF.newTransformer(); + } + public static String toPrettyString(String xml, int indent) { try (ByteArrayInputStream inputStream = new ByteArrayInputStream(xml.getBytes("utf-8"))) { // Turn xml string into a document - org.w3c.dom.Document document = DocumentBuilderFactory.newInstance() - .newDocumentBuilder() + org.w3c.dom.Document document = newBuilder() .parse(new InputSource(inputStream)); // Remove whitespaces outside tags document.normalize(); - XPath xPath = XPathFactory.newInstance().newXPath(); + XPath xPath = newXPath(); org.w3c.dom.NodeList nodeList = (org.w3c.dom.NodeList) xPath.evaluate("//text()[normalize-space()='']", document, XPathConstants.NODESET); @@ -64,8 +164,7 @@ public static String toPrettyString(String xml, int indent) { } // Setup pretty print options - TransformerFactory transformerFactory = TransformerFactory.newInstance(); - Transformer transformer = transformerFactory.newTransformer(); + Transformer transformer = newTransformer(); transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2"); transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); @@ -117,8 +216,7 @@ public static BiblioItem parseTEIBiblioItem(org.w3c.dom.Document doc, org.w3c.do BiblStructSaxHandler handler = new BiblStructSaxHandler(); String teiXML = null; try { - SAXParserFactory spf = SAXParserFactory.newInstance(); - SAXParser p = spf.newSAXParser(); + SAXParser p = newSAXParser(); teiXML = serialize(doc, biblStructElement); try (StringReader reader = new StringReader(teiXML)) { p.parse(new InputSource(reader), handler); @@ -267,9 +365,8 @@ public static String serialize(org.w3c.dom.Document doc, Node node) { try { Object evalContext = (node != null) ? node : doc; if (evalContext != null) { - XPathFactory xpathFactory = XPathFactory.newInstance(); // XPath to find empty text nodes. - XPathExpression xpathExp = xpathFactory.newXPath().compile( + XPathExpression xpathExp = newXPath().compile( "//text()[normalize-space(.) = '']"); NodeList emptyTextNodes = (NodeList) xpathExp.evaluate(evalContext, XPathConstants.NODESET); @@ -293,8 +390,7 @@ public static String serialize(org.w3c.dom.Document doc, Node node) { try (StringWriter writer = new StringWriter()) { StreamResult result = new StreamResult(writer); - TransformerFactory tf = TransformerFactory.newInstance(); - Transformer transformer = tf.newTransformer(); + Transformer transformer = newTransformer(); transformer.setOutputProperty(OutputKeys.METHOD, "xml"); transformer.setOutputProperty(OutputKeys.INDENT, "yes"); transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); @@ -321,13 +417,11 @@ public static void cleanXMLCorpus(String documentPath) throws Exception { File outputFile = new File(documentPath.replace(".tei.xml", ".clean.tei.xml")); // we use a DOM parser - org.w3c.dom.Document document = DocumentBuilderFactory.newInstance() - .newDocumentBuilder() - .parse(documentFile); + org.w3c.dom.Document document = newBuilder().parse(documentFile); // remove tei entries with empty body document.normalize(); - XPath xPath = XPathFactory.newInstance().newXPath(); + XPath xPath = newXPath(); org.w3c.dom.NodeList nodeList = (org.w3c.dom.NodeList) xPath.evaluate("//tei/text/body", document, XPathConstants.NODESET); @@ -375,8 +469,7 @@ public static void cleanXMLCorpus(String documentPath) throws Exception { } // Setup pretty print options - TransformerFactory transformerFactory = TransformerFactory.newInstance(); - Transformer transformer = transformerFactory.newTransformer(); + Transformer transformer = newTransformer(); transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2"); transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); @@ -390,9 +483,7 @@ public static void cleanXMLCorpus(String documentPath) throws Exception { // check again if everything is well-formed after the changes try (ByteArrayInputStream inputStream = new ByteArrayInputStream(stringWriter.toString().getBytes(StandardCharsets.UTF_8))) { - document = DocumentBuilderFactory.newInstance() - .newDocumentBuilder() - .parse(new InputSource(inputStream)); + document = newBuilder().parse(new InputSource(inputStream)); } catch (Exception e) { System.out.println("Problem with the final TEI XML"); e.printStackTrace(); @@ -511,10 +602,7 @@ public static void segment(org.w3c.dom.Document doc, Node node) { String fullSent = "" + newSent + ""; boolean fail = false; try (StringReader reader = new StringReader(fullSent)) { - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); - factory.setNamespaceAware(true); - - org.w3c.dom.Document d = factory.newDocumentBuilder().parse(new InputSource(reader)); + org.w3c.dom.Document d = newBuilder().parse(new InputSource(reader)); } catch (Exception e) { fail = true; } @@ -540,9 +628,7 @@ public static void segment(org.w3c.dom.Document doc, Node node) { //System.out.println(sent); try (StringReader reader = new StringReader(sent)) { - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); - factory.setNamespaceAware(true); - org.w3c.dom.Document d = factory.newDocumentBuilder().parse(new InputSource(reader)); + org.w3c.dom.Document d = newBuilder().parse(new InputSource(reader)); //d.getDocumentElement().normalize(); Node newNode = doc.importNode(d.getDocumentElement(), true); newNodes.add(newNode);