Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 85 additions & 16 deletions src/main/java/org/grobid/core/engines/DatasetParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
Expand All @@ -56,7 +57,9 @@
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.*;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -93,6 +96,55 @@ public class DatasetParser extends AbstractParser {
private DatasetContextClassifier datasetContextClassifier;
private DatasetDisambiguator disambiguator;

// Cached JAXP factories to avoid ServiceLoader churn and classloader-cache
// accumulation across TEI/XML requests. Factories are not thread-safe for
// new*() calls, hence the synchronized helpers below. The factory is also
// hardened against XXE/SSRF since it parses user-supplied XML/TEI.
// NOTE: DOC_BUILDER_FACTORY is assigned in the static block (not inline) so
// that hardening happens exactly once, at class initialisation time.
private static final DocumentBuilderFactory DOC_BUILDER_FACTORY;
private static final XPathFactory XPATH_FACTORY = XPathFactory.newInstance();

static {
DOC_BUILDER_FACTORY = DocumentBuilderFactory.newInstance();
// Namespace-aware parsing, matching the per-call factories this cache replaces.
DOC_BUILDER_FACTORY.setNamespaceAware(true);
hardenDocumentBuilderFactory(DOC_BUILDER_FACTORY);
}

/**
 * Harden a shared DocumentBuilderFactory against XXE/SSRF before it is used
 * to parse user-supplied XML/TEI. Every feature toggle is applied
 * individually (via trySetFeature) so that a JAXP implementation lacking one
 * of them cannot prevent class initialisation.
 */
private static void hardenDocumentBuilderFactory(DocumentBuilderFactory factory) {
    // Features that must be switched ON for secure processing.
    final String[] enabledFeatures = {
            XMLConstants.FEATURE_SECURE_PROCESSING,
            "http://apache.org/xml/features/disallow-doctype-decl"
    };
    // Features that must be switched OFF to block external entity resolution.
    final String[] disabledFeatures = {
            "http://xml.org/sax/features/external-general-entities",
            "http://xml.org/sax/features/external-parameter-entities",
            "http://apache.org/xml/features/nonvalidating/load-external-dtd"
    };
    for (String feature : enabledFeatures) {
        trySetFeature(factory, feature, true);
    }
    for (String feature : disabledFeatures) {
        trySetFeature(factory, feature, false);
    }
    try {
        factory.setXIncludeAware(false);
    } catch (UnsupportedOperationException | AbstractMethodError ignored) {
        // Older JAXP implementations may not support toggling XInclude.
    }
    factory.setExpandEntityReferences(false);
}

/**
 * Best-effort feature toggle: attempts to set the given feature on the
 * factory, logging at debug level instead of propagating when the
 * underlying JAXP implementation does not recognise the feature.
 */
private static void trySetFeature(DocumentBuilderFactory factory, String feature, boolean value) {
    try {
        factory.setFeature(feature, value);
    } catch (ParserConfigurationException unsupported) {
        // Not fatal: the remaining hardening features are still applied.
        LOGGER.debug("Unsupported DocumentBuilderFactory feature: {}", feature);
    }
}
Comment thread
lfoppiano marked this conversation as resolved.

/**
 * Creates a DocumentBuilder from the shared, hardened factory.
 * Synchronized because the factory instance is shared across requests and
 * DocumentBuilderFactory makes no thread-safety guarantee for
 * newDocumentBuilder() calls (see the note on the cached factories above
 * their declaration).
 *
 * @throws ParserConfigurationException if the builder cannot be created
 */
private static synchronized DocumentBuilder newDocumentBuilder()
throws ParserConfigurationException {
return DOC_BUILDER_FACTORY.newDocumentBuilder();
}

/**
 * Creates an XPath instance from the shared cached factory.
 * Synchronized because the factory instance is shared and XPathFactory
 * makes no thread-safety guarantee for newXPath() calls.
 */
private static synchronized XPath newXPath() {
return XPATH_FACTORY.newXPath();
}

private static void warnGluttonNotConfiguredOnce() {
if (gluttonWarningLogged.compareAndSet(false, true)) {
LOGGER.warn("Glutton host not configured, bibliographical reference consolidation will be skipped");
Expand Down Expand Up @@ -1532,14 +1584,15 @@ public List<List<Dataset>> markDAS(List<List<Dataset>> entities, List<LayoutToke

public Pair<List<List<Dataset>>, List<BibDataSet>> processXML(File file, boolean segmentSentences, boolean disambiguate) throws IOException {
Pair<List<List<Dataset>>, List<BibDataSet>> resultExtraction = null;
org.w3c.dom.Document document = null;
try {
String tei = processXML(file);
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
factory.setNamespaceAware(true);
DocumentBuilder builder = factory.newDocumentBuilder();
DocumentBuilder builder = newDocumentBuilder();
//tei = avoidDomParserAttributeBug(tei);

org.w3c.dom.Document document = builder.parse(new InputSource(new StringReader(tei)));
try (StringReader reader = new StringReader(tei)) {
document = builder.parse(new InputSource(reader));
}
//document.getDocumentElement().normalize();

// TODO: call pub2TEI with sentence segmentation
Expand All @@ -1549,17 +1602,29 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processXML(File file, boolean
} catch (final Exception exp) {
LOGGER.error("An error occured while processing the following XML file: "
+ file.getPath(), exp);
} finally {
// Release the DOM tree eagerly so the (large) node graph is collectible
// as soon as this request returns.
document = null;
}
return resultExtraction;
}

public Pair<List<List<Dataset>>, List<BibDataSet>> processTEI(File file, boolean segmentSentences, boolean disambiguate) throws IOException {
Pair<List<List<Dataset>>, List<BibDataSet>> resultExtraction = null;
org.w3c.dom.Document document = null;
try {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
factory.setNamespaceAware(true);
DocumentBuilder builder = factory.newDocumentBuilder();
org.w3c.dom.Document document = builder.parse(file);
DocumentBuilder builder = newDocumentBuilder();
// Parse via an explicitly managed InputStream so the file handle is
// released deterministically (DocumentBuilder.parse(File) defers stream
// closure to Xerces internals, which accumulates FDs under load).
// Set systemId so relative references (DTD/entities/XInclude) and
// error locations behave as they did with parse(File).
try (InputStream is = new FileInputStream(file)) {
InputSource inputSource = new InputSource(is);
inputSource.setSystemId(file.toURI().toString());
document = builder.parse(inputSource);
}
org.w3c.dom.Element root = document.getDocumentElement();
boolean hasSegmentation = hasTEISentenceSegmentation(root);

Expand All @@ -1568,11 +1633,15 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEI(File file, boolean
}

resultExtraction = processTEIDocument(document, disambiguate);
//tei = restoreDomParserAttributeBug(tei);
//tei = restoreDomParserAttributeBug(tei);

} catch (final Exception exp) {
LOGGER.error("An error occured while processing the following XML file: "
+ file.getPath(), exp);
} finally {
// Release the DOM tree eagerly so the (large) node graph is collectible
// as soon as this request returns.
document = null;
}

return resultExtraction;
Expand All @@ -1596,8 +1665,6 @@ public String processXML(File file) throws Exception {
this.datastetConfiguration.getDatastetConfiguration().getPub2TEIPath());
//System.out.println(newFilePath);

DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
factory.setNamespaceAware(true);
tei = FileUtils.readFileToString(new File(newFilePath), UTF_8);

} catch (final Exception exp) {
Expand All @@ -1621,11 +1688,10 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(String doc
boolean disambiguate) {

Pair<List<List<Dataset>>, List<BibDataSet>> tei = null;
org.w3c.dom.Document document = null;
try (StringReader reader = new StringReader(documentAsString);){
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
factory.setNamespaceAware(true);
DocumentBuilder builder = factory.newDocumentBuilder();
org.w3c.dom.Document document = builder.parse(new InputSource(reader));
DocumentBuilder builder = newDocumentBuilder();
document = builder.parse(new InputSource(reader));
//document.getDocumentElement().normalize();
org.w3c.dom.Element root = document.getDocumentElement();

Expand All @@ -1638,6 +1704,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(String doc
tei = processTEIDocument(document, disambiguate);
} catch (ParserConfigurationException | IOException | SAXException e) {
e.printStackTrace();
} finally {
// Release the DOM tree eagerly.
document = null;
}
return tei;

Expand All @@ -1656,7 +1725,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
//Extract relevant section from the TEI
// Title, abstract, keywords

XPath xPath = XPathFactory.newInstance().newXPath();
XPath xPath = newXPath();

try {
org.w3c.dom.Node titleNode = (org.w3c.dom.Node) xPath.evaluate(
Expand Down
Loading
Loading