From eb832fb844f358596a76906678344b53f1eabc97 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 16 Apr 2026 04:46:43 +0000
Subject: [PATCH 1/2] Fix memory leak in TEI processing: cache JAXP factories,
 close streams

PR #7 fixed a leak in the PDF path by closing DocumentSource; TEI has no
DocumentSource, but heap and file-descriptor usage still grew under
sustained /processDatasetTEI load. Root causes:

1. JAXP factory churn. DocumentBuilderFactory.newInstance(),
   XPathFactory.newInstance(), TransformerFactory.newInstance(), and
   SAXParserFactory.newInstance() ran on every request (and, in
   XMLUtilities.segment(), on every sentence). Each call re-runs
   ServiceLoader discovery and produces factories whose classloader-
   backed caches are not reclaimed promptly.
2. DocumentBuilder.parse(File) defers FileInputStream closure to
   Xerces, accumulating FDs under sustained load.
3. The parsed DOM Document was left as a local reference until
   method return, delaying young-gen reclaim of a large node graph.

Fix:
- Cache factories as private static finals in DatasetParser and
  XMLUtilities, with synchronized accessors (newDocumentBuilder /
  newBuilder, newXPath, newSAXParser, newTransformer). Factories'
  new*() methods are not guaranteed thread-safe, so the accessors
  serialize them. The lock is held only for the new*() call, so
  contention is negligible relative to XML parse cost, and unlike
  per-thread caching this cannot leak ThreadLocals in the Dropwizard
  thread pool.
- Parse TEI via try-with-resources FileInputStream so the handle is
  released deterministically.
- Null the parsed Document reference in finally blocks to aid GC.
- Remove dead DocumentBuilderFactory allocation in processXML(File).

No PDF-path changes; PR #7's DocumentSource.close() is preserved.
---
 .../grobid/core/engines/DatasetParser.java    | 67 +++++++++++++----
 .../grobid/core/utilities/XMLUtilities.java   | 73 ++++++++++++-------
 2 files changed, 97 insertions(+), 43 deletions(-)
diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java
index 59f5333..251056f 100644
--- a/src/main/java/org/grobid/core/engines/DatasetParser.java
+++ b/src/main/java/org/grobid/core/engines/DatasetParser.java
@@ -56,7 +56,9 @@
 import javax.xml.xpath.XPathExpressionException;
 import javax.xml.xpath.XPathFactory;
 import java.io.File;
+import java.io.FileInputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.StringReader;
 import java.util.*;
 import java.util.stream.Collectors;
@@ -93,6 +95,26 @@ public class DatasetParser extends AbstractParser {
     private DatasetContextClassifier datasetContextClassifier;
     private DatasetDisambiguator disambiguator;
 
+    // Cached JAXP factories to avoid ServiceLoader churn and classloader-cache
+    // accumulation across TEI/XML requests. Factories are not thread-safe for
+    // new*() calls, hence the synchronized helpers below.
+    private static final DocumentBuilderFactory DOC_BUILDER_FACTORY;
+    private static final XPathFactory XPATH_FACTORY = XPathFactory.newInstance();
+
+    static {
+        DOC_BUILDER_FACTORY = DocumentBuilderFactory.newInstance();
+        DOC_BUILDER_FACTORY.setNamespaceAware(true);
+    }
+
+    private static synchronized DocumentBuilder newDocumentBuilder()
+            throws ParserConfigurationException {
+        return DOC_BUILDER_FACTORY.newDocumentBuilder();
+    }
+
+    private static synchronized XPath newXPath() {
+        return XPATH_FACTORY.newXPath();
+    }
+
     private static void warnGluttonNotConfiguredOnce() {
         if (gluttonWarningLogged.compareAndSet(false, true)) {
             LOGGER.warn("Glutton host not configured, bibliographical reference consolidation will be skipped");
@@ -1532,14 +1554,15 @@ public List<List<Dataset>> markDAS(List<List<Dataset>> entities, List<LayoutToke
 
     public Pair<List<List<Dataset>>, List<BibDataSet>> processXML(File file, boolean segmentSentences, boolean disambiguate) throws IOException {
         Pair<List<List<Dataset>>, List<BibDataSet>> resultExtraction = null;
+        org.w3c.dom.Document document = null;
         try {
             String tei = processXML(file);
-            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
-            factory.setNamespaceAware(true);
-            DocumentBuilder builder = factory.newDocumentBuilder();
+            DocumentBuilder builder = newDocumentBuilder();
             //tei = avoidDomParserAttributeBug(tei);
 
-            org.w3c.dom.Document document = builder.parse(new InputSource(new StringReader(tei)));
+            try (StringReader reader = new StringReader(tei)) {
+                document = builder.parse(new InputSource(reader));
+            }
             //document.getDocumentElement().normalize();
 
             // TODO: call pub2TEI with sentence segmentation
@@ -1549,17 +1572,25 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processXML(File file, boolean
         } catch (final Exception exp) {
             LOGGER.error("An error occured while processing the following XML file: "
                     + file.getPath(), exp);
+        } finally {
+            // Release the DOM tree eagerly so the (large) node graph is collectible
+            // as soon as this request returns.
+            document = null;
         }
         return resultExtraction;
     }
 
     public Pair<List<List<Dataset>>, List<BibDataSet>> processTEI(File file, boolean segmentSentences, boolean disambiguate) throws IOException {
         Pair<List<List<Dataset>>, List<BibDataSet>> resultExtraction = null;
+        org.w3c.dom.Document document = null;
         try {
-            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
-            factory.setNamespaceAware(true);
-            DocumentBuilder builder = factory.newDocumentBuilder();
-            org.w3c.dom.Document document = builder.parse(file);
+            DocumentBuilder builder = newDocumentBuilder();
+            // Parse via an explicitly managed InputStream so the file handle is
+            // released deterministically (DocumentBuilder.parse(File) defers stream
+            // closure to Xerces internals, which accumulates FDs under load).
+            try (InputStream is = new FileInputStream(file)) {
+                document = builder.parse(new InputSource(is));
+            }
             org.w3c.dom.Element root = document.getDocumentElement();
             boolean hasSegmentation = hasTEISentenceSegmentation(root);
 
@@ -1568,11 +1599,15 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEI(File file, boolean
             }
 
             resultExtraction = processTEIDocument(document, disambiguate);
-            //tei = restoreDomParserAttributeBug(tei); 
+            //tei = restoreDomParserAttributeBug(tei);
 
         } catch (final Exception exp) {
             LOGGER.error("An error occured while processing the following XML file: "
                     + file.getPath(), exp);
+        } finally {
+            // Release the DOM tree eagerly so the (large) node graph is collectible
+            // as soon as this request returns.
+            document = null;
         }
 
         return resultExtraction;
@@ -1596,8 +1631,6 @@ public String processXML(File file) throws Exception {
                     this.datastetConfiguration.getDatastetConfiguration().getPub2TEIPath());
             //System.out.println(newFilePath);
 
-            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
-            factory.setNamespaceAware(true);
             tei = FileUtils.readFileToString(new File(newFilePath), UTF_8);
 
         } catch (final Exception exp) {
@@ -1621,11 +1654,10 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(String doc
                                                                           boolean disambiguate) {
 
         Pair<List<List<Dataset>>, List<BibDataSet>> tei = null;
+        org.w3c.dom.Document document = null;
         try (StringReader reader = new StringReader(documentAsString);){
-            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
-            factory.setNamespaceAware(true);
-            DocumentBuilder builder = factory.newDocumentBuilder();
-            org.w3c.dom.Document document = builder.parse(new InputSource(reader));
+            DocumentBuilder builder = newDocumentBuilder();
+            document = builder.parse(new InputSource(reader));
             //document.getDocumentElement().normalize();
             org.w3c.dom.Element root = document.getDocumentElement();
 
@@ -1638,6 +1670,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(String doc
             tei = processTEIDocument(document, disambiguate);
         } catch (ParserConfigurationException | IOException | SAXException e) {
             e.printStackTrace();
+        } finally {
+            // Release the DOM tree eagerly.
+            document = null;
         }
         return tei;
 
@@ -1656,7 +1691,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
         //Extract relevant section from the TEI
         // Title, abstract, keywords
 
-        XPath xPath = XPathFactory.newInstance().newXPath();
+        XPath xPath = newXPath();
 
         try {
             org.w3c.dom.Node titleNode = (org.w3c.dom.Node) xPath.evaluate(
diff --git a/src/main/java/org/grobid/core/utilities/XMLUtilities.java b/src/main/java/org/grobid/core/utilities/XMLUtilities.java
index 634a231..5055fdd 100644
--- a/src/main/java/org/grobid/core/utilities/XMLUtilities.java
+++ b/src/main/java/org/grobid/core/utilities/XMLUtilities.java
@@ -14,11 +14,14 @@
 import org.w3c.dom.Text;
 import org.xml.sax.InputSource;
 
+import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.SAXParserFactory;
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerConfigurationException;
 import javax.xml.transform.TransformerException;
 import javax.xml.transform.TransformerFactory;
 import javax.xml.transform.dom.DOMSource;
@@ -44,16 +47,46 @@ public class XMLUtilities {
     public static final String URL_TYPE = "url";
     private static final String URI_TYPE = "uri";
 
+    // Cached JAXP factories. JAXP factories are not guaranteed thread-safe for
+    // their new*() methods; synchronized accessors below produce per-call
+    // builders/parsers/transformers that are themselves single-thread only.
+    // Caching avoids repeated ServiceLoader discovery which, under sustained
+    // TEI load, left classloader-backed references accumulating on the heap.
+    private static final DocumentBuilderFactory DBF;
+    private static final SAXParserFactory SPF = SAXParserFactory.newInstance();
+    private static final XPathFactory XPF = XPathFactory.newInstance();
+    private static final TransformerFactory TF = TransformerFactory.newInstance();
+
+    static {
+        DBF = DocumentBuilderFactory.newInstance();
+        DBF.setNamespaceAware(true);
+    }
+
+    private static synchronized DocumentBuilder newBuilder() throws ParserConfigurationException {
+        return DBF.newDocumentBuilder();
+    }
+
+    private static synchronized SAXParser newSAXParser() throws Exception {
+        return SPF.newSAXParser();
+    }
+
+    private static synchronized XPath newXPath() {
+        return XPF.newXPath();
+    }
+
+    private static synchronized Transformer newTransformer() throws TransformerConfigurationException {
+        return TF.newTransformer();
+    }
+
     public static String toPrettyString(String xml, int indent) {
         try (ByteArrayInputStream inputStream = new ByteArrayInputStream(xml.getBytes("utf-8"))) {
             // Turn xml string into a document
-            org.w3c.dom.Document document = DocumentBuilderFactory.newInstance()
-                    .newDocumentBuilder()
+            org.w3c.dom.Document document = newBuilder()
                     .parse(new InputSource(inputStream));
 
             // Remove whitespaces outside tags
             document.normalize();
-            XPath xPath = XPathFactory.newInstance().newXPath();
+            XPath xPath = newXPath();
             org.w3c.dom.NodeList nodeList = (org.w3c.dom.NodeList) xPath.evaluate("//text()[normalize-space()='']",
                     document,
                     XPathConstants.NODESET);
@@ -64,8 +97,7 @@ public static String toPrettyString(String xml, int indent) {
             }
 
             // Setup pretty print options
-            TransformerFactory transformerFactory = TransformerFactory.newInstance();
-            Transformer transformer = transformerFactory.newTransformer();
+            Transformer transformer = newTransformer();
             transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2");
             transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
             transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
@@ -117,8 +149,7 @@ public static BiblioItem parseTEIBiblioItem(org.w3c.dom.Document doc, org.w3c.do
         BiblStructSaxHandler handler = new BiblStructSaxHandler();
         String teiXML = null;
         try {
-            SAXParserFactory spf = SAXParserFactory.newInstance();
-            SAXParser p = spf.newSAXParser();
+            SAXParser p = newSAXParser();
             teiXML = serialize(doc, biblStructElement);
             try (StringReader reader = new StringReader(teiXML)) {
                 p.parse(new InputSource(reader), handler);
@@ -267,9 +298,8 @@ public static String serialize(org.w3c.dom.Document doc, Node node) {
         try {
             Object evalContext = (node != null) ? node : doc;
             if (evalContext != null) {
-                XPathFactory xpathFactory = XPathFactory.newInstance();
                 // XPath to find empty text nodes.
-                XPathExpression xpathExp = xpathFactory.newXPath().compile(
+                XPathExpression xpathExp = newXPath().compile(
                         "//text()[normalize-space(.) = '']");
                 NodeList emptyTextNodes = (NodeList)
                         xpathExp.evaluate(evalContext, XPathConstants.NODESET);
@@ -293,8 +323,7 @@ public static String serialize(org.w3c.dom.Document doc, Node node) {
 
         try (StringWriter writer = new StringWriter()) {
             StreamResult result = new StreamResult(writer);
-            TransformerFactory tf = TransformerFactory.newInstance();
-            Transformer transformer = tf.newTransformer();
+            Transformer transformer = newTransformer();
             transformer.setOutputProperty(OutputKeys.METHOD, "xml");
             transformer.setOutputProperty(OutputKeys.INDENT, "yes");
             transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
@@ -321,13 +350,11 @@ public static void cleanXMLCorpus(String documentPath) throws Exception {
         File outputFile = new File(documentPath.replace(".tei.xml", ".clean.tei.xml"));
 
         // we use a DOM parser
-        org.w3c.dom.Document document = DocumentBuilderFactory.newInstance()
-                .newDocumentBuilder()
-                .parse(documentFile);
+        org.w3c.dom.Document document = newBuilder().parse(documentFile);
 
         // remove tei entries with empty body
         document.normalize();
-        XPath xPath = XPathFactory.newInstance().newXPath();
+        XPath xPath = newXPath();
         org.w3c.dom.NodeList nodeList = (org.w3c.dom.NodeList) xPath.evaluate("//tei/text/body",
                 document,
                 XPathConstants.NODESET);
@@ -375,8 +402,7 @@ public static void cleanXMLCorpus(String documentPath) throws Exception {
         }
 
         // Setup pretty print options
-        TransformerFactory transformerFactory = TransformerFactory.newInstance();
-        Transformer transformer = transformerFactory.newTransformer();
+        Transformer transformer = newTransformer();
         transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2");
         transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
         transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
@@ -390,9 +416,7 @@ public static void cleanXMLCorpus(String documentPath) throws Exception {
 
             // check again if everything is well-formed after the changes
             try (ByteArrayInputStream inputStream = new ByteArrayInputStream(stringWriter.toString().getBytes(StandardCharsets.UTF_8))) {
-                document = DocumentBuilderFactory.newInstance()
-                        .newDocumentBuilder()
-                        .parse(new InputSource(inputStream));
+                document = newBuilder().parse(new InputSource(inputStream));
             } catch (Exception e) {
                 System.out.println("Problem with the final TEI XML");
                 e.printStackTrace();
@@ -511,10 +535,7 @@ public static void segment(org.w3c.dom.Document doc, Node node) {
                     String fullSent = "<s>" + newSent + "</s>";
                     boolean fail = false;
                     try (StringReader reader = new StringReader(fullSent)) {
-                        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
-                        factory.setNamespaceAware(true);
-
-                        org.w3c.dom.Document d = factory.newDocumentBuilder().parse(new InputSource(reader));
+                        org.w3c.dom.Document d = newBuilder().parse(new InputSource(reader));
                     } catch (Exception e) {
                         fail = true;
                     }
@@ -540,9 +561,7 @@ public static void segment(org.w3c.dom.Document doc, Node node) {
                     //System.out.println(sent);  
 
                     try (StringReader reader = new StringReader(sent)) {
-                        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
-                        factory.setNamespaceAware(true);
-                        org.w3c.dom.Document d = factory.newDocumentBuilder().parse(new InputSource(reader));
+                        org.w3c.dom.Document d = newBuilder().parse(new InputSource(reader));
                         //d.getDocumentElement().normalize();
                         Node newNode = doc.importNode(d.getDocumentElement(), true);
                         newNodes.add(newNode);

From c04555d0042f466271f5a02fa6853db18eab7612 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 16 Apr 2026 05:14:23 +0000
Subject: [PATCH 2/2] Harden cached JAXP factories and preserve systemId on TEI
 parse

Addresses Copilot review feedback on the previous commit:

1. XXE/SSRF hardening. Because the cached DocumentBuilderFactory /
   SAXParserFactory / TransformerFactory parse user-supplied XML/TEI,
   the static initializers now apply OWASP-recommended hardening:
   - FEATURE_SECURE_PROCESSING
   - disallow-doctype-decl
   - external-general-entities = false
   - external-parameter-entities = false
   - nonvalidating/load-external-dtd = false
   - setXIncludeAware(false), setExpandEntityReferences(false)
   - TransformerFactory: ACCESS_EXTERNAL_DTD / ACCESS_EXTERNAL_STYLESHEET
     pinned to empty string
   Each feature/attribute is set in its own try/catch so unsupported
   options on a given JAXP implementation do not break class init.

2. Preserve systemId. DocumentBuilder.parse(File) previously set the
   document systemId to the file URI, which matters for relative
   references and error locations. Restore it by calling
   setSystemId(file.toURI().toString()) on the InputSource passed to
   the parser in DatasetParser.processTEI, while keeping the
   try-with-resources FileInputStream for deterministic stream closure.
---
 .../grobid/core/engines/DatasetParser.java    | 38 +++++++++-
 .../grobid/core/utilities/XMLUtilities.java   | 71 ++++++++++++++++++-
 2 files changed, 105 insertions(+), 4 deletions(-)

diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java
index 251056f..b04e5ad 100644
--- a/src/main/java/org/grobid/core/engines/DatasetParser.java
+++ b/src/main/java/org/grobid/core/engines/DatasetParser.java
@@ -48,6 +48,7 @@
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 
+import javax.xml.XMLConstants;
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.ParserConfigurationException;
@@ -97,13 +98,42 @@ public class DatasetParser extends AbstractParser {
 
     // Cached JAXP factories to avoid ServiceLoader churn and classloader-cache
     // accumulation across TEI/XML requests. Factories are not thread-safe for
-    // new*() calls, hence the synchronized helpers below.
+    // new*() calls, hence the synchronized helpers below. The factory is also
+    // hardened against XXE/SSRF since it parses user-supplied XML/TEI.
     private static final DocumentBuilderFactory DOC_BUILDER_FACTORY;
     private static final XPathFactory XPATH_FACTORY = XPathFactory.newInstance();
 
     static {
         DOC_BUILDER_FACTORY = DocumentBuilderFactory.newInstance();
         DOC_BUILDER_FACTORY.setNamespaceAware(true);
+        hardenDocumentBuilderFactory(DOC_BUILDER_FACTORY);
+    }
+
+    /**
+     * Apply a conservative XXE/SSRF hardening to a shared DocumentBuilderFactory.
+     * Each feature is set in its own try/catch so that unsupported features on a
+     * given JAXP implementation do not prevent class initialisation.
+     */
+    private static void hardenDocumentBuilderFactory(DocumentBuilderFactory factory) {
+        trySetFeature(factory, XMLConstants.FEATURE_SECURE_PROCESSING, true);
+        trySetFeature(factory, "http://apache.org/xml/features/disallow-doctype-decl", true);
+        trySetFeature(factory, "http://xml.org/sax/features/external-general-entities", false);
+        trySetFeature(factory, "http://xml.org/sax/features/external-parameter-entities", false);
+        trySetFeature(factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
+        try {
+            factory.setXIncludeAware(false);
+        } catch (UnsupportedOperationException | AbstractMethodError ignore) {
+            // older JAXP impls
+        }
+        factory.setExpandEntityReferences(false);
+    }
+
+    private static void trySetFeature(DocumentBuilderFactory factory, String feature, boolean value) {
+        try {
+            factory.setFeature(feature, value);
+        } catch (ParserConfigurationException e) {
+            LOGGER.debug("Unsupported DocumentBuilderFactory feature: {}", feature);
+        }
     }
 
     private static synchronized DocumentBuilder newDocumentBuilder()
@@ -1588,8 +1618,12 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEI(File file, boolean
             // Parse via an explicitly managed InputStream so the file handle is
             // released deterministically (DocumentBuilder.parse(File) defers stream
             // closure to Xerces internals, which accumulates FDs under load).
+            // Set systemId so error locations (and any relative-URI resolution
+            // the hardened factory still permits) match the old parse(File) path.
             try (InputStream is = new FileInputStream(file)) {
-                document = builder.parse(new InputSource(is));
+                InputSource inputSource = new InputSource(is);
+                inputSource.setSystemId(file.toURI().toString());
+                document = builder.parse(inputSource);
             }
             org.w3c.dom.Element root = document.getDocumentElement();
             boolean hasSegmentation = hasTEISentenceSegmentation(root);
diff --git a/src/main/java/org/grobid/core/utilities/XMLUtilities.java b/src/main/java/org/grobid/core/utilities/XMLUtilities.java
index 5055fdd..2f96875 100644
--- a/src/main/java/org/grobid/core/utilities/XMLUtilities.java
+++ b/src/main/java/org/grobid/core/utilities/XMLUtilities.java
@@ -14,6 +14,7 @@
 import org.w3c.dom.Text;
 import org.xml.sax.InputSource;
 
+import javax.xml.XMLConstants;
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.ParserConfigurationException;
@@ -52,14 +53,80 @@ public class XMLUtilities {
     // builders/parsers/transformers that are themselves single-thread only.
     // Caching avoids repeated ServiceLoader discovery which, under sustained
     // TEI load, left classloader-backed references accumulating on the heap.
+    // Each factory is also hardened against XXE/SSRF since callers parse
+    // user-supplied XML/TEI; features are set defensively so unsupported
+    // options on a given JAXP implementation do not break class init.
     private static final DocumentBuilderFactory DBF;
-    private static final SAXParserFactory SPF = SAXParserFactory.newInstance();
+    private static final SAXParserFactory SPF;
     private static final XPathFactory XPF = XPathFactory.newInstance();
-    private static final TransformerFactory TF = TransformerFactory.newInstance();
+    private static final TransformerFactory TF;
 
     static {
         DBF = DocumentBuilderFactory.newInstance();
         DBF.setNamespaceAware(true);
+        hardenDocumentBuilderFactory(DBF);
+
+        SPF = SAXParserFactory.newInstance();
+        hardenSAXParserFactory(SPF);
+
+        TF = TransformerFactory.newInstance();
+        hardenTransformerFactory(TF);
+    }
+
+    private static void hardenDocumentBuilderFactory(DocumentBuilderFactory factory) {
+        trySetDBFeature(factory, XMLConstants.FEATURE_SECURE_PROCESSING, true);
+        trySetDBFeature(factory, "http://apache.org/xml/features/disallow-doctype-decl", true);
+        trySetDBFeature(factory, "http://xml.org/sax/features/external-general-entities", false);
+        trySetDBFeature(factory, "http://xml.org/sax/features/external-parameter-entities", false);
+        trySetDBFeature(factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
+        try {
+            factory.setXIncludeAware(false);
+        } catch (UnsupportedOperationException | AbstractMethodError ignore) {
+            // older JAXP impls
+        }
+        factory.setExpandEntityReferences(false);
+    }
+
+    private static void trySetDBFeature(DocumentBuilderFactory factory, String feature, boolean value) {
+        try {
+            factory.setFeature(feature, value);
+        } catch (ParserConfigurationException e) {
+            LOGGER.debug("Unsupported DocumentBuilderFactory feature: {}", feature);
+        }
+    }
+
+    private static void hardenSAXParserFactory(SAXParserFactory factory) {
+        trySetSPFeature(factory, XMLConstants.FEATURE_SECURE_PROCESSING, true);
+        trySetSPFeature(factory, "http://apache.org/xml/features/disallow-doctype-decl", true);
+        trySetSPFeature(factory, "http://xml.org/sax/features/external-general-entities", false);
+        trySetSPFeature(factory, "http://xml.org/sax/features/external-parameter-entities", false);
+        trySetSPFeature(factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
+    }
+
+    private static void trySetSPFeature(SAXParserFactory factory, String feature, boolean value) {
+        try {
+            factory.setFeature(feature, value);
+        } catch (Exception e) {
+            LOGGER.debug("Unsupported SAXParserFactory feature: {}", feature);
+        }
+    }
+
+    private static void hardenTransformerFactory(TransformerFactory factory) {
+        try {
+            factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
+        } catch (TransformerConfigurationException e) {
+            LOGGER.debug("Unsupported TransformerFactory feature: FEATURE_SECURE_PROCESSING");
+        }
+        trySetTFAttribute(factory, XMLConstants.ACCESS_EXTERNAL_DTD, "");
+        trySetTFAttribute(factory, XMLConstants.ACCESS_EXTERNAL_STYLESHEET, "");
+    }
+
+    private static void trySetTFAttribute(TransformerFactory factory, String attribute, Object value) {
+        try {
+            factory.setAttribute(attribute, value);
+        } catch (IllegalArgumentException e) {
+            LOGGER.debug("Unsupported TransformerFactory attribute: {}", attribute);
+        }
     }
 
     private static synchronized DocumentBuilder newBuilder() throws ParserConfigurationException {