Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 85 additions & 16 deletions src/main/java/org/grobid/core/engines/DatasetParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
Expand All @@ -56,7 +57,9 @@
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.*;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -93,6 +96,55 @@ public class DatasetParser extends AbstractParser {
private DatasetContextClassifier datasetContextClassifier;
private DatasetDisambiguator disambiguator;

// Cached JAXP factories to avoid ServiceLoader churn and classloader-cache
// accumulation across TEI/XML requests. Factories are not thread-safe for
// new*() calls, hence the synchronized helpers below. The factory is also
// hardened against XXE/SSRF since it parses user-supplied XML/TEI.
// NOTE: DOC_BUILDER_FACTORY is assigned in the static block (not inline) so
// that hardening happens exactly once, at class initialisation time.
private static final DocumentBuilderFactory DOC_BUILDER_FACTORY;
private static final XPathFactory XPATH_FACTORY = XPathFactory.newInstance();

static {
DOC_BUILDER_FACTORY = DocumentBuilderFactory.newInstance();
// Namespace-aware parsing, matching the per-call factories this cache replaces.
DOC_BUILDER_FACTORY.setNamespaceAware(true);
hardenDocumentBuilderFactory(DOC_BUILDER_FACTORY);
}

/**
 * Harden a shared DocumentBuilderFactory against XXE/SSRF before it is used
 * to parse user-supplied XML/TEI. Every feature toggle is applied
 * individually (via trySetFeature) so that a JAXP implementation lacking one
 * of them cannot prevent class initialisation.
 */
private static void hardenDocumentBuilderFactory(DocumentBuilderFactory factory) {
    // Features that must be switched ON for secure processing.
    final String[] enabledFeatures = {
            XMLConstants.FEATURE_SECURE_PROCESSING,
            "http://apache.org/xml/features/disallow-doctype-decl"
    };
    // Features that must be switched OFF to block external entity resolution.
    final String[] disabledFeatures = {
            "http://xml.org/sax/features/external-general-entities",
            "http://xml.org/sax/features/external-parameter-entities",
            "http://apache.org/xml/features/nonvalidating/load-external-dtd"
    };
    for (String feature : enabledFeatures) {
        trySetFeature(factory, feature, true);
    }
    for (String feature : disabledFeatures) {
        trySetFeature(factory, feature, false);
    }
    try {
        factory.setXIncludeAware(false);
    } catch (UnsupportedOperationException | AbstractMethodError ignored) {
        // Older JAXP implementations may not support toggling XInclude.
    }
    factory.setExpandEntityReferences(false);
}

/**
 * Best-effort feature toggle: attempts to set the given feature on the
 * factory, logging at debug level instead of propagating when the
 * underlying JAXP implementation does not recognise the feature.
 */
private static void trySetFeature(DocumentBuilderFactory factory, String feature, boolean value) {
    try {
        factory.setFeature(feature, value);
    } catch (ParserConfigurationException unsupported) {
        // Not fatal: the remaining hardening features are still applied.
        LOGGER.debug("Unsupported DocumentBuilderFactory feature: {}", feature);
    }
}
Comment thread
lfoppiano marked this conversation as resolved.

/**
 * Creates a DocumentBuilder from the shared, hardened factory.
 * Synchronized because the factory instance is shared across requests and
 * DocumentBuilderFactory makes no thread-safety guarantee for
 * newDocumentBuilder() calls (see the note on the cached factories above
 * their declaration).
 *
 * @throws ParserConfigurationException if the builder cannot be created
 */
private static synchronized DocumentBuilder newDocumentBuilder()
throws ParserConfigurationException {
return DOC_BUILDER_FACTORY.newDocumentBuilder();
}

/**
 * Creates an XPath instance from the shared cached factory.
 * Synchronized because the factory instance is shared and XPathFactory
 * makes no thread-safety guarantee for newXPath() calls.
 */
private static synchronized XPath newXPath() {
return XPATH_FACTORY.newXPath();
}

private static void warnGluttonNotConfiguredOnce() {
if (gluttonWarningLogged.compareAndSet(false, true)) {
LOGGER.warn("Glutton host not configured, bibliographical reference consolidation will be skipped");
Expand Down Expand Up @@ -1532,14 +1584,15 @@ public List<List<Dataset>> markDAS(List<List<Dataset>> entities, List<LayoutToke

public Pair<List<List<Dataset>>, List<BibDataSet>> processXML(File file, boolean segmentSentences, boolean disambiguate) throws IOException {
Pair<List<List<Dataset>>, List<BibDataSet>> resultExtraction = null;
org.w3c.dom.Document document = null;
try {
String tei = processXML(file);
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
factory.setNamespaceAware(true);
DocumentBuilder builder = factory.newDocumentBuilder();
DocumentBuilder builder = newDocumentBuilder();
//tei = avoidDomParserAttributeBug(tei);

org.w3c.dom.Document document = builder.parse(new InputSource(new StringReader(tei)));
try (StringReader reader = new StringReader(tei)) {
document = builder.parse(new InputSource(reader));
}
//document.getDocumentElement().normalize();

// TODO: call pub2TEI with sentence segmentation
Expand All @@ -1549,17 +1602,29 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processXML(File file, boolean
} catch (final Exception exp) {
LOGGER.error("An error occured while processing the following XML file: "
+ file.getPath(), exp);
} finally {
// Release the DOM tree eagerly so the (large) node graph is collectible
// as soon as this request returns.
document = null;
}
return resultExtraction;
}

public Pair<List<List<Dataset>>, List<BibDataSet>> processTEI(File file, boolean segmentSentences, boolean disambiguate) throws IOException {
Pair<List<List<Dataset>>, List<BibDataSet>> resultExtraction = null;
org.w3c.dom.Document document = null;
try {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
factory.setNamespaceAware(true);
DocumentBuilder builder = factory.newDocumentBuilder();
org.w3c.dom.Document document = builder.parse(file);
DocumentBuilder builder = newDocumentBuilder();
// Parse via an explicitly managed InputStream so the file handle is
// released deterministically (DocumentBuilder.parse(File) defers stream
// closure to Xerces internals, which accumulates FDs under load).
// Set systemId so relative references (DTD/entities/XInclude) and
// error locations behave as they did with parse(File).
try (InputStream is = new FileInputStream(file)) {
InputSource inputSource = new InputSource(is);
inputSource.setSystemId(file.toURI().toString());
document = builder.parse(inputSource);
}
org.w3c.dom.Element root = document.getDocumentElement();
boolean hasSegmentation = hasTEISentenceSegmentation(root);

Expand All @@ -1568,11 +1633,15 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEI(File file, boolean
}

resultExtraction = processTEIDocument(document, disambiguate);
//tei = restoreDomParserAttributeBug(tei);
//tei = restoreDomParserAttributeBug(tei);

} catch (final Exception exp) {
LOGGER.error("An error occured while processing the following XML file: "
+ file.getPath(), exp);
} finally {
// Release the DOM tree eagerly so the (large) node graph is collectible
// as soon as this request returns.
document = null;
}

return resultExtraction;
Expand All @@ -1596,8 +1665,6 @@ public String processXML(File file) throws Exception {
this.datastetConfiguration.getDatastetConfiguration().getPub2TEIPath());
//System.out.println(newFilePath);

DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
factory.setNamespaceAware(true);
tei = FileUtils.readFileToString(new File(newFilePath), UTF_8);

} catch (final Exception exp) {
Expand All @@ -1621,11 +1688,10 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(String doc
boolean disambiguate) {

Pair<List<List<Dataset>>, List<BibDataSet>> tei = null;
org.w3c.dom.Document document = null;
try (StringReader reader = new StringReader(documentAsString);){
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
factory.setNamespaceAware(true);
DocumentBuilder builder = factory.newDocumentBuilder();
org.w3c.dom.Document document = builder.parse(new InputSource(reader));
DocumentBuilder builder = newDocumentBuilder();
document = builder.parse(new InputSource(reader));
//document.getDocumentElement().normalize();
org.w3c.dom.Element root = document.getDocumentElement();

Expand All @@ -1638,6 +1704,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(String doc
tei = processTEIDocument(document, disambiguate);
} catch (ParserConfigurationException | IOException | SAXException e) {
e.printStackTrace();
} finally {
// Release the DOM tree eagerly.
document = null;
}
return tei;

Expand All @@ -1656,7 +1725,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
//Extract relevant section from the TEI
// Title, abstract, keywords

XPath xPath = XPathFactory.newInstance().newXPath();
XPath xPath = newXPath();

try {
org.w3c.dom.Node titleNode = (org.w3c.dom.Node) xPath.evaluate(
Expand Down
Loading
Loading