From b96215f592b1f55ce9a6fec5d659888aba28ecc5 Mon Sep 17 00:00:00 2001 From: "AD\\aabdoun" Date: Thu, 14 Aug 2025 10:46:12 +0200 Subject: [PATCH 1/8] correction parser for the N-Triples format --- src/main/antlr/NTriples.g4 | 7 +- .../parser/ntriples/ANTLRNTriplesParser.java | 64 +++++++++++++++++-- .../io/parser/ntriples/NTriplesListener.java | 38 ++++++++++- 3 files changed, 98 insertions(+), 11 deletions(-) diff --git a/src/main/antlr/NTriples.g4 b/src/main/antlr/NTriples.g4 index 783d53ce0..db639ccb2 100644 --- a/src/main/antlr/NTriples.g4 +++ b/src/main/antlr/NTriples.g4 @@ -11,6 +11,11 @@ ntriplesDoc : triple? (EOL* triple)* EOL* ; +directive + : {notifyErrorListeners("Directives are not allowed in N-Triples");} + ('@prefix' | '@base') + ; + triple : subject predicate object '.' ; @@ -92,7 +97,7 @@ PN_CHARS_U // PN_CHARS_BASE | '_' | ':' : PN_CHARS_BASE | '_' -// | ':' + | ':' ; PN_CHARS diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/parser/ntriples/ANTLRNTriplesParser.java b/src/main/java/fr/inria/corese/core/next/impl/io/parser/ntriples/ANTLRNTriplesParser.java index 75370cf8f..0caa3c77e 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/parser/ntriples/ANTLRNTriplesParser.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/parser/ntriples/ANTLRNTriplesParser.java @@ -12,8 +12,13 @@ import org.antlr.v4.runtime.CharStreams; import org.antlr.v4.runtime.CommonTokenStream; import org.antlr.v4.runtime.tree.ParseTree; -import org.antlr.v4.runtime.tree.ParseTreeListener; import org.antlr.v4.runtime.tree.ParseTreeWalker; +import org.antlr.v4.runtime.ConsoleErrorListener; +import org.antlr.v4.runtime.misc.ParseCancellationException; +import org.antlr.v4.runtime.BaseErrorListener; +import org.antlr.v4.runtime.RecognitionException; +import org.antlr.v4.runtime.Recognizer; +import org.antlr.v4.runtime.BailErrorStrategy; import java.io.IOException; import java.io.InputStream; @@ -22,7 +27,7 @@ import java.nio.charset.StandardCharsets; /** 
- * An ANTLR4-based parser for N-Triples format. + * An ANTLR4-based parser for the N-Triples format. * This parser uses an ANTLR grammar to tokenize and parse N-Triples documents, * then a listener to build the RDF model. */ @@ -32,7 +37,7 @@ public class ANTLRNTriplesParser extends AbstractRDFParser { * Constructor for the ANTLRNTriplesParser. * * @param model The RDF model to populate. - * @param factory The ValueFactory for creating RDF resources. + * @param factory The value factory for creating RDF resources. */ public ANTLRNTriplesParser(Model model, ValueFactory factory) { super(model, factory); @@ -42,7 +47,7 @@ public ANTLRNTriplesParser(Model model, ValueFactory factory) { * Constructor for the ANTLRNTriplesParser with configuration options. * * @param model The RDF model to populate. - * @param factory The ValueFactory for creating RDF resources. + * @param factory The value factory for creating RDF resources. * @param config The configuration options for parsing. */ public ANTLRNTriplesParser(Model model, ValueFactory factory, IOOptions config) { @@ -81,22 +86,67 @@ public void parse(Reader reader) throws ParsingErrorException { public void parse(Reader reader, String baseURI) throws ParsingErrorException { try { CharStream charStream = CharStreams.fromReader(reader); + String input = charStream.toString(); + if (input.contains("@prefix")) { + throw new ParsingErrorException("@prefix directives are not allowed in N-Triples"); + } + if (input.contains("@base")) { + throw new ParsingErrorException("@base directives are not allowed in N-Triples"); + } + charStream = CharStreams.fromString(input); NTriplesLexer lexer = new NTriplesLexer(charStream); + + lexer.removeErrorListener(ConsoleErrorListener.INSTANCE); + lexer.addErrorListener(new NTriplesErrorListener()); + CommonTokenStream tokens = new CommonTokenStream(lexer); NTriplesParser antlrParser = new NTriplesParser(tokens); + + antlrParser.removeErrorListener(ConsoleErrorListener.INSTANCE); + 
antlrParser.setErrorHandler(new BailErrorStrategy()); + antlrParser.addErrorListener(new NTriplesErrorListener()); + ParseTreeWalker walker = new ParseTreeWalker(); ParseTree tree = antlrParser.ntriplesDoc(); - NTriplesListener listener = new NTriplesListener(getModel(), getValueFactory(), getConfig()); - walker.walk((ParseTreeListener) listener, tree); + walker.walk(listener, tree); + } catch (ParseCancellationException pce) { + if (pce.getCause() instanceof ParsingErrorException cause) { + throw cause; + } + throw new ParsingErrorException("Parsing cancelled due to a syntax error: " + pce.getMessage(), pce); } catch (IOException e) { - throw new ParsingErrorException("Failed to parse N-Triples: " + e.getMessage(), e); + throw new ParsingErrorException("Failed to read N-Triples input: " + e.getMessage(), e); + } catch (IllegalArgumentException e) { + throw new ParsingErrorException("Invalid RDF data: " + e.getMessage(), e); } catch (Exception e) { throw new ParsingErrorException("Unexpected error during N-Triples parsing: " + e.getMessage(), e); } } + + /** + * Static inner class for a custom ANTLR error listener. + * This class throws a ParsingErrorException whenever a syntax error + * or lexical error is encountered. + * This ensures that parsing failures are consistently reported + * via the application's custom exception. 
+ */ + private static class NTriplesErrorListener extends BaseErrorListener { + + @Override + public void syntaxError(Recognizer recognizer, + Object offendingSymbol, + int line, + int charPositionInLine, + String msg, + RecognitionException e) { + String errorMessage = String.format("Syntax error at line %d:%d - %s", + line, charPositionInLine, msg); + throw new ParseCancellationException(new ParsingErrorException(errorMessage, e)); + } + } } diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/parser/ntriples/NTriplesListener.java b/src/main/java/fr/inria/corese/core/next/impl/io/parser/ntriples/NTriplesListener.java index 898ca2053..c8f7f5739 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/parser/ntriples/NTriplesListener.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/parser/ntriples/NTriplesListener.java @@ -7,6 +7,7 @@ import fr.inria.corese.core.next.api.Value; import fr.inria.corese.core.next.api.ValueFactory; import fr.inria.corese.core.next.api.io.IOOptions; +import fr.inria.corese.core.next.impl.exception.ParsingErrorException; // Import the custom exception import fr.inria.corese.core.next.impl.parser.antlr.NTriplesBaseListener; import fr.inria.corese.core.next.impl.parser.antlr.NTriplesParser; @@ -60,7 +61,13 @@ protected Resource extractSubject(NTriplesParser.SubjectContext ctx) { return factory.createIRI(unescapeUri(ctx.IRIREF().getText().substring(1, ctx.IRIREF().getText().length() - 1))); } if (ctx.BLANK_NODE_LABEL() != null) { - return factory.createBNode(ctx.BLANK_NODE_LABEL().getText().substring(2)); + String blankNodeLabel = ctx.BLANK_NODE_LABEL().getText().substring(2); + try { + validateBlankNodeLabel(blankNodeLabel); + } catch (ParsingErrorException e) { + throw new IllegalArgumentException("Invalid blank node label in subject: " + e.getMessage(), e); + } + return factory.createBNode(blankNodeLabel); } throw new IllegalArgumentException("Unsupported N-Triples subject: " + ctx.getText()); } @@ -83,7 +90,13 @@ 
protected Value extractObject(NTriplesParser.ObjectContext ctx) { return factory.createIRI(unescapeUri(ctx.IRIREF().getText().substring(1, ctx.IRIREF().getText().length() - 1))); } if (ctx.BLANK_NODE_LABEL() != null) { - return factory.createBNode(ctx.BLANK_NODE_LABEL().getText().substring(2)); + String blankNodeLabel = ctx.BLANK_NODE_LABEL().getText().substring(2); + try { + validateBlankNodeLabel(blankNodeLabel); + } catch (ParsingErrorException e) { + throw new IllegalArgumentException("Invalid blank node label in object: " + e.getMessage(), e); + } + return factory.createBNode(blankNodeLabel); } if (ctx.literal() != null) { return extractLiteral(ctx.literal()); @@ -110,6 +123,25 @@ protected Literal extractLiteral(NTriplesParser.LiteralContext ctx) { return factory.createLiteral(label); } + /** + * Validates a blank node label according to RDF N-Triples specification. + * Blank node labels must not be empty and must not contain a colon. + * They *can* start with a digit. + * + * @param label The blank node label (without the `_:` prefix). + * @throws ParsingErrorException if the label is invalid. + */ + protected void validateBlankNodeLabel(String label) throws ParsingErrorException { + if (label == null || label.isEmpty()) { + throw new ParsingErrorException("Blank node label cannot be empty."); + } + + if (label.contains(":")) { + throw new ParsingErrorException("Blank node label cannot contain a colon (':')"); + } + + } + /** * Unescapes common N-Triples literal escape sequences. * This method handles `\"`, `\\`, `\n`, `\t`, `\r`, `\b`, `\f`. @@ -203,7 +235,7 @@ protected String unescapeLiteral(String literalText) { /** * Unescapes common N-Triples URI escape sequences. - * This method handles `\>`, `\\`, `\ uXXXX`, `\UXXXXXXXX`. + * This method handles `\>`, `\\`, `\ uXXXX`, `\UXXXXXXXX`. * * @param uri The escaped URI string. * @return The unescaped URI string. 
From 015881be19bc8e554d49506e3ce999358919b052 Mon Sep 17 00:00:00 2001 From: "AD\\aabdoun" Date: Tue, 19 Aug 2025 13:40:52 +0200 Subject: [PATCH 2/8] correction parser for the N-Quads format --- src/main/antlr/NQuads.g4 | 11 +- .../io/parser/nquads/ANTLRNQuadsParser.java | 78 ++++++++++++- .../impl/io/parser/nquads/NQuadsListener.java | 110 +++++++++++++----- 3 files changed, 159 insertions(+), 40 deletions(-) diff --git a/src/main/antlr/NQuads.g4 b/src/main/antlr/NQuads.g4 index ad04f65ff..8ea9aab18 100644 --- a/src/main/antlr/NQuads.g4 +++ b/src/main/antlr/NQuads.g4 @@ -1,7 +1,7 @@ grammar NQuads; nquadsDoc - : statement? (EOL* statement)* EOL* + : statement* WS* ; statement @@ -41,8 +41,11 @@ EOL ; IRIREF -// '<' ([^#x00-#x20<>"{}|^`\] | UCHAR)* '>' - : '<' [a-zA-Z0-9-]+':' ((~( [\u0000-\u0020] | '<' | '>' | '"' | '{'| '}' | '|'| '^'| '`' | '\\' )) | UCHAR)* '>' + : '<' ( IRI_CHAR | UCHAR )* '>' + ; + +fragment IRI_CHAR + : ~ ( '\u0000'..'\u0020' | '<' | '>' | '"' | '{' | '}' | '|' | '^' | '`' | '\\' ) ; STRING_LITERAL_QUOTE @@ -90,7 +93,7 @@ PN_CHARS_U // PN_CHARS_BASE | '_' | ':' : PN_CHARS_BASE | '_' -// | ':' + | ':' ; PN_CHARS diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/ANTLRNQuadsParser.java b/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/ANTLRNQuadsParser.java index 742f2aa27..e464c58b5 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/ANTLRNQuadsParser.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/ANTLRNQuadsParser.java @@ -82,20 +82,88 @@ public void parse(Reader reader, String baseURI) throws ParsingErrorException { try { CharStream charStream = CharStreams.fromReader(reader); NQuadsLexer lexer = new NQuadsLexer(charStream); - CommonTokenStream tokens = new CommonTokenStream(lexer); + + CommonTokenStream tokens = new DirectiveAwareTokenStream(lexer); + + lexer.removeErrorListeners(); + lexer.addErrorListener(ThrowingErrorListener.INSTANCE); NQuadsParser 
antlrParser = new NQuadsParser(tokens); - ParseTreeWalker walker = new ParseTreeWalker(); + antlrParser.removeErrorListeners(); + antlrParser.addErrorListener(ThrowingErrorListener.INSTANCE); + ParseTree tree = antlrParser.nquadsDoc(); + ParseTreeWalker walker = new ParseTreeWalker(); NQuadsListener listener = new NQuadsListener(getModel(), getValueFactory(), getConfig()); - walker.walk((ParseTreeListener) listener, tree); } catch (IOException e) { throw new ParsingErrorException("Failed to parse N-Quads: " + e.getMessage(), e); - } catch (Exception e) { + } catch (RuntimeException e) { + Throwable current = e; + while (current != null) { + if (current instanceof ParsingErrorException) { + throw (ParsingErrorException) current; + } + current = current.getCause(); + } throw new ParsingErrorException("Unexpected error during N-Quads parsing: " + e.getMessage(), e); } } -} + + /** + * Custom TokenStream to check for and disallow directives like @base and @prefix in N-Quads. + * N-Quads format does not support directives. + */ + private static class DirectiveAwareTokenStream extends CommonTokenStream { + public DirectiveAwareTokenStream(Lexer lexer) { + super(lexer); + } + + @Override + public Token LT(int k) { + Token token = super.LT(k); + if (token != null) { + String text = token.getText(); + if (text != null && (text.startsWith("@base") || text.startsWith("@prefix"))) { + throw new ParsingErrorException("Directive not allowed in N-Quads: " + text); + } + } + return token; + } + } + + /** + * Custom ANTLR ErrorListener that throws a ParsingErrorException on any syntax error. + * This ensures that parsing failures are immediately reported as application-specific exceptions. 
+ */ + private static class ThrowingErrorListener extends BaseErrorListener { + static final ThrowingErrorListener INSTANCE = new ThrowingErrorListener(); + + @Override + public void syntaxError(Recognizer recognizer, Object offendingSymbol, + int line, int charPositionInLine, + String msg, RecognitionException e) { + + if (offendingSymbol != null) { + String symbolText = offendingSymbol.toString(); + + if (msg != null && msg.contains("token recognition error") && symbolText.equals("':'")) { + throw new ParsingErrorException("Invalid blank node label: colon not allowed (line " + line + ")"); + } + + if (msg != null && msg.contains("no viable alternative") && symbolText.contains("_:")) { + throw new ParsingErrorException("Invalid blank node label: colon not allowed (line " + line + ")"); + } + + if (symbolText.contains("_:") && symbolText.contains(":") && !symbolText.equals("_:")) { + throw new ParsingErrorException("Invalid blank node label: colon not allowed (line " + line + ")"); + } + } + + throw new ParsingErrorException( + String.format("line %d:%d %s", line, charPositionInLine, msg)); + } + } +} \ No newline at end of file diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/NQuadsListener.java b/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/NQuadsListener.java index 6280d47a8..89a8d7560 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/NQuadsListener.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/NQuadsListener.java @@ -7,6 +7,7 @@ import fr.inria.corese.core.next.api.Value; import fr.inria.corese.core.next.api.ValueFactory; import fr.inria.corese.core.next.api.io.IOOptions; +import fr.inria.corese.core.next.impl.exception.ParsingErrorException; import fr.inria.corese.core.next.impl.parser.antlr.NQuadsBaseListener; import fr.inria.corese.core.next.impl.parser.antlr.NQuadsParser; @@ -39,21 +40,22 @@ public NQuadsListener(Model model, ValueFactory factory, IOOptions 
options) { this.options = options; } + + + /** + * Enters a statement context, extracting the subject, the predicate and the optional graph label. + * These values are used when the statement is exited. + * @param ctx The StatementContext from the ANTLR parse tree. + */ @Override public void enterStatement(NQuadsParser.StatementContext ctx) { - currentSubject = extractSubject(ctx.subject()); currentPredicate = extractPredicate(ctx.predicate()); - if (ctx.graphLabel() != null) { - currentGraph = extractGraph(ctx.graphLabel()); - } else { - currentGraph = null; - } + currentGraph = (ctx.graphLabel() != null) ? extractGraph(ctx.graphLabel()) : null; } @Override public void exitStatement(NQuadsParser.StatementContext ctx) { - Value object = extractObject(ctx.object()); if (currentGraph != null) { model.add(currentSubject, currentPredicate, object, currentGraph); @@ -67,59 +69,82 @@ public void exitStatement(NQuadsParser.StatementContext ctx) { /** * Extracts a resource (IRI or Blank Node) from the subject context. + * Handles unescaping of URI characters for IRIs and extracts blank node labels. + * @param ctx The SubjectContext from the ANTLR parse tree. + * @return The created Resource (IRI or BNode). + * @throws ParsingErrorException if the subject type is unsupported or blank node label is invalid. 
*/ protected Resource extractSubject(NQuadsParser.SubjectContext ctx) { if (ctx.IRIREF() != null) { - return factory.createIRI(unescapeUri(ctx.IRIREF().getText().substring(1, ctx.IRIREF().getText().length() - 1))); + return factory.createIRI(unescapeUri(stripAngles(ctx.IRIREF().getText()))); } if (ctx.BLANK_NODE_LABEL() != null) { - return factory.createBNode(ctx.BLANK_NODE_LABEL().getText().substring(2)); + String label = ctx.BLANK_NODE_LABEL().getText().substring(2); + validateBlankNodeLabel(label); + return factory.createBNode(label); } - throw new IllegalArgumentException("Unsupported N-Quads subject: " + ctx.getText()); + throw new ParsingErrorException("Unsupported N-Quads subject: " + ctx.getText()); } - /** * Extracts a predicate (IRI) from the predicate context. + * Handles unescaping of URI characters. + * @param ctx The PredicateContext from the ANTLR parse tree. + * @return The created IRI. + * @throws ParsingErrorException if the predicate type is unsupported. */ protected IRI extractPredicate(NQuadsParser.PredicateContext ctx) { if (ctx.IRIREF() != null) { - return factory.createIRI(unescapeUri(ctx.IRIREF().getText().substring(1, ctx.IRIREF().getText().length() - 1))); + return factory.createIRI(unescapeUri(stripAngles(ctx.IRIREF().getText()))); } - throw new IllegalArgumentException("Unsupported N-Quads predicate: " + ctx.getText()); + throw new ParsingErrorException("Unsupported N-Quads predicate: " + ctx.getText()); } /** * Extracts a value (IRI, Blank Node, or Literal) from the object context. + * Delegates to specific extraction methods based on the object type. + * @param ctx The ObjectContext from the ANTLR parse tree. + * @return The created Value (IRI, BNode, or Literal). + * @throws ParsingErrorException if the object type is unsupported or blank node label is invalid. 
*/ protected Value extractObject(NQuadsParser.ObjectContext ctx) { if (ctx.IRIREF() != null) { - return factory.createIRI(unescapeUri(ctx.IRIREF().getText().substring(1, ctx.IRIREF().getText().length() - 1))); + return factory.createIRI(unescapeUri(stripAngles(ctx.IRIREF().getText()))); } if (ctx.BLANK_NODE_LABEL() != null) { - return factory.createBNode(ctx.BLANK_NODE_LABEL().getText().substring(2)); + String label = ctx.BLANK_NODE_LABEL().getText().substring(2); + validateBlankNodeLabel(label); + return factory.createBNode(label); } if (ctx.literal() != null) { return extractLiteral(ctx.literal()); } - throw new IllegalArgumentException("Unsupported N-Quads object: " + ctx.getText()); + throw new ParsingErrorException("Unsupported N-Quads object: " + ctx.getText()); } /** * Extracts a graph (IRI or Blank Node) from the graph context. + * Handles unescaping of URI characters for IRIs and extracts blank node labels. + * @param ctx The GraphLabelContext from the ANTLR parse tree. + * @return The created Resource (IRI or BNode) representing the graph. + * @throws ParsingErrorException if the graph label type is unsupported or blank node label is invalid. */ protected Resource extractGraph(NQuadsParser.GraphLabelContext ctx) { if (ctx.IRIREF() != null) { - return factory.createIRI(unescapeUri(ctx.IRIREF().getText().substring(1, ctx.IRIREF().getText().length() - 1))); + return factory.createIRI(unescapeUri(stripAngles(ctx.IRIREF().getText()))); } if (ctx.BLANK_NODE_LABEL() != null) { - return factory.createBNode(ctx.BLANK_NODE_LABEL().getText().substring(2)); + String label = ctx.BLANK_NODE_LABEL().getText().substring(2); + validateBlankNodeLabel(label); + return factory.createBNode(label); } - throw new IllegalArgumentException("Unsupported N-Quads graph: " + ctx.getText()); + throw new ParsingErrorException("Unsupported N-Quads graph: " + ctx.getText()); } /** * Extracts and unescapes a literal from the ANTLR context. 
* This method handles string literals with or without datatype/language. + * @param ctx The LiteralContext from the ANTLR parse tree. + * @return The created Literal value. */ protected Literal extractLiteral(NQuadsParser.LiteralContext ctx) { String label = ctx.STRING_LITERAL_QUOTE().getText(); @@ -139,12 +164,12 @@ protected Literal extractLiteral(NQuadsParser.LiteralContext ctx) { /** * Unescapes common N-Quads literal escape sequences. - * This method handles `\"`, `\\`, `\n`, `\t`, `\r`, `\b`, `\f`. - * It also handles `\ uXXXX` and `\UXXXXXXXX` for Unicode escapes. + * This method handles \", \\, \n, \t, \r, \b, \f. * It also removes the surrounding quotes from the literal string. * * @param literalText The raw literal string from ANTLR (including quotes and escapes). * @return The unescaped literal string without surrounding quotes. + * @throws ParsingErrorException if an invalid Unicode escape sequence is found. */ protected String unescapeLiteral(String literalText) { String unquotedLiteral = literalText.substring(1, literalText.length() - 1); @@ -191,10 +216,10 @@ protected String unescapeLiteral(String literalText) { sb.append((char) unicodeChar); i += 5; } catch (NumberFormatException e) { - throw new IllegalArgumentException("Invalid \\uXXXX escape sequence in literal: \\u" + hex); + throw new ParsingErrorException("Invalid \\uXXXX escape sequence in literal: \\u" + hex); } } else { - throw new IllegalArgumentException("Incomplete \\uXXXX escape sequence in literal: " + unquotedLiteral.substring(i)); + throw new ParsingErrorException("Incomplete \\uXXXX escape sequence in literal: " + unquotedLiteral.substring(i)); } break; case 'U': @@ -210,10 +235,10 @@ protected String unescapeLiteral(String literalText) { } i += 9; } catch (NumberFormatException e) { - throw new IllegalArgumentException("Invalid \\UXXXXXXXX escape sequence in literal: \\U" + hex); + throw new ParsingErrorException("Invalid \\UXXXXXXXX escape sequence in literal: \\U" + hex); } } 
else { - throw new IllegalArgumentException("Incomplete \\UXXXXXXXX escape sequence in literal: " + unquotedLiteral.substring(i)); + throw new ParsingErrorException("Incomplete \\UXXXXXXXX escape sequence in literal: " + unquotedLiteral.substring(i)); } break; default: @@ -230,10 +255,11 @@ protected String unescapeLiteral(String literalText) { /** * Unescapes common N-Quads URI escape sequences. - * This method handles `\>`, `\\`, `\ uXXXX`, `\UXXXXXXXX`. + * This method handles \>, \\, \ uXXXX, \UXXXXXXXX. * * @param uri The escaped URI string. * @return The unescaped URI string. + * @throws ParsingErrorException if an invalid Unicode escape sequence is found. */ protected String unescapeUri(String uri) { StringBuilder sb = new StringBuilder(); @@ -258,10 +284,10 @@ protected String unescapeUri(String uri) { sb.append((char) unicodeChar); i += 5; } catch (NumberFormatException e) { - throw new IllegalArgumentException("Invalid \\uXXXX escape sequence in URI: \\u" + hex); + throw new ParsingErrorException("Invalid \\uXXXX escape sequence in URI: \\u" + hex); } } else { - throw new IllegalArgumentException("Incomplete \\uXXXX escape sequence in URI: " + uri.substring(i)); + throw new ParsingErrorException("Incomplete \\uXXXX escape sequence in URI: " + uri.substring(i)); } break; case 'U': @@ -277,10 +303,10 @@ protected String unescapeUri(String uri) { } i += 9; } catch (NumberFormatException e) { - throw new IllegalArgumentException("Invalid \\UXXXXXXXX escape sequence in URI: \\U" + hex); + throw new ParsingErrorException("Invalid \\UXXXXXXXX escape sequence in URI: \\U" + hex); } } else { - throw new IllegalArgumentException("Incomplete \\UXXXXXXXX escape sequence in URI: " + uri.substring(i)); + throw new ParsingErrorException("Incomplete \\UXXXXXXXX escape sequence in URI: " + uri.substring(i)); } break; default: @@ -294,4 +320,26 @@ protected String unescapeUri(String uri) { } return sb.toString(); } -} + private String stripAngles(String iriRef) { + return 
iriRef.substring(1, iriRef.length() - 1); + } + /** + * Validates a blank node label according to RDF 1.1 N-Quads specification. + * Blank node labels must match PN_LOCAL rules, which means they cannot be empty, + * and cannot contain colons. They *can* start with a digit. + * @param label The blank node label string (without the "_:" prefix). + * @throws ParsingErrorException if the blank node label is invalid. + */ + protected void validateBlankNodeLabel(String label) { + if (label.isEmpty()) { + throw new ParsingErrorException("Blank node label cannot be empty"); + } + if (label.contains(":")) { + throw new ParsingErrorException("Blank node label cannot contain colon"); + } + + if (!label.matches("^[A-Za-z_0-9][A-Za-z0-9_\\-\\.]*$")) { + throw new ParsingErrorException("Invalid blank node label syntax: " + label); + } + } +} \ No newline at end of file From b9c63a94249f8fb609cd7debb11ef2b1a090637b Mon Sep 17 00:00:00 2001 From: "AD\\aabdoun" Date: Tue, 19 Aug 2025 13:52:37 +0200 Subject: [PATCH 3/8] correction parser for the N-Quads format --- .../core/next/impl/io/parser/nquads/ANTLRNQuadsParser.java | 4 +--- .../next/impl/io/parser/nquads/NQuadsListenerTest.java | 7 ++++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/ANTLRNQuadsParser.java b/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/ANTLRNQuadsParser.java index e464c58b5..99af024de 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/ANTLRNQuadsParser.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/ANTLRNQuadsParser.java @@ -8,9 +8,7 @@ import fr.inria.corese.core.next.impl.exception.ParsingErrorException; import fr.inria.corese.core.next.impl.parser.antlr.NQuadsLexer; import fr.inria.corese.core.next.impl.parser.antlr.NQuadsParser; -import org.antlr.v4.runtime.CharStream; -import org.antlr.v4.runtime.CharStreams; -import org.antlr.v4.runtime.CommonTokenStream; 
+import org.antlr.v4.runtime.*; import org.antlr.v4.runtime.tree.ParseTree; import org.antlr.v4.runtime.tree.ParseTreeListener; import org.antlr.v4.runtime.tree.ParseTreeWalker; diff --git a/src/test/java/fr/inria/corese/core/next/impl/io/parser/nquads/NQuadsListenerTest.java b/src/test/java/fr/inria/corese/core/next/impl/io/parser/nquads/NQuadsListenerTest.java index 9961f68ba..1890222b4 100644 --- a/src/test/java/fr/inria/corese/core/next/impl/io/parser/nquads/NQuadsListenerTest.java +++ b/src/test/java/fr/inria/corese/core/next/impl/io/parser/nquads/NQuadsListenerTest.java @@ -2,6 +2,7 @@ import fr.inria.corese.core.next.api.*; import fr.inria.corese.core.next.api.io.IOOptions; +import fr.inria.corese.core.next.impl.exception.ParsingErrorException; import fr.inria.corese.core.next.impl.parser.antlr.NQuadsParser; import org.antlr.v4.runtime.ParserRuleContext; import org.antlr.v4.runtime.tree.TerminalNode; @@ -310,7 +311,7 @@ void testUnescapeLiteralInvalidUx() throws NoSuchMethodException { String input = "\"Invalid\\U0000XXX\""; java.lang.reflect.Method method = NQuadsListener.class.getDeclaredMethod("unescapeLiteral", String.class); method.setAccessible(true); - assertThrows(IllegalArgumentException.class, + assertThrows(ParsingErrorException.class, () -> listener.unescapeLiteral(input), "Should throw for malformed \\UXXXXXXXX escape sequence"); } @@ -346,13 +347,13 @@ void testUnescapeUriUnicodeU() throws NoSuchMethodException, java.lang.reflect.I @Test - @DisplayName("unescapeUri should throw IllegalArgumentException for invalid \\uXXXX") + @DisplayName("unescapeUri should throw ParsingErrorException for invalid \\uXXXX") void testUnescapeUriInvalidU() throws NoSuchMethodException { String input = "http://example.org/invalid\\uXXX"; java.lang.reflect.Method method = NQuadsListener.class.getDeclaredMethod("unescapeUri", String.class); method.setAccessible(true); - assertThrows(IllegalArgumentException.class, + assertThrows(ParsingErrorException.class, () -> 
listener.unescapeLiteral(input), "Should throw unescapeUri should throw IllegalArgumentException for invalid \\uXXXX"); From d941dd2cb68b074191740e7507967186bbe2d4d4 Mon Sep 17 00:00:00 2001 From: "AD\\aabdoun" Date: Wed, 20 Aug 2025 09:08:08 +0200 Subject: [PATCH 4/8] correction parser for the N-Triples format --- .../core/next/impl/io/parser/ntriples/NTriplesListener.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/parser/ntriples/NTriplesListener.java b/src/main/java/fr/inria/corese/core/next/impl/io/parser/ntriples/NTriplesListener.java index c8f7f5739..b9d58d6ea 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/parser/ntriples/NTriplesListener.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/parser/ntriples/NTriplesListener.java @@ -7,7 +7,7 @@ import fr.inria.corese.core.next.api.Value; import fr.inria.corese.core.next.api.ValueFactory; import fr.inria.corese.core.next.api.io.IOOptions; -import fr.inria.corese.core.next.impl.exception.ParsingErrorException; // Import the custom exception +import fr.inria.corese.core.next.impl.exception.ParsingErrorException; import fr.inria.corese.core.next.impl.parser.antlr.NTriplesBaseListener; import fr.inria.corese.core.next.impl.parser.antlr.NTriplesParser; From d8d141a5db23a00efc8e90a09b9285bb0b840918 Mon Sep 17 00:00:00 2001 From: "AD\\aabdoun" Date: Wed, 20 Aug 2025 14:40:28 +0200 Subject: [PATCH 5/8] correction parser for the N-Quads format --- src/main/antlr/NQuads.g4 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/antlr/NQuads.g4 b/src/main/antlr/NQuads.g4 index 8ea9aab18..431e6a110 100644 --- a/src/main/antlr/NQuads.g4 +++ b/src/main/antlr/NQuads.g4 @@ -1,7 +1,7 @@ grammar NQuads; nquadsDoc - : statement* WS* + : statement? 
(EOL* statement)* EOL* ; statement From 305e0f009d2e64b0e090c2520c7827567975113a Mon Sep 17 00:00:00 2001 From: Pierre Maillot Date: Thu, 28 Aug 2025 09:59:32 +0200 Subject: [PATCH 6/8] fringe test for weird litteral containg forbiden keywords --- .../io/parser/nquads/ANTLRNQuadsParserTest.java | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/test/java/fr/inria/corese/core/next/impl/io/parser/nquads/ANTLRNQuadsParserTest.java b/src/test/java/fr/inria/corese/core/next/impl/io/parser/nquads/ANTLRNQuadsParserTest.java index e0a26f63e..93506d5c7 100644 --- a/src/test/java/fr/inria/corese/core/next/impl/io/parser/nquads/ANTLRNQuadsParserTest.java +++ b/src/test/java/fr/inria/corese/core/next/impl/io/parser/nquads/ANTLRNQuadsParserTest.java @@ -194,4 +194,17 @@ void testParseUnicodeEscapeIRIUxInGraph() throws ParsingErrorException { parser.parse(new StringReader(nquad)); verify(mockModel).add(mockSubjectIRI, mockPredicateIRI, mockObjectIRI, expectedGraphIRI); } + + @Test + @DisplayName("Test parsing a document that contains a literal that is a malformed document") + void testMalformedDocumentInception() throws ParsingErrorException { + String doc = """ + . + \"\"\"@base . + . 
+ """; + StringReader reader = new StringReader(doc); + parser.parse(reader); + } } From 482517948a7a568d0d522521ef517d02523a56ad Mon Sep 17 00:00:00 2001 From: "AD\\aabdoun" Date: Thu, 28 Aug 2025 16:43:49 +0200 Subject: [PATCH 7/8] correction parser for the N-Quads format --- src/main/antlr/NQuads.g4 | 3 +- .../io/parser/nquads/ANTLRNQuadsParser.java | 23 +-------------- .../impl/io/parser/nquads/NQuadsListener.java | 28 +++++++++++++++---- .../parser/nquads/ANTLRNQuadsParserTest.java | 3 ++ 4 files changed, 29 insertions(+), 28 deletions(-) diff --git a/src/main/antlr/NQuads.g4 b/src/main/antlr/NQuads.g4 index 431e6a110..19e4648ed 100644 --- a/src/main/antlr/NQuads.g4 +++ b/src/main/antlr/NQuads.g4 @@ -49,7 +49,8 @@ fragment IRI_CHAR ; STRING_LITERAL_QUOTE - : '"' ( ~( [\u0022] | [\u005C] | [\u000A] | [\u000D] ) | ECHAR | UCHAR )* '"' + : '"""' ( ~('"') | '"' ~('"') | '""' ~('"') | ECHAR | UCHAR )* '"""' + | '"' ( ~( [\u0022] | [\u005C] | [\u000A] | [\u000D] ) | ECHAR | UCHAR )* '"' ; BLANK_NODE_LABEL diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/ANTLRNQuadsParser.java b/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/ANTLRNQuadsParser.java index 99af024de..29bdea34e 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/ANTLRNQuadsParser.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/ANTLRNQuadsParser.java @@ -81,7 +81,7 @@ public void parse(Reader reader, String baseURI) throws ParsingErrorException { CharStream charStream = CharStreams.fromReader(reader); NQuadsLexer lexer = new NQuadsLexer(charStream); - CommonTokenStream tokens = new DirectiveAwareTokenStream(lexer); + CommonTokenStream tokens = new CommonTokenStream(lexer); lexer.removeErrorListeners(); lexer.addErrorListener(ThrowingErrorListener.INSTANCE); @@ -110,27 +110,6 @@ public void parse(Reader reader, String baseURI) throws ParsingErrorException { } } - /** - * Custom TokenStream to check for and disallow 
directives like @base and @prefix in N-Quads. - * N-Quads format does not support directives. - */ - private static class DirectiveAwareTokenStream extends CommonTokenStream { - public DirectiveAwareTokenStream(Lexer lexer) { - super(lexer); - } - - @Override - public Token LT(int k) { - Token token = super.LT(k); - if (token != null) { - String text = token.getText(); - if (text != null && (text.startsWith("@base") || text.startsWith("@prefix"))) { - throw new ParsingErrorException("Directive not allowed in N-Quads: " + text); - } - } - return token; - } - } /** * Custom ANTLR ErrorListener that throws a ParsingErrorException on any syntax error. diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/NQuadsListener.java b/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/NQuadsListener.java index 89a8d7560..4e9518be9 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/NQuadsListener.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/NQuadsListener.java @@ -147,11 +147,17 @@ protected Resource extractGraph(NQuadsParser.GraphLabelContext ctx) { * @return The created Literal value. 
*/ protected Literal extractLiteral(NQuadsParser.LiteralContext ctx) { - String label = ctx.STRING_LITERAL_QUOTE().getText(); - label = unescapeLiteral(label); + String rawLiteralText; + if (ctx.STRING_LITERAL_QUOTE() != null) { + rawLiteralText = ctx.STRING_LITERAL_QUOTE().getText(); + } + else { + throw new ParsingErrorException("Unsupported literal type or missing literal token: " + ctx.getText()); + } + String label = unescapeLiteral(rawLiteralText); if (ctx.IRIREF() != null) { - IRI datatype = factory.createIRI(unescapeUri(ctx.IRIREF().getText().substring(1, ctx.IRIREF().getText().length() - 1))); + IRI datatype = factory.createIRI(unescapeUri(stripAngles(ctx.IRIREF().getText()))); return factory.createLiteral(label, datatype); } if (ctx.LANGTAG() != null) { @@ -172,7 +178,19 @@ protected Literal extractLiteral(NQuadsParser.LiteralContext ctx) { * @throws ParsingErrorException if an invalid Unicode escape sequence is found. */ protected String unescapeLiteral(String literalText) { - String unquotedLiteral = literalText.substring(1, literalText.length() - 1); + String unquotedLiteral; + int quoteLength = 0; + + if (literalText.startsWith("\"\"\"") && literalText.endsWith("\"\"\"")) { // Triple quotes + quoteLength = 3; + } else if (literalText.startsWith("\"") && literalText.endsWith("\"")) { // Single quotes + quoteLength = 1; + } else { + + throw new ParsingErrorException("Literal text does not start/end with expected N-Quads quotes: " + literalText); + } + + unquotedLiteral = literalText.substring(quoteLength, literalText.length() - quoteLength); StringBuilder sb = new StringBuilder(); for (int i = 0; i < unquotedLiteral.length(); i++) { @@ -342,4 +360,4 @@ protected void validateBlankNodeLabel(String label) { throw new ParsingErrorException("Invalid blank node label syntax: " + label); } } -} \ No newline at end of file +} diff --git a/src/test/java/fr/inria/corese/core/next/impl/io/parser/nquads/ANTLRNQuadsParserTest.java 
b/src/test/java/fr/inria/corese/core/next/impl/io/parser/nquads/ANTLRNQuadsParserTest.java index 93506d5c7..539d54539 100644 --- a/src/test/java/fr/inria/corese/core/next/impl/io/parser/nquads/ANTLRNQuadsParserTest.java +++ b/src/test/java/fr/inria/corese/core/next/impl/io/parser/nquads/ANTLRNQuadsParserTest.java @@ -91,6 +91,7 @@ void setUp() { lenient().when(mockValueFactory.createLiteral(eq("hello"), eq("en"))).thenReturn(mockLangLiteral); lenient().when(mockValueFactory.createLiteral(eq("123"), any(IRI.class))).thenReturn(mockTypedLiteral); lenient().when(mockValueFactory.createLiteral(eq("literal with \"quotes\" and \n newline"))).thenReturn(mockEscapedLiteral); + lenient().when(mockValueFactory.createLiteral(eq("@base .\n Date: Fri, 29 Aug 2025 15:24:10 +0200 Subject: [PATCH 8/8] =?UTF-8?q?pour=20=C3=A9viter=20les=20erreurs=20d'inde?= =?UTF-8?q?x?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../next/impl/io/parser/nquads/NQuadsListener.java | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/NQuadsListener.java b/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/NQuadsListener.java index 4e9518be9..1cca33d6f 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/NQuadsListener.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/parser/nquads/NQuadsListener.java @@ -179,14 +179,18 @@ protected Literal extractLiteral(NQuadsParser.LiteralContext ctx) { */ protected String unescapeLiteral(String literalText) { String unquotedLiteral; - int quoteLength = 0; - - if (literalText.startsWith("\"\"\"") && literalText.endsWith("\"\"\"")) { // Triple quotes + int quoteLength; + if (literalText.startsWith("\"\"\"") && literalText.endsWith("\"\"\"")) { + if (literalText.length() < 6) { + throw new ParsingErrorException("Invalid triple-quoted string"); + } quoteLength = 3; - } else if 
(literalText.startsWith("\"") && literalText.endsWith("\"")) { // Single quotes + } else if (literalText.startsWith("\"") && literalText.endsWith("\"")) { + if (literalText.length() < 2) { + throw new ParsingErrorException("Invalid single-quoted string"); + } quoteLength = 1; } else { - throw new ParsingErrorException("Literal text does not start/end with expected N-Quads quotes: " + literalText); }