diff --git a/build.gradle.kts b/build.gradle.kts index 5f8a27673..55fa51cbb 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -15,6 +15,7 @@ plugins { id("com.gradleup.shadow") version "8.3.7" id("org.sonarqube") version "6.1.0.5360" // SonarQube integration id("com.intershop.gradle.javacc") version "5.0.1" // JavaCC plugin for parsing JavaCC files + id("antlr") // Antlr plugin for generating parsers from grammar files } // SonarQube configuration @@ -69,11 +70,11 @@ object Meta { // Project description const val desc = "Corese is a Semantic Web Factory (triple store and SPARQL endpoint) implementing RDF, RDFS, SPARQL 1.1 Query and Update, Shacl. STTL. LDScript." const val githubRepo = "corese-stack/corese-core" - + // License information const val license = "CeCILL-C License" const val licenseUrl = "https://opensource.org/licenses/CeCILL-C" - + // Sonatype OSSRH publishing settings const val release = "https://oss.sonatype.org/service/local/staging/deploy/maven2/" const val snapshot = "https://oss.sonatype.org/content/repositories/snapshots/" @@ -113,6 +114,10 @@ dependencies { implementation("fr.inria.corese.org.semarglproject:semargl-rdfa:0.7.2") // RDFa parser (Semargl) implementation("com.github.jsonld-java:jsonld-java:0.13.4") // JSON-LD processing + // === Antlr dependencies === + antlr("org.antlr:antlr4:4.13.2") // Antlr for parsing (ANTLR 4) + implementation("org.antlr:antlr4-runtime:4.13.2") // Antlr runtime for parsing + // === HTTP and XML === implementation("org.glassfish.jersey.core:jersey-client:3.1.10") // HTTP client (Jersey) implementation("org.glassfish.jersey.inject:jersey-hk2:3.1.10") // Dependency injection for Jersey @@ -143,7 +148,7 @@ publishing { // Configure the publication to include JAR, sources, and Javadoc from(components["java"]) - // Configures version mapping to control how dependency versions are resolved + // Configures version mapping to control how dependency versions are resolved // for different usage contexts (API and 
// === Antlr generated sources configuration ===

// Extra, in-tree directory that is compiled and packaged together with the main sources.
// NOTE(review): nothing visible in this script writes to this directory (the Antlr task below
// generates into the build directory) - confirm whether it is still needed.
val generatedSourcesPath = "src/main/generated"

// Add the generated sources directory to the main source set
sourceSets["main"].java.srcDir(file(generatedSourcesPath))

// Configure the Antlr task: also generate visitor classes, use long error messages and place
// every generated class in the parser package expected by the hand-written code.
tasks.named<AntlrTask>("generateGrammarSource") {
    arguments.addAll(
        listOf(
            "-visitor",
            "-long-messages",
            "-package", "fr.inria.corese.core.next.impl.parser.antlr"
        )
    )
    // `layout.buildDirectory` replaces the deprecated `Project.buildDir` accessor.
    // The directory is already a declared output of the task, so no extra `outputs.dirs` call is needed.
    outputDirectory = layout.buildDirectory.dir("generated-src/antlr/main").get().asFile
}

// Ensure Java compilation depends on Antlr code generation
tasks.named("compileJava") {
    dependsOn("generateGrammarSource")
}

// Ensure the sources JAR is only assembled after the parser has been generated
tasks.named<Jar>("sourcesJar") {
    dependsOn("generateGrammarSource")
    from(generatedSourcesPath)
    includeEmptyDirs = false
}

// Clean up the in-tree generated sources on clean
tasks.clean {
    doLast {
        file(generatedSourcesPath).deleteRecursively()
    }
}
/* Derived from http://www.w3.org/TR/turtle/#sec-grammar-grammar */

// $antlr-format alignTrailingComments true, columnLimit 150, minEmptyLines 1, maxEmptyLinesToKeep 1, reflowComments false, useTab false
// $antlr-format allowShortRulesOnASingleLine false, allowShortBlocksOnASingleLine true, alignSemicolons hanging, alignColons hanging

grammar Turtle;

// ---------------------------------------------------------------------------
// PARSER
// ---------------------------------------------------------------------------

turtleDoc
    : statement* EOF
    ;

statement
    : directive
    | triples '.'
    ;

directive
    : prefixID
    | base
    | sparqlPrefix
    | sparqlBase
    ;

triples
    : subject predicateObjectList
    | blankNodePropertyList predicateObjectList?
    ;

predicateObjectList
    : verb objectList (';' (verb objectList)?)*
    ;

objectList
    : object_ (',' object_)*
    ;

verb
    : predicate
    | 'a'
    ;

subject
    : iri
    | BlankNode
    | collection
    ;

predicate
    : iri
    ;

object_
    : iri
    | BlankNode
    | collection
    | blankNodePropertyList
    | literal
    ;

literal
    : rdfLiteral
    | numericLiteral
    | BooleanLiteral
    ;

blankNodePropertyList
    : '[' predicateObjectList ']'
    ;

collection
    : '(' object_* ')'
    ;

prefixID
    : '@prefix' PNAME_NS IRIREF '.'
    ;

base
    : '@base' IRIREF '.'
    ;

sparqlBase
    : Base_w IRIREF
    ;

sparqlPrefix
    : Prefix_w PNAME_NS IRIREF
    ;

numericLiteral
    : INTEGER
    | DECIMAL
    | DOUBLE
    ;

rdfLiteral
    : string (LANGTAG | '^^' iri)?
    ;

string
    : STRING_LITERAL_QUOTE
    | STRING_LITERAL_SINGLE_QUOTE
    | STRING_LITERAL_LONG_SINGLE_QUOTE
    | STRING_LITERAL_LONG_QUOTE
    ;

// A prefix without local part ("ex:" or ":") is always tokenized as PNAME_NS because that
// lexer rule is declared before PrefixedName, so it has to be accepted here explicitly.
iri
    : IRIREF
    | PrefixedName
    | PNAME_NS
    ;

// ---------------------------------------------------------------------------
// LEXER
//
// The relative order of the token rules below matters: when two rules match the
// same text, the one declared first wins. Helper rules are fragments so that they
// never produce tokens of their own.
// ---------------------------------------------------------------------------

BooleanLiteral
    : 'true'
    | 'false'
    ;

BlankNode
    : BLANK_NODE_LABEL
    | ANON
    ;

WS
    : ([\t\r\n\u000C] | ' ')+ -> skip
    ;

Base_w options { caseInsensitive=true; }
    : 'BASE'
    ;

Prefix_w options { caseInsensitive=true; }
    : 'PREFIX'
    ;

// Prefix without the final ':'
fragment PN_PREFIX
    : PN_CHARS_BASE ((PN_CHARS | '.')* PN_CHARS)?
    ;

// '<' ([^#x00-#x20<>"{}|^`\] | UCHAR)* '>'
IRIREF
    : '<' (~[\u0000-\u0020<>"{}|^`\\] | UCHAR)* '>'
    ;

// Prefix alone
PNAME_NS
    : PN_PREFIX? ':'
    ;

PrefixedName
    : PNAME_LN
    | PNAME_NS
    ;

// Prefix + local name
PNAME_LN
    : PNAME_NS PN_LOCAL
    ;

BLANK_NODE_LABEL
    : '_:' (PN_CHARS_U | [0-9]) ((PN_CHARS | '.')* PN_CHARS)?
    ;

// '@' [a-zA-Z]+ ('-' [a-zA-Z0-9]+)*  -- every subtag needs at least one character
LANGTAG
    : '@' [a-zA-Z]+ ('-' [a-zA-Z0-9]+)*
    ;

INTEGER
    : [+-]? [0-9]+
    ;

DECIMAL
    : [+-]? [0-9]* '.' [0-9]+
    ;

DOUBLE
    : [+-]? ([0-9]+ '.' [0-9]* EXPONENT | '.' [0-9]+ EXPONENT | [0-9]+ EXPONENT)
    ;

fragment EXPONENT
    : [eE] [+-]? [0-9]+
    ;

// "'''" (("'" | "''")? ([^'\] | ECHAR | UCHAR))* "'''"
// Note: in ANTLR a negated set is written ~[...]; "[^...]" would be a literal set containing '^'.
STRING_LITERAL_LONG_SINGLE_QUOTE
    : '\'\'\'' (('\'' | '\'\'')? (~['\\] | ECHAR | UCHAR))* '\'\'\''
    ;

// '"""' (('"' | '""')? ([^"\] | ECHAR | UCHAR))* '"""'
STRING_LITERAL_LONG_QUOTE
    : '"""' (('"' | '""')? (~["\\] | ECHAR | UCHAR))* '"""'
    ;

// '"' ([^#x22#x5C#xA#xD] | ECHAR | UCHAR)* '"'
// An unescaped '"' must end the literal, otherwise two literals on one line merge into one token.
STRING_LITERAL_QUOTE
    : '"' (~["\\\r\n] | ECHAR | UCHAR)* '"'
    ;

// "'" ([^#x27#x5C#xA#xD] | ECHAR | UCHAR)* "'"
STRING_LITERAL_SINGLE_QUOTE
    : '\'' (~['\\\r\n] | ECHAR | UCHAR)* '\''
    ;

// Hexadecimal unicode character
fragment UCHAR
    : '\\u' HEX HEX HEX HEX
    | '\\U' HEX HEX HEX HEX HEX HEX HEX HEX
    ;

// Escaped character
fragment ECHAR
    : '\\' [tbnrf"'\\]
    ;

fragment ANON_WS
    : ' '
    | '\t'
    | '\r'
    | '\n'
    ;

ANON
    : '[' ANON_WS* ']'
    ;

fragment PN_CHARS_BASE
    : 'A' .. 'Z'
    | 'a' .. 'z'
    | '\u00C0' .. '\u00D6'
    | '\u00D8' .. '\u00F6'
    | '\u00F8' .. '\u02FF'
    | '\u0370' .. '\u037D'
    | '\u037F' .. '\u1FFF'
    | '\u200C' .. '\u200D'
    | '\u2070' .. '\u218F'
    | '\u2C00' .. '\u2FEF'
    | '\u3001' .. '\uD7FF'
    | '\uF900' .. '\uFDCF'
    | '\uFDF0' .. '\uFFFD'
    // The supplementary range #x10000-#xEFFFF of the W3C grammar is not covered:
    // | '\u10000' .. '\uEFFFF'
    ;

fragment PN_CHARS_U
    : PN_CHARS_BASE
    | '_'
    ;

fragment PN_CHARS
    : PN_CHARS_U
    | '-'
    | [0-9]
    | [\u00B7]
    | [\u0300-\u036F]
    | [\u203F-\u2040]
    ;

fragment PN_LOCAL
    : (PN_CHARS_U | ':' | [0-9] | PLX) ((PN_CHARS | '.' | ':' | PLX)* (PN_CHARS | ':' | PLX))?
    ;

fragment PLX
    : PERCENT
    | PN_LOCAL_ESC
    ;

fragment PERCENT
    : '%' HEX HEX
    ;

fragment HEX
    : [0-9]
    | [A-F]
    | [a-f]
    ;

fragment PN_LOCAL_ESC
    : '\\' (
        '_'
        | '~'
        | '.'
        | '-'
        | '!'
        | '$'
        | '&'
        | '\''
        | '('
        | ')'
        | '*'
        | '+'
        | ','
        | ';'
        | '='
        | '/'
        | '?'
        | '#'
        | '@'
        | '%'
    )
    ;

// Line comment
LC
    : '#' ~[\r\n]* -> channel(HIDDEN)
    ;
/**
 * Describes an RDF serialization format: its name, MIME types, file extensions, default charset
 * and the RDF features its syntax is able to encode.
 */
public interface RDFFormat {

    /**
     * Gets the name of this format.
     *
     * @return A human-readable format name, e.g. "Turtle".
     */
    String getName();

    /**
     * Gets the default MIME type for this format.
     *
     * @return A MIME type string, e.g. "text/turtle".
     */
    String getDefaultMIMEType();

    /**
     * Checks if the specified MIME type matches the format's default MIME type. The MIME types are compared
     * ignoring upper/lower-case differences.
     *
     * @param mimeType The MIME type to compare to the format's default MIME type.
     * @return true if the specified MIME type matches the format's default MIME type.
     */
    boolean hasDefaultMIMEType(String mimeType);

    /**
     * Gets the format's MIME types.
     *
     * @return An unmodifiable list of MIME type strings, e.g. "text/turtle".
     */
    List<String> getMIMETypes();

    /**
     * Checks if the specified MIME type matches one of the format's MIME types. The MIME types are compared
     * ignoring upper/lower-case differences.
     *
     * @param mimeType The MIME type to compare to the format's MIME types.
     * @return true if the specified MIME type matches one of the format's MIME types.
     */
    boolean hasMIMEType(String mimeType);

    /**
     * Gets the default file name extension for this format.
     *
     * @return A file name extension (excluding the dot), e.g. "ttl", or null if there is no common file
     *         extension for the format.
     */
    String getDefaultFileExtension();

    /**
     * Checks if the specified file name extension matches the format's default file name extension. The
     * extensions are compared ignoring upper/lower-case differences.
     *
     * @param extension The file extension to compare to the format's default file extension.
     * @return true if the format has a default file name extension and if it matches the specified
     *         extension, false otherwise.
     */
    boolean hasDefaultFileExtension(String extension);

    /**
     * Gets the format's file extensions.
     *
     * @return An unmodifiable list of file extension strings, e.g. "ttl".
     */
    List<String> getFileExtensions();

    /**
     * Checks if one of the format's file extensions is equal to the specified file extension. The file
     * extensions are compared ignoring upper/lower-case differences.
     *
     * @param extension The file extension to compare to the format's file extensions.
     * @return true if the specified file extension is equal to one of the format's file extensions.
     */
    boolean hasFileExtension(String extension);

    /**
     * Gets the (default) charset for this format.
     *
     * @return the (default) charset for this format, or null if this format does not have a default charset.
     */
    Charset getCharset();

    /**
     * Checks if the format has a (default) charset.
     *
     * @return true if the format has a (default) charset.
     */
    boolean hasCharset();

    /**
     * Tells whether the format supports the encoding of namespace/prefix information.
     *
     * @return true if namespace/prefix information can be encoded.
     */
    boolean supportsNamespaces();

    /**
     * Tells whether the format supports the encoding of contexts/named graphs.
     *
     * @return true if contexts/named graphs can be encoded.
     */
    boolean supportsContexts();

    /**
     * Tells whether the format supports the encoding of RDF-star triples natively.
     *
     * @return true if RDF-star triples can be encoded natively.
     */
    boolean supportsRDFStar();
}
+ */ + boolean supportsRDFStar(); +} diff --git a/src/main/java/fr/inria/corese/core/next/api/base/parser/RDFFormats.java b/src/main/java/fr/inria/corese/core/next/api/base/parser/RDFFormats.java new file mode 100644 index 000000000..d8d99d8b0 --- /dev/null +++ b/src/main/java/fr/inria/corese/core/next/api/base/parser/RDFFormats.java @@ -0,0 +1,255 @@ +package fr.inria.corese.core.next.api.base.parser; + +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.Collections; +import java.util.List; + +public enum RDFFormats implements RDFFormat { + + TURTLE("Turtle", + List.of("text/turtle"), + List.of("ttl"), + true, + false, + false), + N3("N3", + List.of("text/n3"), + List.of("n3"), + true, + false, + false), + RDF_XML("RDF/XML", + List.of("application/rdf+xml"), + List.of("rdf", "xml"), + true, + false, + false), + JSON_LD("JSON-LD", + List.of("application/ld+json"), + List.of("jsonld", "json"), + true, + true, + false), + N_TRIPLES("N-Triples", + List.of("application/n-triples"), + List.of("nt"), + false, + false, + false), + TRIG("TriG", + List.of("application/trig"), + List.of("trig"), + true, + true, + false), + NQUADS("N-Quads", + List.of("application/n-quads"), + List.of("nq"), + true, + true, + false); + + private static final boolean DEFAULT_SUPPORTS_NAMESPACES = true; + private static final boolean DEFAULT_SUPPORTS_CONTEXTS = true; + private static final boolean DEFAULT_SUPPORTS_RDF_STAR = false; + + /** + * The file format human-readable name. + */ + private final String name; + + /** + * The file format's MIME types. The first item in the list is interpreted as the default MIME type for the format. + */ + private final List mimeTypes; + + /** + * The file format's (default) charset. + */ + private final Charset charset; + + /** + * The file format's file extensions. The first item in the list is interpreted as the default file extension for + * the format. 
+ */ + private final List fileExtensions; + + /** + * Flag indicating whether the RDFFormat can encode namespace information. + */ + private final boolean supportsNamespaces; + + /** + * Flag indicating whether the RDFFormat can encode context information (ex: Graphs or quads). + */ + private final boolean supportsContexts; + + /** + * Flag indicating whether the RDFFormat can encode RDF-star triples natively. + */ + private final boolean supportsRDFStar; + + RDFFormats(String name, + List mimeTypes, + Charset charset, + List fileExtensions, + boolean supportsNamespaces, + boolean supportsContexts, + boolean supportsRDFStar) { + this.name = name; + this.mimeTypes = mimeTypes; + this.charset = charset; + this.fileExtensions = fileExtensions; + this.supportsNamespaces = supportsNamespaces; + this.supportsContexts = supportsContexts; + this.supportsRDFStar = supportsRDFStar; + } + + RDFFormats(String name, + List mimeTypes, + Charset charset, + List fileExtensions) { + this(name, mimeTypes, charset, fileExtensions, DEFAULT_SUPPORTS_NAMESPACES, DEFAULT_SUPPORTS_CONTEXTS, DEFAULT_SUPPORTS_RDF_STAR); + } + + RDFFormats(String name, + List mimeTypes, + List fileExtensions) { + this(name, mimeTypes, StandardCharsets.UTF_8, fileExtensions, DEFAULT_SUPPORTS_NAMESPACES, DEFAULT_SUPPORTS_CONTEXTS, DEFAULT_SUPPORTS_RDF_STAR); + } + + RDFFormats(String name, + List mimeTypes, + List fileExtensions, + boolean supportsNamespaces, + boolean supportsContexts, + boolean supportsRDFStar) { + this(name, mimeTypes, StandardCharsets.UTF_8, fileExtensions, supportsNamespaces, supportsContexts, supportsRDFStar); + } + + @Override + public String getName() { + return name; + } + + @Override + public String getDefaultMIMEType() { + return mimeTypes.get(0); + } + + @Override + public boolean hasDefaultMIMEType(String mimeType) { + return getDefaultMIMEType().equalsIgnoreCase(mimeType); + } + + @Override + public List getMIMETypes() { + return Collections.unmodifiableList(mimeTypes); + } + + 
@Override + public boolean hasMIMEType(String mimeType) { + if (mimeType == null) { + return false; + } + String type = mimeType; + if (mimeType.indexOf(';') > 0) { + type = mimeType.substring(0, mimeType.indexOf(';')); + } + for (String mt : this.mimeTypes) { + if (mt.equalsIgnoreCase(mimeType)) { + return true; + } + if (mimeType != type && mt.equalsIgnoreCase(type)) { + return true; + } + } + + return false; + } + + @Override + public String getDefaultFileExtension() { + if (fileExtensions.isEmpty()) { + return null; + } else { + return fileExtensions.get(0); + } + } + + @Override + public boolean hasDefaultFileExtension(String extension) { + String ext = getDefaultFileExtension(); + return ext != null && ext.equalsIgnoreCase(extension); + } + + @Override + public List getFileExtensions() { + return Collections.unmodifiableList(fileExtensions); + } + + @Override + public boolean hasFileExtension(String extension) { + for (String ext : fileExtensions) { + if (ext.equalsIgnoreCase(extension)) { + return true; + } + } + + return false; + } + + @Override + public Charset getCharset() { + return charset; + } + + @Override + public boolean hasCharset() { + return charset != null; + } + + @Override + public boolean supportsNamespaces() { + return supportsNamespaces; + } + + @Override + public boolean supportsContexts() { + return supportsContexts; + } + + @Override + public boolean supportsRDFStar() { + return supportsRDFStar; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(64); + + sb.append(name); + + sb.append(" (mimeTypes="); + for (int i = 0; i < mimeTypes.size(); i++) { + if (i > 0) { + sb.append(", "); + } + sb.append(mimeTypes.get(i)); + } + + sb.append("; ext="); + for (int i = 0; i < fileExtensions.size(); i++) { + if (i > 0) { + sb.append(", "); + } + sb.append(fileExtensions.get(i)); + } + + sb.append(")"); + + return sb.toString(); + } + +} diff --git 
a/src/main/java/fr/inria/corese/core/next/api/base/parser/RDFParser.java b/src/main/java/fr/inria/corese/core/next/api/base/parser/RDFParser.java new file mode 100644 index 000000000..cf8da275c --- /dev/null +++ b/src/main/java/fr/inria/corese/core/next/api/base/parser/RDFParser.java @@ -0,0 +1,42 @@ +package fr.inria.corese.core.next.api.base.parser; + +import java.io.InputStream; +import java.io.Reader; + +public interface RDFParser { + + /** + * Gets the RDF format that this parser can parse. + */ + RDFFormat getRDFFormat(); + + /** + * Parses RDF data from the specified InputStream or Reader and adds it to the model. + * + * @param in The InputStream to read RDF data from. + */ + void parse(InputStream in); + + /** + * Parses RDF data from the specified InputStream or Reader and adds it to the model. + * + * @param in The InputStream to read RDF data from. + * @param baseURI The base URI for resolving relative URIs in the RDF data. + */ + void parse(InputStream in, String baseURI); + + /** + * Parses RDF data from the specified InputStream or Reader and adds it to the model. + * + * @param reader The Reader to read RDF data from. + */ + void parse(Reader reader); + + /** + * Parses RDF data from the specified InputStream or Reader and adds it to the model. + * + * @param reader The Reader to read RDF data from. + * @param baseURI The base URI for resolving relative URIs in the RDF data. 
/**
 * Unchecked exception reporting that RDF data could not be parsed.
 */
public class ParsingErrorException extends RuntimeException {

    private static final long serialVersionUID = -2053549958572141648L;

    /**
     * Wraps the failure that interrupted parsing; the message is derived from the cause.
     *
     * @param cause the underlying failure
     */
    public ParsingErrorException(Throwable cause) {
        super(cause);
    }

    /**
     * Reports a parsing problem described by a message only.
     *
     * @param message description of the problem
     */
    public ParsingErrorException(String message) {
        super(message);
    }

    /**
     * Reports a parsing problem together with the failure that triggered it.
     *
     * @param message description of the problem
     * @param cause   the underlying failure
     */
    public ParsingErrorException(String message, Throwable cause) {
        super(message, cause);
    }
}
/**
 * Checked exception reporting that a file format is not supported.
 */
public class UnsupportedFileFormatException extends Exception {

    private static final long serialVersionUID = 7963163989802143570L;

    /**
     * Wraps the failure that revealed the unsupported format; the message is derived from the cause.
     *
     * @param cause the underlying failure
     */
    public UnsupportedFileFormatException(Throwable cause) {
        super(cause);
    }

    /**
     * Reports an unsupported format described by a message only.
     *
     * @param message description of the problem
     */
    public UnsupportedFileFormatException(String message) {
        super(message);
    }

    /**
     * Reports an unsupported format together with the failure that triggered the report.
     *
     * @param message description of the problem
     * @param cause   the underlying failure
     */
    public UnsupportedFileFormatException(String message, Throwable cause) {
        super(message, cause);
    }

}
ANTLRTurtleParser(Model model, ValueFactory factory) { + this.model = model; + this.factory = factory; + } + + @Override + public RDFFormat getRDFFormat() { + return format; + } + + @Override + public void parse(InputStream in) { + parse(new InputStreamReader(in), null); + } + + @Override + public void parse(InputStream in, String baseURI) { + parse(new InputStreamReader(in), baseURI); + } + + @Override + public void parse(Reader reader) { + parse(reader, null); + } + + /** + * We are using ANTLR4 lexer and parser + * @param reader The Reader to read RDF data from. + * @param baseURI The base URI for resolving relative URIs in the RDF data. + */ + @Override + public void parse(Reader reader, String baseURI) { + + try { + CharStream charStream = CharStreams.fromReader(reader); + TurtleLexer lexer = new TurtleLexer(charStream); + CommonTokenStream tokens = new CommonTokenStream(lexer); + TurtleParser parser = new TurtleParser(tokens); + ParseTreeWalker walker = new ParseTreeWalker(); + ParseTree tree = parser.turtleDoc(); + TurtleListenerImpl listener = new TurtleListenerImpl(model, baseURI, factory); + + walker.walk((ParseTreeListener) listener, tree); + + } catch (IOException e) { + throw new RuntimeException("Failed to parse Turtle RDF", e); + } + } +} \ No newline at end of file diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/parser/turtle/TurtleListenerImpl.java b/src/main/java/fr/inria/corese/core/next/impl/io/parser/turtle/TurtleListenerImpl.java new file mode 100644 index 000000000..ae3f28f95 --- /dev/null +++ b/src/main/java/fr/inria/corese/core/next/impl/io/parser/turtle/TurtleListenerImpl.java @@ -0,0 +1,171 @@ +package fr.inria.corese.core.next.impl.io.parser.turtle; + +import fr.inria.corese.core.next.api.*; +import fr.inria.corese.core.next.impl.common.literal.XSD; +import fr.inria.corese.core.next.impl.common.vocabulary.RDF; +import fr.inria.corese.core.next.impl.parser.antlr.TurtleBaseListener; +import 
fr.inria.corese.core.next.impl.parser.antlr.TurtleParser; +import fr.inria.corese.core.next.impl.temp.CoreseAdaptedValueFactory; +import fr.inria.corese.core.next.impl.temp.CoreseIRI; +import fr.inria.corese.core.next.impl.temp.ModelNamespace; +import fr.inria.corese.core.next.impl.temp.literal.CoreseBNode; +import org.antlr.v4.runtime.ParserRuleContext; +import org.antlr.v4.runtime.tree.ErrorNode; +import org.antlr.v4.runtime.tree.TerminalNode; + +import java.util.HashMap; +import java.util.Map; + +public class TurtleListenerImpl extends TurtleBaseListener { + + private final Model model; + private String baseURI; + private final Map prefixMap = new HashMap<>(); + private final ValueFactory factory; + + private Resource currentSubject; + private IRI currentPredicate; + + public TurtleListenerImpl(Model model, String baseURI, ValueFactory factory) { + this.model = model; + this.baseURI = baseURI != null ? baseURI : ""; + this.factory = factory; + } + + public void exitPrefixID(TurtleParser.PrefixIDContext ctx) { + String prefix = ctx.PNAME_NS().getText(); + String iri = ctx.IRIREF().getText(); + prefix = prefix.substring(0, prefix.length() - 1); + iri = iri.substring(1, iri.length() - 1); + prefixMap.put(prefix, iri); + + Namespace ns = new ModelNamespace(prefix, iri); + model.setNamespace(prefix, iri); + } + + public void exitSparqlBase(TurtleParser.SparqlBaseContext ctx) { + String iri = ctx.IRIREF().getText(); + baseURI = iri.substring(1, iri.length() - 1); + } + + public void enterTriples(TurtleParser.TriplesContext ctx) { + currentSubject = extractSubject(ctx.subject()); + } + + public void enterVerb(TurtleParser.VerbContext ctx) { + currentPredicate = extractVerb(ctx); + } + + public void exitObject_(TurtleParser.Object_Context ctx) { + Value object = extractObject(ctx); + model.add(currentSubject, currentPredicate, object); + } + + private String resolveIRI(String raw) { + if (raw.startsWith("<") && raw.endsWith(">")) { + return raw.substring(1, raw.length() 
- 1); + } else if (raw.equals("a")) { + return RDF.type.getIRI().stringValue(); + } else if (raw.contains(":")) { + // Prefixed name (e.g., ex:predicate) + String[] parts = raw.split(":", 2); + String ns = prefixMap.get(parts[0]); + if (ns != null) { + return ns + parts[1]; + } else { + throw new IllegalArgumentException("Prefix not declared: " + parts[0]); + } + } else { + return baseURI + raw; + } + } + + private String stripQuotes(String text) { + if (text == null || text.length() < 2) return text; + if ((text.startsWith("\"") && text.endsWith("\"")) || + (text.startsWith("'''") && text.endsWith("'''")) || + (text.startsWith("\"\"\"") && text.endsWith("\"\"\""))) { + return text.substring(1, text.length() - 1); + } + return text; + } + + private Literal extractLiteral(TurtleParser.LiteralContext ctx) { + String label; + IRI datatype; + String lang; + + if (ctx.rdfLiteral() != null) { + if (ctx.rdfLiteral().iri() != null) { + datatype = factory.createIRI(resolveIRI(ctx.rdfLiteral().iri().getText())); + label = ctx.rdfLiteral().string().getText(); + return factory.createLiteral(stripQuotes(label), datatype); + } + if (ctx.rdfLiteral().LANGTAG() != null) { + lang = ctx.rdfLiteral().LANGTAG().getText().substring(1); + label = ctx.rdfLiteral().string().getText(); + return factory.createLiteral(stripQuotes(label), lang); + } + label = ctx.rdfLiteral().string().getText(); + return factory.createLiteral(stripQuotes(label)); + } + + if (ctx.BooleanLiteral() != null) { + label = ctx.BooleanLiteral().getText(); + datatype = XSD.BOOLEAN.getIRI(); + return factory.createLiteral(label, datatype); + } + if (ctx.numericLiteral() != null) { + if (ctx.numericLiteral().DECIMAL() != null) { + label = ctx.numericLiteral().DECIMAL().getText(); + datatype = XSD.DECIMAL.getIRI(); + return factory.createLiteral(label, datatype); + } + if (ctx.numericLiteral().DOUBLE() != null) { + label = ctx.numericLiteral().DOUBLE().getText(); + datatype = XSD.DOUBLE.getIRI(); + return 
factory.createLiteral(label, datatype); + } + if (ctx.numericLiteral().INTEGER() != null) { + label = ctx.numericLiteral().INTEGER().getText(); + datatype = XSD.INTEGER.getIRI(); + return factory.createLiteral(label, datatype); + } + } + throw new IllegalArgumentException("Unsupported literal type: " + ctx.getText()); + } + + private Value extractObject(TurtleParser.Object_Context ctx) { + if (ctx.iri() != null) { + return factory.createIRI(resolveIRI(ctx.iri().getText())); + } + if (ctx.BlankNode() != null) { + return factory.createBNode(ctx.BlankNode().getText()); + } + if (ctx.literal() != null) { + return extractLiteral(ctx.literal()); + } + throw new RuntimeException("Unsupported object: " + ctx.getText()); + } + + private Resource extractSubject(TurtleParser.SubjectContext ctx) { + if (ctx.iri() != null) { + return factory.createIRI(resolveIRI(ctx.iri().getText())); + } + if (ctx.BlankNode() != null) { + return factory.createBNode(ctx.BlankNode().getText()); + } + throw new RuntimeException("Unsupported subject: " + ctx.getText()); + } + + private IRI extractPredicate(TurtleParser.PredicateContext ctx) { + return factory.createIRI(resolveIRI(ctx.getText())); + } + + private IRI extractVerb(TurtleParser.VerbContext ctx) { + if (ctx.predicate() != null) { + return extractPredicate(ctx.predicate()); + } + else return factory.createIRI(resolveIRI(ctx.getText())); + } +} \ No newline at end of file diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/parser/turtle/TurtleParserFactory.java b/src/main/java/fr/inria/corese/core/next/impl/io/parser/turtle/TurtleParserFactory.java new file mode 100644 index 000000000..1c9a9de66 --- /dev/null +++ b/src/main/java/fr/inria/corese/core/next/impl/io/parser/turtle/TurtleParserFactory.java @@ -0,0 +1,23 @@ +package fr.inria.corese.core.next.impl.io.parser.turtle; + +import fr.inria.corese.core.next.api.Model; +import fr.inria.corese.core.next.api.ValueFactory; +import 
fr.inria.corese.core.next.api.base.parser.RDFFormat; +import fr.inria.corese.core.next.api.base.parser.RDFFormats; +import fr.inria.corese.core.next.api.base.parser.RDFParser; +import fr.inria.corese.core.next.api.base.parser.RDFParserFactory; +/** RDFParserFactory that creates ANTLRTurtleParser instances and accepts only RDFFormats.TURTLE. */ +public class TurtleParserFactory implements RDFParserFactory { + + public TurtleParserFactory() { + super(); + } + /** Returns a new ANTLRTurtleParser bound to the given model and value factory. Throws IllegalArgumentException when format is not RDFFormats.TURTLE; the comparison is made from the constant so that a null format is rejected the same way instead of raising a NullPointerException. */ + @Override + public RDFParser createRDFParser(RDFFormat format, Model model, ValueFactory factory) { + if (!RDFFormats.TURTLE.equals(format)) { + throw new IllegalArgumentException("Unsupported format : " + format); + } + return new ANTLRTurtleParser(model, factory); + } +} diff --git a/src/test/java/fr/inria/corese/core/next/impl/io/parser/turtle/ANTLRTurtleParserSpec.java b/src/test/java/fr/inria/corese/core/next/impl/io/parser/turtle/ANTLRTurtleParserSpec.java new file mode 100644 index 000000000..6d8f12c83 --- /dev/null +++ b/src/test/java/fr/inria/corese/core/next/impl/io/parser/turtle/ANTLRTurtleParserSpec.java @@ -0,0 +1,33 @@ +package fr.inria.corese.core.next.impl.io.parser.turtle; + +import fr.inria.corese.core.next.api.Model; +import fr.inria.corese.core.next.api.ValueFactory; +import fr.inria.corese.core.next.api.base.parser.RDFParser; +import fr.inria.corese.core.next.impl.temp.CoreseAdaptedValueFactory; +import fr.inria.corese.core.next.impl.temp.CoreseModel; +import org.junit.jupiter.api.Test; + +import java.io.StringReader; + +import static org.junit.jupiter.api.Assertions.assertEquals; +/** Tests ANTLRTurtleParser through the RDFParser interface. */ +public class ANTLRTurtleParserSpec { + /** Parses the Turtle text from a StringReader into a fresh CoreseModel, passing baseURI to the parser, and returns that model. */ private Model parseFromString(String turtleData, String baseURI) throws Exception { + Model model = new CoreseModel(); + ValueFactory factory = new CoreseAdaptedValueFactory(); + RDFParser parser = new ANTLRTurtleParser(model, factory); + parser.parse(new StringReader(turtleData), baseURI); + return model; + } + /** A document with one prefix declaration and one triple must produce one statement and one namespace. */ + @Test + public void testParseWithPrefixAndTriple() throws Exception { + String turtle = " @prefix ex: . 
" + + "ex:Alice ex:knows ex:Bob ."; + + Model model = parseFromString(turtle, null); + assertEquals(1, model.size()); + assertEquals(1, model.getNamespaces().size()); + } + +} diff --git a/src/test/java/fr/inria/corese/core/next/impl/io/parser/turtle/TurtleListenerImplSpec.java b/src/test/java/fr/inria/corese/core/next/impl/io/parser/turtle/TurtleListenerImplSpec.java new file mode 100644 index 000000000..3f7326452 --- /dev/null +++ b/src/test/java/fr/inria/corese/core/next/impl/io/parser/turtle/TurtleListenerImplSpec.java @@ -0,0 +1,130 @@ +package fr.inria.corese.core.next.impl.io.parser.turtle; + +import fr.inria.corese.core.next.api.ValueFactory; +import fr.inria.corese.core.next.impl.parser.antlr.TurtleLexer; +import fr.inria.corese.core.next.impl.parser.antlr.TurtleParser; +import fr.inria.corese.core.next.api.Model; +import fr.inria.corese.core.next.impl.temp.CoreseAdaptedValueFactory; +import fr.inria.corese.core.next.impl.temp.CoreseModel; +import org.antlr.v4.runtime.CharStream; +import org.antlr.v4.runtime.CharStreams; +import org.antlr.v4.runtime.CommonTokenStream; +import org.antlr.v4.runtime.tree.ParseTree; +import org.antlr.v4.runtime.tree.ParseTreeListener; +import org.antlr.v4.runtime.tree.ParseTreeWalker; +import org.junit.jupiter.api.Test; + +import java.io.StringReader; + +import static org.junit.jupiter.api.Assertions.*; +/** Tests TurtleListenerImpl by driving the generated TurtleLexer and TurtleParser directly. */ +public class TurtleListenerImplSpec { + /** Lexes and parses the Turtle text from the turtleDoc rule, walks the parse tree with a TurtleListenerImpl writing into a fresh CoreseModel, and returns that model. The listener's second constructor argument is null (NOTE(review): presumably the base IRI; confirm). Despite the name, nothing is printed: the printing code is commented out. */ private Model parseAndPrintModel(String turtleData) throws Exception { + ValueFactory factory = new CoreseAdaptedValueFactory(); + + CharStream input = CharStreams.fromReader(new StringReader(turtleData)); + TurtleLexer lexer = new TurtleLexer(input); + CommonTokenStream tokens = new CommonTokenStream(lexer); + TurtleParser parser = new TurtleParser(tokens); + ParseTreeWalker walker = new ParseTreeWalker(); + ParseTree tree = parser.turtleDoc(); + + Model model = new CoreseModel(); + TurtleListenerImpl listener = new TurtleListenerImpl(model, null, factory); + 
walker.walk((ParseTreeListener) listener, tree); + + + /* + model.forEach(stmt -> { + System.out.println(stmt.getSubject().stringValue() + " " + + stmt.getPredicate().stringValue() + " " + + stmt.getObject().stringValue()); + }); + + */ + + + return model; + } + /** One prefix declaration must register exactly one namespace. Assertions use the JUnit order (expected, actual). */ + @Test + public void testNamespace() throws Exception { + String turtleData = " @prefix ex: . " + + "ex:subject ex:predicate 1 . "; + + Model model = parseAndPrintModel(turtleData); + assertEquals(1, model.getNamespaces().size()); + } + /** A triple whose object is a datatyped literal must give one statement; the two prefix declarations give two namespaces. */ + @Test + public void testTypedLiteral() throws Exception { + String turtleData = "@prefix ex: .\n" + + "@prefix xsd: .\n" + + "ex:subject ex:age \"27\"^^xsd:integer ."; + + Model model = parseAndPrintModel(turtleData); + assertEquals(1, model.size()); + assertEquals(2, model.getNamespaces().size()); + + } + /** An object list (',') combined with a predicate list (';') on one subject must expand to three statements. */ + @Test + public void testMultipleObjects() throws Exception { + String turtleData = "@prefix ex: .\n" + + "ex:subject ex:knows ex:Alice , ex:Bob ; ex:likes ex:Pizza ."; + + Model model = parseAndPrintModel(turtleData); + assertEquals(3, model.size()); + assertEquals(1, model.getNamespaces().size()); + + } + /** A triple written with the 'a' verb must be counted together with the three triples of the second sentence: four statements in total. */ + @Test + public void testRDFtype() throws Exception { + String turtleData = "@prefix ex: .\n" + + "ex:Alice a ex:Person .\n" + + "ex:subject ex:knows ex:Alice , ex:Bob ; ex:likes ex:Pizza ."; + + Model model = parseAndPrintModel(turtleData); + assertEquals(4, model.size()); + assertEquals(1, model.getNamespaces().size()); + } + /** A document with a base declaration and two prefix declarations must yield two statements and two namespaces. */ + @Test + public void testBaseIRI() throws Exception { + String turtleData = "@base .\n" + + "@prefix : .\n" + + "@prefix rdf: . 
\n" + + "\n" + + " rdf:type rdf:Property .\n" + + ":phone rdf:type rdf:Property ."; + + Model model = parseAndPrintModel(turtleData); + assertEquals(2, model.size()); + assertEquals(2, model.getNamespaces().size()); + } + /** Every object of the parsed model must be a literal whose string value parses as an int; the document holds one statement and two namespaces. */ + @Test + public void testTypedIntegerLiteral() throws Exception { + String turtleData = + "@prefix : .\n" + + "@prefix xsd: .\n" + + ":John :age \"42\"^^xsd:integer ."; + + Model model = parseAndPrintModel(turtleData); + model.objects().forEach(obj -> { + assertTrue(obj.isLiteral(), "Expected object to be a literal"); + // test if we can parse the literal to int. Should be ok + try { + int value = Integer.parseInt(obj.stringValue()); + System.out.println("Parsed integer: " + value); + } catch (NumberFormatException e) { + fail("Literal is not a valid integer: " + obj.stringValue()); + } + }); + + + assertEquals(1, model.size()); + assertEquals(2, model.getNamespaces().size()); + } +} \ No newline at end of file