From 3ca1d5d6be9245ab846d8c459a64727796317bdd Mon Sep 17 00:00:00 2001 From: "AD\\aabdoun" Date: Fri, 1 Aug 2025 16:50:30 +0200 Subject: [PATCH 1/6] #187 Implement RDF canonicalization as serializer --- .../core/next/api/base/io/RDFFormat.java | 7 + .../DefaultSerializerFactory.java | 15 ++ .../canonical/CanonicalOption.java | 77 ++++++ .../canonical/CanonicalSerializer.java | 253 ++++++++++++++++++ 4 files changed, 352 insertions(+) create mode 100644 src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOption.java create mode 100644 src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializer.java diff --git a/src/main/java/fr/inria/corese/core/next/api/base/io/RDFFormat.java b/src/main/java/fr/inria/corese/core/next/api/base/io/RDFFormat.java index bcd84f5a6..d794271f4 100644 --- a/src/main/java/fr/inria/corese/core/next/api/base/io/RDFFormat.java +++ b/src/main/java/fr/inria/corese/core/next/api/base/io/RDFFormat.java @@ -59,6 +59,13 @@ public class RDFFormat extends FileFormat { true, true); + public static final RDFFormat CANONICAL_RDF = new RDFFormat( + "Canonical RDF", + List.of("crd", "nq"), + List.of("application/n-quads-canonical", "application/n-quads"), + false, + true); + /** * Constructs a new RDF format. 
* diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactory.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactory.java index ed7e35d65..5fa02816c 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactory.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactory.java @@ -1,10 +1,13 @@ package fr.inria.corese.core.next.impl.io.serialization; import fr.inria.corese.core.next.api.Model; +import fr.inria.corese.core.next.api.ValueFactory; import fr.inria.corese.core.next.api.base.io.RDFFormat; import fr.inria.corese.core.next.api.io.serialization.RDFSerializer; import fr.inria.corese.core.next.api.io.serialization.SerializationOption; import fr.inria.corese.core.next.api.io.serialization.SerializerFactory; +import fr.inria.corese.core.next.impl.io.serialization.canonical.CanonicalOption; +import fr.inria.corese.core.next.impl.io.serialization.canonical.CanonicalSerializer; import fr.inria.corese.core.next.impl.io.serialization.nquads.NQuadsOption; import fr.inria.corese.core.next.impl.io.serialization.nquads.NQuadsSerializer; import fr.inria.corese.core.next.impl.io.serialization.ntriples.NTriplesOption; @@ -15,6 +18,7 @@ import fr.inria.corese.core.next.impl.io.serialization.trig.TriGSerializer; import fr.inria.corese.core.next.impl.io.serialization.turtle.TurtleOption; import fr.inria.corese.core.next.impl.io.serialization.turtle.TurtleSerializer; +import fr.inria.corese.core.next.impl.temp.CoreseAdaptedValueFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -40,6 +44,7 @@ public class DefaultSerializerFactory implements SerializerFactory { private static final Logger logger = LoggerFactory.getLogger(DefaultSerializerFactory.class); private final Map> registry; + ValueFactory coreseValueFactory = new CoreseAdaptedValueFactory(); /** * Constructs a {@code DefaultSerializerFactory} and 
populates its registry @@ -101,6 +106,16 @@ public DefaultSerializerFactory() { } }); + tempRegistry.put(RDFFormat.CANONICAL_RDF, (model, genericConfig) -> { + if (genericConfig instanceof CanonicalOption specificConfig) { + return new CanonicalSerializer(model, specificConfig, coreseValueFactory); + } else { + logger.warn("Provided config for CANONICAL_RDF is not CanonicalOption (was {}). Using default CanonicalOption.", + genericConfig.getClass().getSimpleName()); + return new CanonicalSerializer(model, CanonicalOption.defaultConfig(), coreseValueFactory); + } + }); + this.registry = Collections.unmodifiableMap(tempRegistry); } diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOption.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOption.java new file mode 100644 index 000000000..96d5dcbcb --- /dev/null +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOption.java @@ -0,0 +1,77 @@ +package fr.inria.corese.core.next.impl.io.serialization.canonical; + +import fr.inria.corese.core.next.impl.io.serialization.option.AbstractSerializerOption; + +/** + * Configuration for Canonical RDF serialization format. + * This class extends {@link AbstractSerializerOption} and provides specific defaults + * and options tailored for canonicalization. + * + *

It includes options relevant to blank node canonicalization, such as whether to + * include comments (which might interfere with strict canonicalization) or + * to ensure deterministic blank node labeling.

+ * + *

Use the {@link Builder} class to create instances of {@code CanonicalOption}. + * A predefined default configuration is available via {@link #defaultConfig()}.

+ */ +public class CanonicalOption extends AbstractSerializerOption { + + /** + * Protected constructor to be used by the {@link Builder}. + * + * @param builder The builder instance containing the desired configuration values. + */ + protected CanonicalOption(Builder builder) { + super(builder); + + } + + /** + * Public Builder for {@link CanonicalOption}. + * Provides a fluent API for constructing {@code CanonicalOption} instances with default values + * specific to the Canonical RDF format. + */ + public static class Builder extends AbstractSerializerOption.AbstractBuilder { + /** + * Default constructor initializes all options with their default values for Canonical RDF. + */ + public Builder() { + strictMode(true); + validateURIs(true); + escapeUnicode(true); + trailingDot(true); + includeContext(false); + } + + /** + * Builds and returns a new {@link CanonicalOption} instance with the current builder settings. + * + * @return A new {@code CanonicalOption} instance. + */ + @Override + public CanonicalOption build() { + return new CanonicalOption(this); + } + } + + /** + * Returns a default configuration suitable for Canonical RDF serialization. + * This provides a convenient way to get a standard Canonical RDF configuration without + * manually building it. + * + * @return A {@code CanonicalOption} instance with default settings. + */ + public static CanonicalOption defaultConfig() { + return new Builder().build(); + } + + /** + * Returns a new builder instance for {@link CanonicalOption}. + * This allows for fluent construction of custom Canonical RDF configurations. + * + * @return A new {@code Builder} instance. 
+ */ + public static CanonicalOption.Builder builder() { + return new CanonicalOption.Builder(); + } +} diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializer.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializer.java new file mode 100644 index 000000000..0fdecc197 --- /dev/null +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializer.java @@ -0,0 +1,253 @@ +package fr.inria.corese.core.next.impl.io.serialization.canonical; + +import fr.inria.corese.core.next.api.*; +import fr.inria.corese.core.next.impl.exception.SerializationException; +import fr.inria.corese.core.next.impl.io.serialization.base.AbstractLineBasedSerializer; +import fr.inria.corese.core.next.impl.io.serialization.util.SerializationConstants; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.Writer; +import java.util.*; +import java.util.stream.Collectors; + +/** + * Serializes a Corese {@link Model} into a canonical RDF format. + * This serializer ensures a deterministic output by re-labeling blank nodes + * and sorting all statements. The output format is similar to N-Quads, + * but with canonical blank node identifiers and a guaranteed order. + * + *

This implementation provides a simplified blank node canonicalization + * based on lexicographical sorting of blank node fingerprints, followed by + * re-labeling and sorting of all triples. It extends {@link AbstractLineBasedSerializer} + * to reuse common writing utilities but overrides the main {@code write} method + * to implement the canonicalization logic.

+ */ +public class CanonicalSerializer extends AbstractLineBasedSerializer { + + private static final Logger logger = LoggerFactory.getLogger(CanonicalSerializer.class); + private final ValueFactory valueFactory; + + /** + * Constructs a new {@code CanonicalSerializer} instance with the specified model and default configuration. + * The default configuration is obtained from {@link CanonicalOption#defaultConfig()}. + * + * @param model the {@link Model} to be serialized. Must not be null. + * @param valueFactory the {@link ValueFactory} to use for creating RDF elements. Must not be null. + * @throws NullPointerException if the provided model or valueFactory is null. + */ + public CanonicalSerializer(Model model, ValueFactory valueFactory) { + this(model, CanonicalOption.defaultConfig(), valueFactory); + } + + /** + * Constructs a new {@code CanonicalSerializer} instance with the specified model and custom configuration. + * + * @param model the {@link Model} to be serialized. Must not be null. + * @param config the {@link CanonicalOption} to use for serialization. Must not be null. + * @param valueFactory the {@link ValueFactory} to use for creating RDF elements. Must not be null. + * @throws NullPointerException if the provided model, config, or valueFactory is null. + */ + public CanonicalSerializer(Model model, CanonicalOption config, ValueFactory valueFactory) { + super(model, config); + this.valueFactory = Objects.requireNonNull(valueFactory, "ValueFactory cannot be null"); + Objects.requireNonNull(config, "CanonicalOption cannot be null"); + } + + /** + * Returns the format name for error messages and logging. + * + * @return "Canonical RDF" + */ + @Override + protected String getFormatName() { + return "Canonical RDF"; + } + + /** + * Writes the context (named graph) part of a statement. + * For Canonical RDF, contexts are included if present, following N-Quads style. + * + * @param writer the {@link Writer} to which the context will be written. 
+ * @param stmt the {@link Statement} whose context should be written. + * @throws IOException if an I/O error occurs. + */ + @Override + protected void writeContext(Writer writer, Statement stmt) throws IOException { + Resource context = stmt.getContext(); + if (context != null) { + writer.write(SerializationConstants.SPACE); + writeValue(writer, context); + } + } + + /** + * Writes the model to the given writer in a canonical form. + * This involves: + * 1. Collecting all statements. + * 2. Identifying and re-labeling blank nodes deterministically. + * 3. Creating a new set of statements with canonical blank node IDs. + * 4. Sorting these canonical statements. + * 5. Writing each sorted statement line by line. + * + * @param writer the {@link Writer} to which the output will be written. + * @throws SerializationException if an I/O error occurs during writing or if invalid data is encountered. + */ + @Override + public void write(Writer writer) throws SerializationException { + try (BufferedWriter bufferedWriter = new BufferedWriter(writer)) { + List originalStatements = new ArrayList<>(); + model.forEach(originalStatements::add); + + Set blankNodes = new HashSet<>(); + for (Statement stmt : originalStatements) { + if (stmt.getSubject().isBNode()) { + blankNodes.add((BNode) stmt.getSubject()); + } + if (stmt.getObject().isBNode()) { + blankNodes.add((BNode) stmt.getObject()); + } + if (stmt.getContext() != null && stmt.getContext().isBNode()) { + blankNodes.add((BNode) stmt.getContext()); + } + } + + Map canonicalBNodeMap = createCanonicalBNodeMap(blankNodes, originalStatements); + + List canonicalStatements = new ArrayList<>(); + for (Statement originalStmt : originalStatements) { + Resource subject = (Resource) mapValue(originalStmt.getSubject(), canonicalBNodeMap); + IRI predicate = originalStmt.getPredicate(); + Value object = mapValue(originalStmt.getObject(), canonicalBNodeMap); + Resource context = (Resource) mapValue(originalStmt.getContext(), 
canonicalBNodeMap); + + canonicalStatements.add(valueFactory.createStatement(subject, predicate, object, context)); + } + + Collections.sort(canonicalStatements, new CanonicalStatementComparator(canonicalBNodeMap)); + + for (Statement stmt : canonicalStatements) { + writeCanonicalStatement(bufferedWriter, stmt); + } + + } catch (IOException e) { + throw new SerializationException(getFormatName() + " serialization failed", getFormatName(), e); + } catch (IllegalArgumentException e) { + throw new SerializationException("Invalid " + getFormatName() + " data: " + e.getMessage(), getFormatName(), e); + } + } + + /** + * Maps a value (Resource, Literal, IRI) to its canonical form if it's a blank node. + * + * @param value the original value. + * @param canonicalBNodeMap the map from original blank nodes to canonical blank nodes. + * @return the canonical value. + */ + private Value mapValue(Value value, Map canonicalBNodeMap) { + if (value != null && value.isBNode()) { + return canonicalBNodeMap.getOrDefault((BNode) value, (BNode) value); + } + return value; + } + + /** + * Creates a deterministic mapping from original blank node {@link BNode}s + * to new, canonical blank node {@link BNode}s. + * This simplified approach sorts blank nodes based on a string representation + * of their associated triples. + * + * @param blankNodes the set of all blank nodes in the model. + * @param statements the list of all statements in the model. + * @return a map from original blank node {@link BNode} to canonical blank node {@link BNode}. 
+ */ + private Map createCanonicalBNodeMap(Set blankNodes, List statements) { + Map bNodeFingerprints = new HashMap<>(); + for (BNode bNode : blankNodes) { + List relatedTriples = statements.stream() + .filter(stmt -> stmt.getSubject().equals(bNode) || stmt.getObject().equals(bNode) || (stmt.getContext() != null && stmt.getContext().equals(bNode))) + .map(Statement::toString) + .sorted() + .collect(Collectors.toList()); + bNodeFingerprints.put(bNode, String.join("|", relatedTriples)); + } + + List sortedBNodes = new ArrayList<>(blankNodes); + sortedBNodes.sort(Comparator.comparing(bNodeFingerprints::get)); + + Map canonicalBNodeMap = new HashMap<>(); + int i = 0; + for (BNode bNode : sortedBNodes) { + canonicalBNodeMap.put(bNode, valueFactory.createBNode("b" + i)); + i++; + } + return canonicalBNodeMap; + } + + /** + * Writes a single canonical {@link Statement} to the writer. + * This method is similar to the private `writeStatement` in the superclass, + * but ensures it uses the canonicalized values. + * + * @param writer the {@link Writer} to which the statement will be written. + * @param stmt the {@link Statement} to write (already canonicalized). + * @throws IOException if an I/O error occurs. + */ + private void writeCanonicalStatement(Writer writer, Statement stmt) throws IOException { + writeValue(writer, stmt.getSubject()); + writer.write(SerializationConstants.SPACE); + writeValue(writer, stmt.getPredicate()); + writer.write(SerializationConstants.SPACE); + writeValue(writer, stmt.getObject()); + + writeContext(writer, stmt); + + if (config.trailingDot()) { + writer.write(SerializationConstants.SPACE); + writer.write(SerializationConstants.POINT); + } + + writer.write(config.getLineEnding()); + } + + /** + * A custom comparator for {@link Statement}s to ensure canonical ordering. + * This comparator sorts statements based on subject, then predicate, then object, then context. + * Blank nodes are compared using their canonical labels. 
+ */ + private class CanonicalStatementComparator implements Comparator { + private final Map canonicalBNodeMap; + + public CanonicalStatementComparator(Map canonicalBNodeMap) { + this.canonicalBNodeMap = canonicalBNodeMap; + } + + @Override + public int compare(Statement s1, Statement s2) { + int cmp = compareValues(s1.getSubject(), s2.getSubject()); + if (cmp != 0) return cmp; + + cmp = compareValues(s1.getPredicate(), s2.getPredicate()); + if (cmp != 0) return cmp; + + cmp = compareValues(s1.getObject(), s2.getObject()); + if (cmp != 0) return cmp; + + return compareValues(s1.getContext(), s2.getContext()); + } + + private int compareValues(Value v1, Value v2) { + if (v1 == null && v2 == null) return 0; + if (v1 == null) return -1; + if (v2 == null) return 1; + + Value cV1 = v1.isBNode() ? canonicalBNodeMap.getOrDefault((BNode) v1, (BNode) v1) : v1; + Value cV2 = v2.isBNode() ? canonicalBNodeMap.getOrDefault((BNode) v2, (BNode) v2) : v2; + + + return cV1.stringValue().compareTo(cV2.stringValue()); + } + } +} From 38409a77c75efec833ebb7d097ce7f85c41b5a09 Mon Sep 17 00:00:00 2001 From: "AD\\aabdoun" Date: Tue, 5 Aug 2025 11:00:01 +0200 Subject: [PATCH 2/6] test unitaire de l implement canonicalization as serializer --- .../DefaultSerializerFactoryTest.java | 13 + .../canonical/CanonicalOptionTest.java | 60 ++++ .../canonical/CanonicalSerializerTest.java | 330 ++++++++++++++++++ 3 files changed, 403 insertions(+) create mode 100644 src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOptionTest.java create mode 100644 src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializerTest.java diff --git a/src/test/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactoryTest.java b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactoryTest.java index cbc605952..f1b354a3d 100644 --- 
a/src/test/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactoryTest.java +++ b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactoryTest.java @@ -4,6 +4,7 @@ import fr.inria.corese.core.next.api.base.io.RDFFormat; import fr.inria.corese.core.next.api.io.serialization.RDFSerializer; import fr.inria.corese.core.next.api.io.serialization.SerializationOption; +import fr.inria.corese.core.next.impl.io.serialization.canonical.CanonicalSerializer; import fr.inria.corese.core.next.impl.io.serialization.nquads.NQuadsSerializer; import fr.inria.corese.core.next.impl.io.serialization.ntriples.NTriplesSerializer; import fr.inria.corese.core.next.impl.io.serialization.rdfxml.XmlSerializer; @@ -98,6 +99,18 @@ void createSerializer_shouldReturnXmlSerializer_forRdfXmlFormat() { } } + @Test + @DisplayName("createSerializer should return CanonicalSerializer for CANONICAL_RDF format") + void createSerializer_shouldReturnCanonicalSerializer_forCanonicalRdfFormat() { + try (MockedConstruction mockedConstruction = mockConstruction(CanonicalSerializer.class)) { + RDFSerializer serializer = factory.createSerializer(RDFFormat.CANONICAL_RDF, mockModel, mockConfig); + + assertNotNull(serializer); + assertTrue(serializer instanceof CanonicalSerializer); + assertEquals(1, mockedConstruction.constructed().size(), "CanonicalSerializer constructor should be called once"); + } + } + @Test @DisplayName("createSerializer should throw NullPointerException for a null format") void createSerializer_shouldThrowNPE_forNullFormat() { diff --git a/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOptionTest.java b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOptionTest.java new file mode 100644 index 000000000..7cff3baf4 --- /dev/null +++ b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOptionTest.java @@ -0,0 +1,60 @@ +package 
fr.inria.corese.core.next.impl.io.serialization.canonical; + +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Unit tests for the {@link CanonicalOption} class. + * This class verifies the default configuration and the builder functionality + * for the Canonical RDF serialization options. + */ +class CanonicalOptionTest { + + @Test + @DisplayName("defaultConfig should return an instance with expected default values") + void defaultConfig_shouldReturnExpectedValues() { + CanonicalOption config = CanonicalOption.defaultConfig(); + + assertNotNull(config, "Default config should not be null"); + assertTrue(config.isStrictMode(), "Default strictMode should be true for canonicalization"); + assertTrue(config.validateURIs(), "Default validateURIs should be true for canonicalization"); + assertTrue(config.escapeUnicode(), "Default escapeUnicode should be true for canonicalization"); + assertTrue(config.trailingDot(), "Default trailingDot should be true for canonicalization"); + assertFalse(config.includeContext(), "Default includeContext should be false for canonicalization (N-Triples like)"); + } + + @Test + @DisplayName("builder should allow setting custom options") + void builder_shouldAllowCustomOptions() { + CanonicalOption customConfig = CanonicalOption.builder() + .strictMode(false) + .validateURIs(false) + .escapeUnicode(false) + .trailingDot(false) + .includeContext(true) + .build(); + + assertNotNull(customConfig, "Custom config should not be null"); + assertFalse(customConfig.isStrictMode(), "Custom strictMode should be false"); + assertFalse(customConfig.validateURIs(), "Custom validateURIs should be false"); + assertFalse(customConfig.escapeUnicode(), "Custom escapeUnicode should be false"); + assertFalse(customConfig.trailingDot(), "Custom trailingDot should be false"); + assertTrue(customConfig.includeContext(), "Custom includeContext should be true"); + } + + @Test 
+ @DisplayName("builder should use default values for un-set options") + void builder_shouldUseDefaultValues_forUnsetOptions() { + CanonicalOption config = CanonicalOption.builder() + .strictMode(false) + .build(); + + assertFalse(config.isStrictMode(), "strictMode should be overridden to false"); + assertTrue(config.validateURIs(), "validateURIs should remain default (true)"); + assertTrue(config.escapeUnicode(), "escapeUnicode should remain default (true)"); + assertTrue(config.trailingDot(), "trailingDot should remain default (true)"); + assertFalse(config.includeContext(), "includeContext should remain default (false)"); + } +} diff --git a/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializerTest.java b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializerTest.java new file mode 100644 index 000000000..f9f6ddf35 --- /dev/null +++ b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializerTest.java @@ -0,0 +1,330 @@ +package fr.inria.corese.core.next.impl.io.serialization.canonical; + +import fr.inria.corese.core.next.api.*; +import fr.inria.corese.core.next.impl.exception.SerializationException; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Test; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +import java.io.IOException; +import java.io.StringWriter; +import java.io.Writer; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Random; +import java.util.function.Consumer; + +import static org.junit.jupiter.api.Assertions.*; +import static org.mockito.Mockito.*; + +/** + * Unit tests for the CanonicalSerializer class. + * These tests verify the canonicalization of blank nodes, the sorting of statements, + * and the correct serialization of the RDF model. 
+ */ +class CanonicalSerializerTest { + + @Mock + private Model mockModel; + @Mock + private ValueFactory mockValueFactory; + + @Mock + private BNode mockBNode1; + @Mock + private BNode mockBNode2; + + @Mock + private BNode canonicalBNode1; + @Mock + private BNode canonicalBNode2; + + @Mock + private IRI mockIRI1; + @Mock + private IRI mockIRI2; + @Mock + private Literal mockLiteral1; + @Mock + private Literal mockLiteral2; + + private CanonicalSerializer serializer; + private CanonicalOption defaultConfig; + + @BeforeEach + void setUp() { + MockitoAnnotations.openMocks(this); + defaultConfig = CanonicalOption.defaultConfig(); + + setupBasicMocks(); + + + serializer = new CanonicalSerializer(mockModel, defaultConfig, mockValueFactory) { + @Override + protected void writeValue(Writer w, Value v) throws IOException { + + if (v != null) { + w.write(v.stringValue()); + } + } + }; + } + + /** + * Configures the basic stringValue() and isBNode() behavior for all mock RDF elements. + * This ensures consistency across tests. 
+ */ + private void setupBasicMocks() { + when(mockIRI1.stringValue()).thenReturn(""); + when(mockIRI2.stringValue()).thenReturn(""); + + when(mockLiteral1.stringValue()).thenReturn("\"literal1\""); + when(mockLiteral2.stringValue()).thenReturn("\"literal2\""); + + when(mockBNode1.stringValue()).thenReturn("_:originalBNode1"); + when(mockBNode2.stringValue()).thenReturn("_:originalBNode2"); + + when(canonicalBNode1.stringValue()).thenReturn("_:b0"); + when(canonicalBNode2.stringValue()).thenReturn("_:b1"); + + when(mockIRI1.isBNode()).thenReturn(false); + when(mockIRI2.isBNode()).thenReturn(false); + when(mockLiteral1.isBNode()).thenReturn(false); + when(mockLiteral2.isBNode()).thenReturn(false); + when(mockBNode1.isBNode()).thenReturn(true); + when(mockBNode2.isBNode()).thenReturn(true); + when(canonicalBNode1.isBNode()).thenReturn(true); + when(canonicalBNode2.isBNode()).thenReturn(true); + } + + /** + * Helper method to create a mock Statement with configured subject, predicate, object, and context. + * It also sets up the toString() behavior which is used by the serializer for blank node fingerprinting. + * + * @param subject The subject of the statement. + * @param predicate The predicate of the statement. + * @param object The object of the statement. + * @param context The context (named graph) of the statement, can be null. + * @return A mocked Statement. + */ + private Statement createMockStatement(Resource subject, IRI predicate, Value object, Resource context) { + Statement stmt = mock(Statement.class); + when(stmt.getSubject()).thenReturn(subject); + when(stmt.getPredicate()).thenReturn(predicate); + when(stmt.getObject()).thenReturn(object); + when(stmt.getContext()).thenReturn(context); + + String contextPart = (context != null) ? 
" " + context.stringValue() : ""; + String expectedToString = subject.stringValue() + " " + predicate.stringValue() + " " + object.stringValue() + contextPart; + + doReturn(expectedToString).when(stmt).toString(); + + return stmt; + } + + @Test + @DisplayName("Constructor with valid parameters should create an instance") + void testConstructorWithValidParameters() { + assertNotNull(serializer); + assertEquals("Canonical RDF", serializer.getFormatName()); + } + + @Test + @DisplayName("Constructor with null model should throw NullPointerException") + void testConstructorNullModel() { + assertThrows(NullPointerException.class, () -> + new CanonicalSerializer(null, defaultConfig, mockValueFactory)); + } + + @Test + @DisplayName("Constructor with null valueFactory should throw NullPointerException") + void testConstructorNullValueFactory() { + assertThrows(NullPointerException.class, () -> + new CanonicalSerializer(mockModel, defaultConfig, null)); + } + + @Test + @DisplayName("Constructor with null config should throw NullPointerException") + void testConstructorNullConfig() { + assertThrows(NullPointerException.class, () -> + new CanonicalSerializer(mockModel, null, mockValueFactory)); + } + + @Test + @DisplayName("Constructor with default configuration") + void testConstructorWithDefaultConfig() { + CanonicalSerializer defaultSerializer = new CanonicalSerializer(mockModel, mockValueFactory); + assertNotNull(defaultSerializer); + assertEquals("Canonical RDF", defaultSerializer.getFormatName()); + } + + @Test + @DisplayName("Serialization of an empty model") + void testSerializeEmptyModel() throws SerializationException { + + doAnswer(invocation -> { + return null; + }).when(mockModel).forEach(any()); + + StringWriter writer = new StringWriter(); + + serializer.write(writer); + + assertEquals("", writer.toString()); + } + + @Test + @DisplayName("Serialization with a simple statement without blank nodes") + void testSerializeSimpleStatement() throws 
SerializationException { + Statement simpleStmt = createMockStatement(mockIRI1, mockIRI2, mockLiteral1, null); + + doAnswer(invocation -> { + ((Consumer) invocation.getArgument(0)).accept(simpleStmt); + return null; + }).when(mockModel).forEach(any()); + + + when(mockValueFactory.createStatement(mockIRI1, mockIRI2, mockLiteral1, null)) + .thenReturn(simpleStmt); + + StringWriter writer = new StringWriter(); + + serializer.write(writer); + + String expectedOutput = " \"literal1\" .\n"; + assertEquals(expectedOutput, writer.toString()); + } + + @Test + @DisplayName("Serialization with blank nodes - canonicalization and output sorting") + void testSerializeWithBlankNodesAndOutputVerification() throws SerializationException { + + Statement originalStmt1 = createMockStatement(mockBNode1, mockIRI1, mockLiteral1, null); + Statement originalStmt2 = createMockStatement(mockIRI2, mockIRI1, mockBNode2, null); + Statement originalStmt3 = createMockStatement(mockBNode1, mockIRI2, mockIRI1, null); + + List statementsInModel = Arrays.asList(originalStmt3, originalStmt1, originalStmt2); + Collections.shuffle(statementsInModel, new Random(0)); + + doAnswer(invocation -> { + ((Consumer) invocation.getArgument(0)).accept(statementsInModel.get(0)); + ((Consumer) invocation.getArgument(0)).accept(statementsInModel.get(1)); + ((Consumer) invocation.getArgument(0)).accept(statementsInModel.get(2)); + return null; + }).when(mockModel).forEach(any()); + + + when(mockValueFactory.createBNode("b0")).thenReturn(canonicalBNode1); + when(mockValueFactory.createBNode("b1")).thenReturn(canonicalBNode2); + + + Statement canonicalStmtA = createMockStatement(canonicalBNode1, mockIRI1, mockLiteral1, null); + Statement canonicalStmtB = createMockStatement(mockIRI2, mockIRI1, canonicalBNode2, null); + Statement canonicalStmtC = createMockStatement(canonicalBNode1, mockIRI2, mockIRI1, null); + + + when(mockValueFactory.createStatement(any(), any(), any(), any())) + .thenReturn(canonicalStmtB, 
canonicalStmtA, canonicalStmtC); + + + StringWriter writer = new StringWriter(); + + serializer.write(writer); + + String expectedOutput = """ + _:b1 . + _:b0 "literal1" . + _:b0 . + """; + assertEquals(expectedOutput, writer.toString()); + + verify(mockValueFactory).createBNode("b0"); + verify(mockValueFactory).createBNode("b1"); + verify(mockValueFactory, times(3)).createStatement(any(), any(), any(), any()); + } + + @Test + @DisplayName("Serialization with context (named graph)") + void testSerializeWithContext() throws SerializationException { + Statement stmtWithContext = createMockStatement(mockIRI1, mockIRI2, mockLiteral1, mockIRI1); + + doAnswer(invocation -> { + ((Consumer) invocation.getArgument(0)).accept(stmtWithContext); + return null; + }).when(mockModel).forEach(any()); + + + when(mockValueFactory.createStatement(mockIRI1, mockIRI2, mockLiteral1, mockIRI1)) + .thenReturn(stmtWithContext); + + + StringWriter writer = new StringWriter(); + + serializer.write(writer); + + String expectedOutput = " \"literal1\" .\n"; + assertEquals(expectedOutput, writer.toString()); + } + + @Test + @DisplayName("Test writeContext with null context") + void testWriteContextWithNullContext() throws IOException { + StringWriter writer = new StringWriter(); + Statement stmt = mock(Statement.class); + when(stmt.getContext()).thenReturn(null); + + serializer.writeContext(writer, stmt); + + assertEquals("", writer.toString()); + } + + @Test + @DisplayName("Test writeContext with non-null context") + void testWriteContextWithNonNullContext() throws IOException { + + StringWriter writer = new StringWriter(); + Statement stmt = mock(Statement.class); + when(stmt.getContext()).thenReturn(mockIRI1); + + + serializer.writeContext(writer, stmt); + + String expectedOutput = " "; + assertEquals(expectedOutput, writer.toString()); + } + + + @Test + @DisplayName("Serialization with blank nodes in context - canonicalization and sorting") + void 
testSerializeWithBlankNodeInContextAndOutputVerification() throws SerializationException { + + Statement originalStmt1 = createMockStatement(mockIRI1, mockIRI2, mockLiteral1, mockBNode1); + + List statementsInModel = Collections.singletonList(originalStmt1); + doAnswer(invocation -> { + ((Consumer) invocation.getArgument(0)).accept(statementsInModel.get(0)); + return null; + }).when(mockModel).forEach(any()); + + when(mockValueFactory.createBNode("b0")).thenReturn(canonicalBNode1); + + Statement canonicalStmt1 = createMockStatement(mockIRI1, mockIRI2, mockLiteral1, canonicalBNode1); + + when(mockValueFactory.createStatement(any(), any(), any(), any())) + .thenReturn(canonicalStmt1); + + StringWriter writer = new StringWriter(); + + + serializer.write(writer); + + String expectedOutput = " \"literal1\" _:b0 .\n"; + assertEquals(expectedOutput, writer.toString()); + + verify(mockValueFactory).createBNode("b0"); + verify(mockValueFactory, times(1)).createStatement(any(), any(), any(), any()); + } +} From 4583df9162a9189e531da7e2f6acda8877483a4a Mon Sep 17 00:00:00 2001 From: "AD\\aabdoun" Date: Thu, 28 Aug 2025 10:36:10 +0200 Subject: [PATCH 3/6] #187 Implement RDF canonicalization as serializer --- .../core/next/api/base/io/RDFFormat.java | 8 +- .../DefaultSerializerFactory.java | 31 +- .../canonical/CanonicalOption.java | 77 ---- .../canonical/CanonicalSerializer.java | 229 +++-------- .../canonical/Rdfc10Canonicalizer.java | 24 ++ .../canonical/Rdfc10CanonicalizerImpl.java | 354 ++++++++++++++++++ .../serialization/option/CanonicalOption.java | 73 ++++ .../util/SerializationConstants.java | 12 + .../io/serialization/util/StatementUtils.java | 117 ++++++ .../DefaultSerializerFactoryTest.java | 2 +- .../canonical/CanonicalOptionTest.java | 5 +- .../canonical/CanonicalSerializerTest.java | 345 ++++++++++++----- 12 files changed, 897 insertions(+), 380 deletions(-) delete mode 100644 
src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOption.java create mode 100644 src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10Canonicalizer.java create mode 100644 src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10CanonicalizerImpl.java create mode 100644 src/main/java/fr/inria/corese/core/next/impl/io/serialization/option/CanonicalOption.java create mode 100644 src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java diff --git a/src/main/java/fr/inria/corese/core/next/api/base/io/RDFFormat.java b/src/main/java/fr/inria/corese/core/next/api/base/io/RDFFormat.java index d794271f4..e16c3a92c 100644 --- a/src/main/java/fr/inria/corese/core/next/api/base/io/RDFFormat.java +++ b/src/main/java/fr/inria/corese/core/next/api/base/io/RDFFormat.java @@ -59,10 +59,10 @@ public class RDFFormat extends FileFormat { true, true); - public static final RDFFormat CANONICAL_RDF = new RDFFormat( - "Canonical RDF", - List.of("crd", "nq"), - List.of("application/n-quads-canonical", "application/n-quads"), + public static final RDFFormat RDFC_1_0 = new RDFFormat( + "RDFC-1.0", + List.of("nq"), + List.of("application/n-quads", "application/n-quads"), false, true); diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactory.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactory.java index 5fa02816c..75ab94cc8 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactory.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactory.java @@ -6,8 +6,10 @@ import fr.inria.corese.core.next.api.io.serialization.RDFSerializer; import fr.inria.corese.core.next.api.io.serialization.SerializationOption; import fr.inria.corese.core.next.api.io.serialization.SerializerFactory; -import 
fr.inria.corese.core.next.impl.io.serialization.canonical.CanonicalOption; +import fr.inria.corese.core.next.impl.io.serialization.option.CanonicalOption; import fr.inria.corese.core.next.impl.io.serialization.canonical.CanonicalSerializer; +import fr.inria.corese.core.next.impl.io.serialization.canonical.Rdfc10Canonicalizer; +import fr.inria.corese.core.next.impl.io.serialization.canonical.Rdfc10CanonicalizerImpl; import fr.inria.corese.core.next.impl.io.serialization.nquads.NQuadsOption; import fr.inria.corese.core.next.impl.io.serialization.nquads.NQuadsSerializer; import fr.inria.corese.core.next.impl.io.serialization.ntriples.NTriplesOption; @@ -44,7 +46,7 @@ public class DefaultSerializerFactory implements SerializerFactory { private static final Logger logger = LoggerFactory.getLogger(DefaultSerializerFactory.class); private final Map> registry; - ValueFactory coreseValueFactory = new CoreseAdaptedValueFactory(); + private final ValueFactory coreseValueFactory; /** * Constructs a {@code DefaultSerializerFactory} and populates its registry @@ -54,6 +56,8 @@ public class DefaultSerializerFactory implements SerializerFactory { * it falls back to the format's default configuration. 
*/ public DefaultSerializerFactory() { + this.coreseValueFactory = new CoreseAdaptedValueFactory(); + Map> tempRegistry = new HashMap<>(); tempRegistry.put(RDFFormat.TURTLE, (model, genericConfig) -> { @@ -106,13 +110,24 @@ public DefaultSerializerFactory() { } }); - tempRegistry.put(RDFFormat.CANONICAL_RDF, (model, genericConfig) -> { + tempRegistry.put(RDFFormat.RDFC_1_0, (model, genericConfig) -> { if (genericConfig instanceof CanonicalOption specificConfig) { - return new CanonicalSerializer(model, specificConfig, coreseValueFactory); + Rdfc10Canonicalizer canonicalizer = new Rdfc10CanonicalizerImpl( + specificConfig.getHashAlgorithm(), + specificConfig.getPermutationLimit(), + coreseValueFactory + ); + return new CanonicalSerializer(model, specificConfig, coreseValueFactory, canonicalizer); } else { - logger.warn("Provided config for CANONICAL_RDF is not CanonicalOption (was {}). Using default CanonicalOption.", - genericConfig.getClass().getSimpleName()); - return new CanonicalSerializer(model, CanonicalOption.defaultConfig(), coreseValueFactory); + logger.warn("Provided config for RDFC_1_0 is not CanonicalOption (was {}). Using default CanonicalOption.", + genericConfig != null ? 
genericConfig.getClass().getSimpleName() : "null"); + CanonicalOption defaultConfig = CanonicalOption.defaultConfig(); + Rdfc10Canonicalizer canonicalizer = new Rdfc10CanonicalizerImpl( + defaultConfig.getHashAlgorithm(), + defaultConfig.getPermutationLimit(), + coreseValueFactory + ); + return new CanonicalSerializer(model, defaultConfig, coreseValueFactory, canonicalizer); } }); @@ -144,4 +159,4 @@ public RDFSerializer createSerializer(RDFFormat format, Model model, Serializati return constructor.apply(model, config); } -} +} \ No newline at end of file diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOption.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOption.java deleted file mode 100644 index 96d5dcbcb..000000000 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOption.java +++ /dev/null @@ -1,77 +0,0 @@ -package fr.inria.corese.core.next.impl.io.serialization.canonical; - -import fr.inria.corese.core.next.impl.io.serialization.option.AbstractSerializerOption; - -/** - * Configuration for Canonical RDF serialization format. - * This class extends {@link AbstractSerializerOption} and provides specific defaults - * and options tailored for canonicalization. - * - *

It includes options relevant to blank node canonicalization, such as whether to - * include comments (which might interfere with strict canonicalization) or - * to ensure deterministic blank node labeling.

- * - *

Use the {@link Builder} class to create instances of {@code CanonicalOption}. - * A predefined default configuration is available via {@link #defaultConfig()}.

- */ -public class CanonicalOption extends AbstractSerializerOption { - - /** - * Protected constructor to be used by the {@link Builder}. - * - * @param builder The builder instance containing the desired configuration values. - */ - protected CanonicalOption(Builder builder) { - super(builder); - - } - - /** - * Public Builder for {@link CanonicalOption}. - * Provides a fluent API for constructing {@code CanonicalOption} instances with default values - * specific to the Canonical RDF format. - */ - public static class Builder extends AbstractSerializerOption.AbstractBuilder { - /** - * Default constructor initializes all options with their default values for Canonical RDF. - */ - public Builder() { - strictMode(true); - validateURIs(true); - escapeUnicode(true); - trailingDot(true); - includeContext(false); - } - - /** - * Builds and returns a new {@link CanonicalOption} instance with the current builder settings. - * - * @return A new {@code CanonicalOption} instance. - */ - @Override - public CanonicalOption build() { - return new CanonicalOption(this); - } - } - - /** - * Returns a default configuration suitable for Canonical RDF serialization. - * This provides a convenient way to get a standard Canonical RDF configuration without - * manually building it. - * - * @return A {@code CanonicalOption} instance with default settings. - */ - public static CanonicalOption defaultConfig() { - return new Builder().build(); - } - - /** - * Returns a new builder instance for {@link CanonicalOption}. - * This allows for fluent construction of custom Canonical RDF configurations. - * - * @return A new {@code Builder} instance. 
- */ - public static CanonicalOption.Builder builder() { - return new CanonicalOption.Builder(); - } -} diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializer.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializer.java index 0fdecc197..8738ef789 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializer.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializer.java @@ -1,132 +1,62 @@ package fr.inria.corese.core.next.impl.io.serialization.canonical; -import fr.inria.corese.core.next.api.*; +import fr.inria.corese.core.next.api.Model; +import fr.inria.corese.core.next.api.Resource; +import fr.inria.corese.core.next.api.Statement; +import fr.inria.corese.core.next.api.ValueFactory; +import fr.inria.corese.core.next.api.io.serialization.RDFSerializer; import fr.inria.corese.core.next.impl.exception.SerializationException; import fr.inria.corese.core.next.impl.io.serialization.base.AbstractLineBasedSerializer; +import fr.inria.corese.core.next.impl.io.serialization.option.CanonicalOption; import fr.inria.corese.core.next.impl.io.serialization.util.SerializationConstants; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.BufferedWriter; import java.io.IOException; import java.io.Writer; -import java.util.*; -import java.util.stream.Collectors; +import java.util.List; +import java.util.Objects; /** - * Serializes a Corese {@link Model} into a canonical RDF format. - * This serializer ensures a deterministic output by re-labeling blank nodes - * and sorting all statements. The output format is similar to N-Quads, - * but with canonical blank node identifiers and a guaranteed order. + * Serializes a Corese {@link Model} into an RDFC-1.0 canonical RDF format. 
+ * This serializer is designed to integrate with the W3C RDFC-1.0 algorithm + * to ensure a deterministic output by re-labeling blank nodes and sorting all statements + * according to the specification. The output format is canonicalized N-Quads. * - *

This implementation provides a simplified blank node canonicalization - * based on lexicographical sorting of blank node fingerprints, followed by - * re-labeling and sorting of all triples. It extends {@link AbstractLineBasedSerializer} - * to reuse common writing utilities but overrides the main {@code write} method - * to implement the canonicalization logic.

+ * This implementation now acts as a wrapper, preparing the model for a dedicated + * RDFC-1.0 canonicalization component and then writing the resulting canonical statements. */ -public class CanonicalSerializer extends AbstractLineBasedSerializer { +public class CanonicalSerializer extends AbstractLineBasedSerializer implements RDFSerializer { - private static final Logger logger = LoggerFactory.getLogger(CanonicalSerializer.class); private final ValueFactory valueFactory; + private final CanonicalOption config; + private final Rdfc10Canonicalizer canonicalizer; + private final Model model; /** - * Constructs a new {@code CanonicalSerializer} instance with the specified model and default configuration. - * The default configuration is obtained from {@link CanonicalOption#defaultConfig()}. + * Constructs a new CanonicalSerializer. + * This constructor is now adapted to be used by the DefaultSerializerFactory. * - * @param model the {@link Model} to be serialized. Must not be null. - * @param valueFactory the {@link ValueFactory} to use for creating RDF elements. Must not be null. - * @throws NullPointerException if the provided model or valueFactory is null. + * @param model The model to be serialized. + * @param config The configuration options for the canonicalization process. + * @param valueFactory The factory for creating RDF values. + * @param canonicalizer The canonicalizer component to use. */ - public CanonicalSerializer(Model model, ValueFactory valueFactory) { - this(model, CanonicalOption.defaultConfig(), valueFactory); - } - - /** - * Constructs a new {@code CanonicalSerializer} instance with the specified model and custom configuration. - * - * @param model the {@link Model} to be serialized. Must not be null. - * @param config the {@link CanonicalOption} to use for serialization. Must not be null. - * @param valueFactory the {@link ValueFactory} to use for creating RDF elements. Must not be null. 
- * @throws NullPointerException if the provided model, config, or valueFactory is null. - */ - public CanonicalSerializer(Model model, CanonicalOption config, ValueFactory valueFactory) { + public CanonicalSerializer(Model model, CanonicalOption config, ValueFactory valueFactory, Rdfc10Canonicalizer canonicalizer) { super(model, config); - this.valueFactory = Objects.requireNonNull(valueFactory, "ValueFactory cannot be null"); - Objects.requireNonNull(config, "CanonicalOption cannot be null"); - } - - /** - * Returns the format name for error messages and logging. - * - * @return "Canonical RDF" - */ - @Override - protected String getFormatName() { - return "Canonical RDF"; + this.model = Objects.requireNonNull(model); + this.valueFactory = Objects.requireNonNull(valueFactory); + this.config = Objects.requireNonNull(config); + this.canonicalizer = Objects.requireNonNull(canonicalizer); } - /** - * Writes the context (named graph) part of a statement. - * For Canonical RDF, contexts are included if present, following N-Quads style. - * - * @param writer the {@link Writer} to which the context will be written. - * @param stmt the {@link Statement} whose context should be written. - * @throws IOException if an I/O error occurs. - */ @Override - protected void writeContext(Writer writer, Statement stmt) throws IOException { - Resource context = stmt.getContext(); - if (context != null) { - writer.write(SerializationConstants.SPACE); - writeValue(writer, context); - } + public String getFormatName() { + return "RDFC-1.0"; } - /** - * Writes the model to the given writer in a canonical form. - * This involves: - * 1. Collecting all statements. - * 2. Identifying and re-labeling blank nodes deterministically. - * 3. Creating a new set of statements with canonical blank node IDs. - * 4. Sorting these canonical statements. - * 5. Writing each sorted statement line by line. - * - * @param writer the {@link Writer} to which the output will be written. 
- * @throws SerializationException if an I/O error occurs during writing or if invalid data is encountered. - */ - @Override - public void write(Writer writer) throws SerializationException { + public void serialize(Writer writer) { try (BufferedWriter bufferedWriter = new BufferedWriter(writer)) { - List originalStatements = new ArrayList<>(); - model.forEach(originalStatements::add); - - Set blankNodes = new HashSet<>(); - for (Statement stmt : originalStatements) { - if (stmt.getSubject().isBNode()) { - blankNodes.add((BNode) stmt.getSubject()); - } - if (stmt.getObject().isBNode()) { - blankNodes.add((BNode) stmt.getObject()); - } - if (stmt.getContext() != null && stmt.getContext().isBNode()) { - blankNodes.add((BNode) stmt.getContext()); - } - } - - Map canonicalBNodeMap = createCanonicalBNodeMap(blankNodes, originalStatements); - - List canonicalStatements = new ArrayList<>(); - for (Statement originalStmt : originalStatements) { - Resource subject = (Resource) mapValue(originalStmt.getSubject(), canonicalBNodeMap); - IRI predicate = originalStmt.getPredicate(); - Value object = mapValue(originalStmt.getObject(), canonicalBNodeMap); - Resource context = (Resource) mapValue(originalStmt.getContext(), canonicalBNodeMap); - - canonicalStatements.add(valueFactory.createStatement(subject, predicate, object, context)); - } - - Collections.sort(canonicalStatements, new CanonicalStatementComparator(canonicalBNodeMap)); + List canonicalStatements = canonicalizer.canonicalize(model); for (Statement stmt : canonicalStatements) { writeCanonicalStatement(bufferedWriter, stmt); @@ -139,57 +69,19 @@ public void write(Writer writer) throws SerializationException { } } - /** - * Maps a value (Resource, Literal, IRI) to its canonical form if it's a blank node. - * - * @param value the original value. - * @param canonicalBNodeMap the map from original blank nodes to canonical blank nodes. - * @return the canonical value. 
- */ - private Value mapValue(Value value, Map canonicalBNodeMap) { - if (value != null && value.isBNode()) { - return canonicalBNodeMap.getOrDefault((BNode) value, (BNode) value); - } - return value; - } - - /** - * Creates a deterministic mapping from original blank node {@link BNode}s - * to new, canonical blank node {@link BNode}s. - * This simplified approach sorts blank nodes based on a string representation - * of their associated triples. - * - * @param blankNodes the set of all blank nodes in the model. - * @param statements the list of all statements in the model. - * @return a map from original blank node {@link BNode} to canonical blank node {@link BNode}. - */ - private Map createCanonicalBNodeMap(Set blankNodes, List statements) { - Map bNodeFingerprints = new HashMap<>(); - for (BNode bNode : blankNodes) { - List relatedTriples = statements.stream() - .filter(stmt -> stmt.getSubject().equals(bNode) || stmt.getObject().equals(bNode) || (stmt.getContext() != null && stmt.getContext().equals(bNode))) - .map(Statement::toString) - .sorted() - .collect(Collectors.toList()); - bNodeFingerprints.put(bNode, String.join("|", relatedTriples)); - } - - List sortedBNodes = new ArrayList<>(blankNodes); - sortedBNodes.sort(Comparator.comparing(bNodeFingerprints::get)); - - Map canonicalBNodeMap = new HashMap<>(); - int i = 0; - for (BNode bNode : sortedBNodes) { - canonicalBNodeMap.put(bNode, valueFactory.createBNode("b" + i)); - i++; + @Override + protected void writeContext(Writer writer, Statement stmt) throws IOException { + Resource context = stmt.getContext(); + if (context != null) { + writer.write(SerializationConstants.SPACE); + writeValue(writer, context); } - return canonicalBNodeMap; } /** * Writes a single canonical {@link Statement} to the writer. - * This method is similar to the private `writeStatement` in the superclass, - * but ensures it uses the canonicalized values. 
+ * This method is designed to write a statement that has already been processed + * by the RDFC-1.0 canonicalization algorithm. * * @param writer the {@link Writer} to which the statement will be written. * @param stmt the {@link Statement} to write (already canonicalized). @@ -212,42 +104,5 @@ private void writeCanonicalStatement(Writer writer, Statement stmt) throws IOExc writer.write(config.getLineEnding()); } - /** - * A custom comparator for {@link Statement}s to ensure canonical ordering. - * This comparator sorts statements based on subject, then predicate, then object, then context. - * Blank nodes are compared using their canonical labels. - */ - private class CanonicalStatementComparator implements Comparator { - private final Map canonicalBNodeMap; - - public CanonicalStatementComparator(Map canonicalBNodeMap) { - this.canonicalBNodeMap = canonicalBNodeMap; - } - - @Override - public int compare(Statement s1, Statement s2) { - int cmp = compareValues(s1.getSubject(), s2.getSubject()); - if (cmp != 0) return cmp; - - cmp = compareValues(s1.getPredicate(), s2.getPredicate()); - if (cmp != 0) return cmp; - - cmp = compareValues(s1.getObject(), s2.getObject()); - if (cmp != 0) return cmp; - return compareValues(s1.getContext(), s2.getContext()); - } - - private int compareValues(Value v1, Value v2) { - if (v1 == null && v2 == null) return 0; - if (v1 == null) return -1; - if (v2 == null) return 1; - - Value cV1 = v1.isBNode() ? canonicalBNodeMap.getOrDefault((BNode) v1, (BNode) v1) : v1; - Value cV2 = v2.isBNode() ? 
canonicalBNodeMap.getOrDefault((BNode) v2, (BNode) v2) : v2; - - - return cV1.stringValue().compareTo(cV2.stringValue()); - } - } -} +} \ No newline at end of file diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10Canonicalizer.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10Canonicalizer.java new file mode 100644 index 000000000..f538f5476 --- /dev/null +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10Canonicalizer.java @@ -0,0 +1,24 @@ +package fr.inria.corese.core.next.impl.io.serialization.canonical; + +import fr.inria.corese.core.next.api.Statement; +import fr.inria.corese.core.next.api.Model; + +import java.util.List; + +/** + * Interface for a component that performs RDFC-1.0 canonicalization. + * This component is responsible for re-labeling blank nodes and sorting statements + * according to the RDFC-1.0 specification. + */ +public interface Rdfc10Canonicalizer { + /** + * Canonicalizes a stream of RDF statements from a given model. + * The implementation will handle all steps of the RDFC-1.0 algorithm, + * including dataset normalization, blank node identification, and + * deterministic sorting. + * + * @param model The input model to canonicalize. + * @return A list of canonicalized and sorted statements. 
+ */ + List canonicalize(Model model); +} \ No newline at end of file diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10CanonicalizerImpl.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10CanonicalizerImpl.java new file mode 100644 index 000000000..8a29fc2f3 --- /dev/null +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10CanonicalizerImpl.java @@ -0,0 +1,354 @@ +package fr.inria.corese.core.next.impl.io.serialization.canonical; + +import fr.inria.corese.core.next.api.Model; +import fr.inria.corese.core.next.api.Statement; +import fr.inria.corese.core.next.api.ValueFactory; +import fr.inria.corese.core.next.impl.exception.SerializationException; +import fr.inria.corese.core.next.impl.io.serialization.option.CanonicalOption; +import fr.inria.corese.core.next.impl.io.serialization.util.SerializationConstants; +import fr.inria.corese.core.next.impl.io.serialization.util.StatementUtils; + +import java.nio.charset.StandardCharsets; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.*; +import java.util.stream.Stream; + +/** + * Implementation of the RDFC-1.0 canonicalization algorithm as specified by W3C. 
+ */ +public class Rdfc10CanonicalizerImpl implements Rdfc10Canonicalizer { + + private final CanonicalOption.HashAlgorithm hashAlgorithm; + private final int maxCallsHashNDegreeQuads; + private final ValueFactory valueFactory; + private final StatementUtils statementUtils; + private int callsHashNDegreeQuads = 0; + + public Rdfc10CanonicalizerImpl(CanonicalOption.HashAlgorithm hashAlgorithm, int maxCalls, ValueFactory valueFactory) { + this.hashAlgorithm = hashAlgorithm; + this.maxCallsHashNDegreeQuads = maxCalls; + this.valueFactory = valueFactory; + this.statementUtils = new StatementUtils(valueFactory); + } + + + @Override + public List canonicalize(Model model) { + return canonicalize(model.stream()); + } + + + /** + * Internal canonicalization method + */ + private List canonicalize(Stream statements) { + List stmtList = statements.toList(); + + callsHashNDegreeQuads = 0; + + Map> blankNodeToQuads = createBNodeToQuadsMap(stmtList); + + if (blankNodeToQuads.isEmpty()) { + return stmtList.stream() + .sorted((s1, s2) -> StatementUtils.toNQuad(s1).compareTo(StatementUtils.toNQuad(s2))) + .toList(); + } + + Map canonicalReplacementMap = createCanonicalMap(blankNodeToQuads); + + return replaceBlankNodesAndSort(stmtList, canonicalReplacementMap); + } + + /** + * Add validation in createBNodeToQuadsMap + */ + private Map> createBNodeToQuadsMap(List statements) { + Map> blankNodeToQuads = new HashMap<>(); + + for (Statement stmt : statements) { + if (stmt == null) continue; + + if (StatementUtils.isBlankNode(stmt.getSubject())) { + String blankNodeId = StatementUtils.getBlankNodeId(stmt.getSubject()); + blankNodeToQuads.computeIfAbsent(blankNodeId, k -> new HashSet<>()).add(stmt); + } + + if (StatementUtils.isBlankNode(stmt.getObject())) { + String blankNodeId = StatementUtils.getBlankNodeId(stmt.getObject()); + blankNodeToQuads.computeIfAbsent(blankNodeId, k -> new HashSet<>()).add(stmt); + } + + if (stmt.getContext() != null && 
StatementUtils.isBlankNode(stmt.getContext())) { + String blankNodeId = StatementUtils.getBlankNodeId(stmt.getContext()); + blankNodeToQuads.computeIfAbsent(blankNodeId, k -> new HashSet<>()).add(stmt); + } + } + + return blankNodeToQuads; + } + + /** + * Create canonical replacement map + */ + private Map createCanonicalMap(Map> blankNodeToQuads) { + Map canonicalIssuer = new HashMap<>(); + int canonicalCounter = 0; + + Map> hashToBlankNodes = new HashMap<>(); + + for (String blankNode : blankNodeToQuads.keySet()) { + String hash = hashFirstDegreeQuads(blankNode, blankNodeToQuads); + hashToBlankNodes.computeIfAbsent(hash, k -> new HashSet<>()).add(blankNode); + } + + List sortedHashes = new ArrayList<>(hashToBlankNodes.keySet()); + Collections.sort(sortedHashes); + + for (String hash : sortedHashes) { + Set blankNodes = hashToBlankNodes.get(hash); + + if (blankNodes.size() == 1) { + String blankNode = blankNodes.iterator().next(); + canonicalIssuer.put(blankNode, SerializationConstants.C14N + canonicalCounter++); + } else { + Map nDegreeHashes = new HashMap<>(); + + for (String blankNode : blankNodes) { + if (!canonicalIssuer.containsKey(blankNode)) { + TemporaryIssuer temporaryIssuer = new TemporaryIssuer(); + String nDegreeHash = hashNDegreQuads(blankNode, blankNodeToQuads, canonicalIssuer, temporaryIssuer); + nDegreeHashes.put(blankNode, nDegreeHash); + } + } + + List> sortedEntries = nDegreeHashes.entrySet().stream() + .sorted(Map.Entry.comparingByValue()) + .toList(); + + for (Map.Entry entry : sortedEntries) { + if (!canonicalIssuer.containsKey(entry.getKey())) { + canonicalIssuer.put(entry.getKey(), SerializationConstants.C14N + canonicalCounter++); + } + } + } + } + + return canonicalIssuer; + } + + /** + * Hash First Degree Quads algorithm + */ + private String hashFirstDegreeQuads(String blankNode, Map> blankNodeToQuads) { + Set quads = blankNodeToQuads.get(blankNode); + List nquads = new ArrayList<>(); + + for (Statement quad : quads) { + String nquad = 
quadToNQuad(quad, blankNode, SerializationConstants.CANONICAL_BNODE_PLACEHOLDER); + nquads.add(nquad); + } + + Collections.sort(nquads); + String toHash = String.join(SerializationConstants.EMPTY_STRING, nquads); + return hash(toHash); + } + + /** + * Hash N-Degree Quads algorithm + */ + private String hashNDegreQuads(String identifier, Map> blankNodeToQuads, + Map canonicalIssuer, TemporaryIssuer issuer) { + + if (++callsHashNDegreeQuads > maxCallsHashNDegreeQuads) { + throw new SerializationException("Maximum calls to Hash N-Degree Quads exceeded: " + maxCallsHashNDegreeQuads, "Rdfc10CanonicalizerImpl"); + } + + Map> hashToRelatedBlankNodes = new HashMap<>(); + Set quads = blankNodeToQuads.get(identifier); + + for (Statement quad : quads) { + Set relatedBlankNodes = getRelatedBlankNodes(quad, identifier); + + for (String relatedBlankNode : relatedBlankNodes) { + String hash; + if (canonicalIssuer.containsKey(relatedBlankNode)) { + hash = canonicalIssuer.get(relatedBlankNode); + } else if (issuer.hasIssued(relatedBlankNode)) { + hash = issuer.issue(relatedBlankNode); + } else { + hash = hashFirstDegreeQuads(relatedBlankNode, blankNodeToQuads); + } + hashToRelatedBlankNodes.computeIfAbsent(hash, k -> new HashSet<>()).add(relatedBlankNode); + } + } + + StringBuilder dataToHash = new StringBuilder(); + List sortedHashes = new ArrayList<>(hashToRelatedBlankNodes.keySet()); + Collections.sort(sortedHashes); + + for (String hash : sortedHashes) { + dataToHash.append(hash); + Set blankNodeList = hashToRelatedBlankNodes.get(hash); + + if (blankNodeList.size() > 1) { + List hashPathList = new ArrayList<>(); + + for (String relatedBlankNode : blankNodeList) { + if (canonicalIssuer.containsKey(relatedBlankNode)) { + hashPathList.add(canonicalIssuer.get(relatedBlankNode)); + } else { + TemporaryIssuer tempIssuer = issuer.copy(); + tempIssuer.issue(relatedBlankNode); + String hashPath = hashNDegreQuads(relatedBlankNode, blankNodeToQuads, canonicalIssuer, tempIssuer); + 
hashPathList.add(hashPath); + } + } + + Collections.sort(hashPathList); + dataToHash.append(String.join(SerializationConstants.EMPTY_STRING, hashPathList)); + } else { + String blankNode = blankNodeList.iterator().next(); + if (canonicalIssuer.containsKey(blankNode)) { + dataToHash.append(canonicalIssuer.get(blankNode)); + } else { + dataToHash.append(issuer.issue(blankNode)); + } + } + } + + return hash(dataToHash.toString()); + } + + + /** + * Convert a quad to N-Quad format for hashing + */ + private String quadToNQuad(Statement quad, String blankNode, String replacement) { + StringBuilder sb = new StringBuilder(); + + if (StatementUtils.isBlankNode(quad.getSubject()) && StatementUtils.getBlankNodeId(quad.getSubject()).equals(blankNode)) { + sb.append(replacement); + } else { + sb.append(StatementUtils.serializeForComparison(quad.getSubject())); + } + sb.append(SerializationConstants.SPACE); + + sb.append(StatementUtils.serializeForComparison(quad.getPredicate())); + sb.append(SerializationConstants.SPACE); + + if (StatementUtils.isBlankNode(quad.getObject()) && StatementUtils.getBlankNodeId(quad.getObject()).equals(blankNode)) { + sb.append(replacement); + } else { + sb.append(StatementUtils.serializeForComparison(quad.getObject())); + } + + if (quad.getContext() != null) { + sb.append(SerializationConstants.SPACE); + if (StatementUtils.isBlankNode(quad.getContext()) && StatementUtils.getBlankNodeId(quad.getContext()).equals(blankNode)) { + sb.append(replacement); + } else { + sb.append(StatementUtils.serializeForComparison(quad.getContext())); + } + } + + sb.append(SerializationConstants.SPACE_POINT); + return sb.toString(); + } + + /** + * Get related blank nodes from a quad + */ + private Set getRelatedBlankNodes(Statement quad, String excludeBlankNode) { + Set relatedBlankNodes = new HashSet<>(); + + if (StatementUtils.isBlankNode(quad.getSubject())) { + String id = StatementUtils.getBlankNodeId(quad.getSubject()); + if (!id.equals(excludeBlankNode)) { + 
relatedBlankNodes.add(id); + } + } + + if (StatementUtils.isBlankNode(quad.getObject())) { + String id = StatementUtils.getBlankNodeId(quad.getObject()); + if (!id.equals(excludeBlankNode)) { + relatedBlankNodes.add(id); + } + } + + if (quad.getContext() != null && StatementUtils.isBlankNode(quad.getContext())) { + String id = StatementUtils.getBlankNodeId(quad.getContext()); + if (!id.equals(excludeBlankNode)) { + relatedBlankNodes.add(id); + } + } + + return relatedBlankNodes; + } + + /** + * Improved blank node replacement with validation + */ + private List replaceBlankNodesAndSort(List statements, Map replacementMap) { + return statements.stream() + .map(stmt -> { + Statement replaced = statementUtils.replaceBlankNodes(stmt, replacementMap); + if (replaced == null) { + throw new IllegalStateException("Failed to replace blank nodes in statement: " + stmt); + } + return replaced; + }) + .sorted(Comparator.comparing(StatementUtils::toNQuad)) + .toList(); + } + + /** + * Utility methods - removed duplicates, now using StatementUtils + */ + private String hash(String data) { + try { + String algorithm = hashAlgorithm == CanonicalOption.HashAlgorithm.SHA_384 ? 
SerializationConstants.SHA_384 : SerializationConstants.SHA_256; + MessageDigest digest = MessageDigest.getInstance(algorithm); + byte[] hash = digest.digest(data.getBytes(StandardCharsets.UTF_8)); + return bytesToHex(hash); + } catch (NoSuchAlgorithmException e) { + throw new SerializationException("Hash algorithm not available: " + e.getMessage(), "Rdfc10CanonicalizerImpl", e); + } catch (Exception e) { + throw new SerializationException("Hash computation failed for data: " + data, "Rdfc10CanonicalizerImpl", e); + } + } + + private String bytesToHex(byte[] bytes) { + StringBuilder result = new StringBuilder(); + for (byte b : bytes) { + result.append(String.format(SerializationConstants.HEX_FORMAT, b)); + } + return result.toString(); + } + + /** + * Helper class for temporary identifier issuing during canonicalization + */ + private static class TemporaryIssuer { + private Map issued = new HashMap<>(); + private int counter = 0; + + public String issue(String identifier) { + if (!issued.containsKey(identifier)) { + issued.put(identifier, SerializationConstants.CANONICAL_BNODE_PREFIX + counter++); + } + return issued.get(identifier); + } + + public boolean hasIssued(String identifier) { + return issued.containsKey(identifier); + } + + public TemporaryIssuer copy() { + TemporaryIssuer copy = new TemporaryIssuer(); + copy.issued = new HashMap<>(this.issued); + copy.counter = this.counter; + return copy; + } + } +} diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/option/CanonicalOption.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/option/CanonicalOption.java new file mode 100644 index 000000000..12cd3f8b5 --- /dev/null +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/option/CanonicalOption.java @@ -0,0 +1,73 @@ +package fr.inria.corese.core.next.impl.io.serialization.option; + +/** + * Configuration for Canonical RDF serialization format (RDFC-1.0). 
+ * This class extends {@link AbstractSerializerOption} and provides specific defaults + * and options tailored for the RDFC-1.0 canonicalization algorithm. + * It includes options relevant to blank node canonicalization, such as the hashing algorithm + * to use, the depth factor for graph isomorphism, and the permutation limit. + * Use the {@link Builder} class to create instances of {@code CanonicalOption}. + * A predefined default configuration is available via {@link #defaultConfig()}. + */ +public class CanonicalOption extends AbstractSerializerOption { + + public enum HashAlgorithm { + SHA_256, + SHA_384 + } + + private final HashAlgorithm hashAlgorithm; + private final int depthFactor; + private final int permutationLimit; + + /** + * Protected constructor to be used by the {@link Builder}. + * + * @param builder The builder instance containing the desired configuration values. + */ + protected CanonicalOption(Builder builder) { + super(builder); + this.hashAlgorithm = builder.hashAlgorithm; + this.depthFactor = builder.depthFactor; + this.permutationLimit = builder.permutationLimit; + } + + + public HashAlgorithm getHashAlgorithm() { + return hashAlgorithm; + } + + + public int getPermutationLimit() { + return permutationLimit; + } + + /** + * Public Builder for {@link CanonicalOption}. + * Provides a fluent API for constructing {@code CanonicalOption} instances with default values + * specific to the Canonical RDF format. + */ + public static class Builder extends AbstractSerializerOption.AbstractBuilder { + private HashAlgorithm hashAlgorithm = HashAlgorithm.SHA_256; + private int depthFactor = 5; + private int permutationLimit = 50000; + + public Builder() { + //Default constructor initializes all options with their default values for Canonical RDF. 
+ } + + @Override + public CanonicalOption build() { + return new CanonicalOption(this); + } + } + + + public static CanonicalOption defaultConfig() { + return new Builder().build(); + } + + public static CanonicalOption.Builder builder() { + return new CanonicalOption.Builder(); + } +} \ No newline at end of file diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/SerializationConstants.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/SerializationConstants.java index dda97db5f..d941b2fc6 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/SerializationConstants.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/SerializationConstants.java @@ -107,4 +107,16 @@ private SerializationConstants() { public static final String DEFAULT_GRAPH_IRI = "http://ns.inria.fr/corese/default-graph"; + public static final String C14N = "_c14n"; + public static final String SPACE_POINT = " ."; + + public static final String CANONICAL_BNODE_PLACEHOLDER = "<>"; + public static final String HEX_FORMAT = "%02x"; + public static final String CANONICAL_BNODE_PREFIX = "_:b"; + + + // Algorithmes de hachage + public static final String SHA_256 = "SHA-256"; + public static final String SHA_384 = "SHA-384"; + } diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java new file mode 100644 index 000000000..6eee9d223 --- /dev/null +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java @@ -0,0 +1,117 @@ +package fr.inria.corese.core.next.impl.io.serialization.util; + +import fr.inria.corese.core.next.api.*; + +import java.util.Map; + +/** + * Utility class for handling Statement manipulation during RDFC-1.0 canonicalization. 
+ * This class provides methods to create new statements with replaced blank node identifiers + * and to serialize them for comparison and hashing. + */ +public class StatementUtils { + + private final ValueFactory valueFactory; + + public StatementUtils(ValueFactory valueFactory) { + this.valueFactory = valueFactory; + } + + /** + * Creates a new statement with blank nodes replaced according to the canonical mapping. + * + * @param originalStatement The original statement + * @param canonicalMapping Map from original blank node IDs to canonical IDs + * @return A new statement with replaced blank node identifiers + */ + public Statement replaceBlankNodes(Statement originalStatement, Map canonicalMapping) { + + Resource newSubject = replaceIfBlankNodeResource(originalStatement.getSubject(), canonicalMapping); + IRI newPredicate = originalStatement.getPredicate(); + Value newObject = replaceIfBlankNodeValue(originalStatement.getObject(), canonicalMapping); + Resource newContext = replaceIfBlankNodeResource(originalStatement.getContext(), canonicalMapping); + + return valueFactory.createStatement(newSubject, newPredicate, newObject, newContext); + } + + private Resource replaceIfBlankNodeResource(Resource original, Map mapping) { + + if (original != null && isBlankNode(original)) { + String canonicalId = mapping.getOrDefault(getBlankNodeId(original), getBlankNodeId(original)); + return valueFactory.createBNode(canonicalId); + } + return original; + } + + private Value replaceIfBlankNodeValue(Value original, Map mapping) { + + if (original != null && isBlankNode(original)) { + String canonicalId = mapping.getOrDefault(getBlankNodeId(original), getBlankNodeId(original)); + return valueFactory.createBNode(canonicalId); + } + return original; + } + + /** + * Checks if a value is a blank node. + * + * @param value The value to check. + * @return True if the value is a blank node, false otherwise. 
+ */ + public static boolean isBlankNode(Value value) { + return value != null && value.isBNode(); + } + + /** + * Gets the identifier string for a blank node. + * + * @param value The blank node value. + * @return The string identifier. + */ + public static String getBlankNodeId(Value value) { + return value.stringValue(); + } + + /** + * Converts a value to a string for lexicographic comparison, as defined by RDFC-1.0. + * + * @param value The value to convert. + * @return The N-Quads representation for comparison. + */ + public static String serializeForComparison(Value value) { + if (value == null) return SerializationConstants.EMPTY_STRING; + String valueStr = value.stringValue(); + + if (value.isBNode()) { + return valueStr; + } + + if (value.isIRI()) { + return SerializationConstants.LT + valueStr + SerializationConstants.GT; + } + + return SerializationConstants.QUOTE + valueStr + SerializationConstants.QUOTE; + } + + /** + * Converts a statement to N-Quads format for lexicographic comparison. + * This uses a simplified serialization for comparison purposes only. 
+ * + * @param statement The statement to convert + * @return The N-Quads representation + */ + public static String toNQuad(Statement statement) { + StringBuilder sb = new StringBuilder(); + + sb.append(serializeForComparison(statement.getSubject())).append(SerializationConstants.SPACE); + sb.append(serializeForComparison(statement.getPredicate())).append(SerializationConstants.SPACE); + sb.append(serializeForComparison(statement.getObject())); + + if (statement.getContext() != null) { + sb.append(SerializationConstants.SPACE).append(serializeForComparison(statement.getContext())); + } + + sb.append(SerializationConstants.SPACE_POINT); + return sb.toString(); + } +} diff --git a/src/test/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactoryTest.java b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactoryTest.java index f1b354a3d..111a7b998 100644 --- a/src/test/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactoryTest.java +++ b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactoryTest.java @@ -103,7 +103,7 @@ void createSerializer_shouldReturnXmlSerializer_forRdfXmlFormat() { @DisplayName("createSerializer should return CanonicalSerializer for CANONICAL_RDF format") void createSerializer_shouldReturnCanonicalSerializer_forCanonicalRdfFormat() { try (MockedConstruction mockedConstruction = mockConstruction(CanonicalSerializer.class)) { - RDFSerializer serializer = factory.createSerializer(RDFFormat.CANONICAL_RDF, mockModel, mockConfig); + RDFSerializer serializer = factory.createSerializer(RDFFormat.RDFC_1_0, mockModel, mockConfig); assertNotNull(serializer); assertTrue(serializer instanceof CanonicalSerializer); diff --git a/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOptionTest.java b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOptionTest.java index 7cff3baf4..80d9892c3 100644 
--- a/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOptionTest.java +++ b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOptionTest.java @@ -1,5 +1,6 @@ package fr.inria.corese.core.next.impl.io.serialization.canonical; +import fr.inria.corese.core.next.impl.io.serialization.option.CanonicalOption; import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Test; @@ -20,7 +21,7 @@ void defaultConfig_shouldReturnExpectedValues() { assertNotNull(config, "Default config should not be null"); assertTrue(config.isStrictMode(), "Default strictMode should be true for canonicalization"); assertTrue(config.validateURIs(), "Default validateURIs should be true for canonicalization"); - assertTrue(config.escapeUnicode(), "Default escapeUnicode should be true for canonicalization"); + assertFalse(config.escapeUnicode(), "Default escapeUnicode should be false for canonicalization"); assertTrue(config.trailingDot(), "Default trailingDot should be true for canonicalization"); assertFalse(config.includeContext(), "Default includeContext should be false for canonicalization (N-Triples like)"); } @@ -53,7 +54,7 @@ void builder_shouldUseDefaultValues_forUnsetOptions() { assertFalse(config.isStrictMode(), "strictMode should be overridden to false"); assertTrue(config.validateURIs(), "validateURIs should remain default (true)"); - assertTrue(config.escapeUnicode(), "escapeUnicode should remain default (true)"); + assertFalse(config.escapeUnicode(), "escapeUnicode should remain default (false)"); assertTrue(config.trailingDot(), "trailingDot should remain default (true)"); assertFalse(config.includeContext(), "includeContext should remain default (false)"); } diff --git a/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializerTest.java b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializerTest.java index f9f6ddf35..ac78cbe53 100644 --- 
a/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializerTest.java +++ b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializerTest.java @@ -2,6 +2,7 @@ import fr.inria.corese.core.next.api.*; import fr.inria.corese.core.next.impl.exception.SerializationException; +import fr.inria.corese.core.next.impl.io.serialization.option.CanonicalOption; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Test; @@ -15,15 +16,14 @@ import java.util.Collections; import java.util.List; import java.util.Random; -import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.*; import static org.mockito.Mockito.*; /** * Unit tests for the CanonicalSerializer class. - * These tests verify the canonicalization of blank nodes, the sorting of statements, - * and the correct serialization of the RDF model. + * These tests verify that the serializer correctly delegates to an RDFC-1.0 canonicalization + * component and formats the resulting canonical statements. 
*/ class CanonicalSerializerTest { @@ -31,16 +31,43 @@ class CanonicalSerializerTest { private Model mockModel; @Mock private ValueFactory mockValueFactory; + @Mock + private Rdfc10Canonicalizer mockCanonicalizer; + @Mock + private BNode mockBNodeE0; + @Mock + private BNode mockBNodeE1; + @Mock + private BNode mockBNodeE2; + @Mock + private BNode mockBNodeE3; + @Mock - private BNode mockBNode1; + private BNode canonicalBNodeC0; + @Mock + private BNode canonicalBNodeC1; @Mock - private BNode mockBNode2; + private BNode canonicalBNodeC2; + @Mock + private BNode canonicalBNodeC3; @Mock - private BNode canonicalBNode1; + private BNode actualBNodeB0; + @Mock + private BNode actualBNodeB1; @Mock - private BNode canonicalBNode2; + private BNode actualBNodeB2; + @Mock + private BNode actualBNodeB3; + + + @Mock + private IRI mockIRIP; + @Mock + private IRI mockIRIQ; + @Mock + private IRI mockIRIR; @Mock private IRI mockIRI1; @@ -51,6 +78,7 @@ class CanonicalSerializerTest { @Mock private Literal mockLiteral2; + private CanonicalSerializer serializer; private CanonicalOption defaultConfig; @@ -61,11 +89,9 @@ void setUp() { setupBasicMocks(); - - serializer = new CanonicalSerializer(mockModel, defaultConfig, mockValueFactory) { + serializer = new CanonicalSerializer(mockModel, defaultConfig, mockValueFactory, mockCanonicalizer) { @Override protected void writeValue(Writer w, Value v) throws IOException { - if (v != null) { w.write(v.stringValue()); } @@ -78,26 +104,52 @@ protected void writeValue(Writer w, Value v) throws IOException { * This ensures consistency across tests. 
*/ private void setupBasicMocks() { + when(mockIRI1.stringValue()).thenReturn(""); when(mockIRI2.stringValue()).thenReturn(""); + when(mockIRI1.isBNode()).thenReturn(false); + when(mockIRI2.isBNode()).thenReturn(false); - when(mockLiteral1.stringValue()).thenReturn("\"literal1\""); - when(mockLiteral2.stringValue()).thenReturn("\"literal2\""); - - when(mockBNode1.stringValue()).thenReturn("_:originalBNode1"); - when(mockBNode2.stringValue()).thenReturn("_:originalBNode2"); + when(mockIRIP.stringValue()).thenReturn(""); + when(mockIRIQ.stringValue()).thenReturn(""); + when(mockIRIR.stringValue()).thenReturn(""); + when(mockIRIP.isBNode()).thenReturn(false); + when(mockIRIQ.isBNode()).thenReturn(false); + when(mockIRIR.isBNode()).thenReturn(false); - when(canonicalBNode1.stringValue()).thenReturn("_:b0"); - when(canonicalBNode2.stringValue()).thenReturn("_:b1"); - when(mockIRI1.isBNode()).thenReturn(false); - when(mockIRI2.isBNode()).thenReturn(false); + when(mockLiteral1.stringValue()).thenReturn("\"literal1\""); + when(mockLiteral2.stringValue()).thenReturn("\"literal2\""); when(mockLiteral1.isBNode()).thenReturn(false); when(mockLiteral2.isBNode()).thenReturn(false); - when(mockBNode1.isBNode()).thenReturn(true); - when(mockBNode2.isBNode()).thenReturn(true); - when(canonicalBNode1.isBNode()).thenReturn(true); - when(canonicalBNode2.isBNode()).thenReturn(true); + + when(mockBNodeE0.stringValue()).thenReturn("_:e0"); + when(mockBNodeE1.stringValue()).thenReturn("_:e1"); + when(mockBNodeE2.stringValue()).thenReturn("_:e2"); + when(mockBNodeE3.stringValue()).thenReturn("_:e3"); + when(mockBNodeE0.isBNode()).thenReturn(true); + when(mockBNodeE1.isBNode()).thenReturn(true); + when(mockBNodeE2.isBNode()).thenReturn(true); + when(mockBNodeE3.isBNode()).thenReturn(true); + + + when(canonicalBNodeC0.stringValue()).thenReturn("_:c14n0"); + when(canonicalBNodeC1.stringValue()).thenReturn("_:c14n1"); + when(canonicalBNodeC2.stringValue()).thenReturn("_:c14n2"); + 
when(canonicalBNodeC3.stringValue()).thenReturn("_:c14n3"); + when(canonicalBNodeC0.isBNode()).thenReturn(true); + when(canonicalBNodeC1.isBNode()).thenReturn(true); + when(canonicalBNodeC2.isBNode()).thenReturn(true); + when(canonicalBNodeC3.isBNode()).thenReturn(true); + + when(actualBNodeB0.stringValue()).thenReturn("_:b0"); + when(actualBNodeB1.stringValue()).thenReturn("_:b1"); + when(actualBNodeB2.stringValue()).thenReturn("_:b2"); + when(actualBNodeB3.stringValue()).thenReturn("_:b3"); + when(actualBNodeB0.isBNode()).thenReturn(true); + when(actualBNodeB1.isBNode()).thenReturn(true); + when(actualBNodeB2.isBNode()).thenReturn(true); + when(actualBNodeB3.isBNode()).thenReturn(true); } /** @@ -129,49 +181,53 @@ private Statement createMockStatement(Resource subject, IRI predicate, Value obj @DisplayName("Constructor with valid parameters should create an instance") void testConstructorWithValidParameters() { assertNotNull(serializer); - assertEquals("Canonical RDF", serializer.getFormatName()); + assertEquals("RDFC-1.0", serializer.getFormatName()); } @Test @DisplayName("Constructor with null model should throw NullPointerException") void testConstructorNullModel() { assertThrows(NullPointerException.class, () -> - new CanonicalSerializer(null, defaultConfig, mockValueFactory)); + new CanonicalSerializer(null, defaultConfig, mockValueFactory, mockCanonicalizer)); } @Test @DisplayName("Constructor with null valueFactory should throw NullPointerException") void testConstructorNullValueFactory() { assertThrows(NullPointerException.class, () -> - new CanonicalSerializer(mockModel, defaultConfig, null)); + new CanonicalSerializer(mockModel, defaultConfig, null, mockCanonicalizer)); } @Test @DisplayName("Constructor with null config should throw NullPointerException") void testConstructorNullConfig() { assertThrows(NullPointerException.class, () -> - new CanonicalSerializer(mockModel, null, mockValueFactory)); + new CanonicalSerializer(mockModel, null, 
mockValueFactory, mockCanonicalizer)); + } + + @Test + @DisplayName("Constructor with null canonicalizer should throw NullPointerException") + void testConstructorNullCanonicalizer() { + assertThrows(NullPointerException.class, () -> + new CanonicalSerializer(mockModel, defaultConfig, mockValueFactory, null)); } @Test @DisplayName("Constructor with default configuration") void testConstructorWithDefaultConfig() { - CanonicalSerializer defaultSerializer = new CanonicalSerializer(mockModel, mockValueFactory); + CanonicalSerializer defaultSerializer = new CanonicalSerializer(mockModel, defaultConfig, mockValueFactory, mockCanonicalizer); assertNotNull(defaultSerializer); - assertEquals("Canonical RDF", defaultSerializer.getFormatName()); + assertEquals("RDFC-1.0", defaultSerializer.getFormatName()); } @Test @DisplayName("Serialization of an empty model") void testSerializeEmptyModel() throws SerializationException { - - doAnswer(invocation -> { - return null; - }).when(mockModel).forEach(any()); + when(mockCanonicalizer.canonicalize(any(Model.class))).thenReturn(Collections.emptyList()); StringWriter writer = new StringWriter(); - serializer.write(writer); + serializer.serialize(writer); assertEquals("", writer.toString()); } @@ -181,69 +237,61 @@ void testSerializeEmptyModel() throws SerializationException { void testSerializeSimpleStatement() throws SerializationException { Statement simpleStmt = createMockStatement(mockIRI1, mockIRI2, mockLiteral1, null); - doAnswer(invocation -> { - ((Consumer) invocation.getArgument(0)).accept(simpleStmt); - return null; - }).when(mockModel).forEach(any()); - - - when(mockValueFactory.createStatement(mockIRI1, mockIRI2, mockLiteral1, null)) - .thenReturn(simpleStmt); + when(mockCanonicalizer.canonicalize(any(Model.class))).thenReturn(Collections.singletonList(simpleStmt)); StringWriter writer = new StringWriter(); - serializer.write(writer); + serializer.serialize(writer); String expectedOutput = " \"literal1\" .\n"; 
assertEquals(expectedOutput, writer.toString()); + + verify(mockCanonicalizer).canonicalize(any(Model.class)); } @Test - @DisplayName("Serialization with blank nodes - canonicalization and output sorting") + @DisplayName("Serialization with blank nodes - W3C canonicalization and output sorting") void testSerializeWithBlankNodesAndOutputVerification() throws SerializationException { - Statement originalStmt1 = createMockStatement(mockBNode1, mockIRI1, mockLiteral1, null); - Statement originalStmt2 = createMockStatement(mockIRI2, mockIRI1, mockBNode2, null); - Statement originalStmt3 = createMockStatement(mockBNode1, mockIRI2, mockIRI1, null); - - List statementsInModel = Arrays.asList(originalStmt3, originalStmt1, originalStmt2); - Collections.shuffle(statementsInModel, new Random(0)); - - doAnswer(invocation -> { - ((Consumer) invocation.getArgument(0)).accept(statementsInModel.get(0)); - ((Consumer) invocation.getArgument(0)).accept(statementsInModel.get(1)); - ((Consumer) invocation.getArgument(0)).accept(statementsInModel.get(2)); - return null; - }).when(mockModel).forEach(any()); - - - when(mockValueFactory.createBNode("b0")).thenReturn(canonicalBNode1); - when(mockValueFactory.createBNode("b1")).thenReturn(canonicalBNode2); + Statement inputStmt1 = createMockStatement(mockIRIP, mockIRIQ, mockBNodeE0, null); + Statement inputStmt2 = createMockStatement(mockIRIP, mockIRIQ, mockBNodeE1, null); + Statement inputStmt3 = createMockStatement(mockBNodeE0, mockIRIP, mockBNodeE2, null); + Statement inputStmt4 = createMockStatement(mockBNodeE1, mockIRIP, mockBNodeE3, null); + Statement inputStmt5 = createMockStatement(mockBNodeE2, mockIRIR, mockBNodeE3, null); + List originalStatementsFromModel = Arrays.asList(inputStmt1, inputStmt2, inputStmt3, inputStmt4, inputStmt5); + Collections.shuffle(originalStatementsFromModel, new Random(0)); - Statement canonicalStmtA = createMockStatement(canonicalBNode1, mockIRI1, mockLiteral1, null); - Statement canonicalStmtB = 
createMockStatement(mockIRI2, mockIRI1, canonicalBNode2, null); - Statement canonicalStmtC = createMockStatement(canonicalBNode1, mockIRI2, mockIRI1, null); + Statement canonicalOutputStmt1 = createMockStatement(mockIRIP, mockIRIQ, canonicalBNodeC2, null); + Statement canonicalOutputStmt2 = createMockStatement(mockIRIP, mockIRIQ, canonicalBNodeC3, null); + Statement canonicalOutputStmt3 = createMockStatement(canonicalBNodeC0, mockIRIR, canonicalBNodeC1, null); + Statement canonicalOutputStmt4 = createMockStatement(canonicalBNodeC2, mockIRIP, canonicalBNodeC1, null); + Statement canonicalOutputStmt5 = createMockStatement(canonicalBNodeC3, mockIRIP, canonicalBNodeC0, null); + List expectedCanonicalStatementsSorted = Arrays.asList( + canonicalOutputStmt1, + canonicalOutputStmt2, + canonicalOutputStmt3, + canonicalOutputStmt4, + canonicalOutputStmt5 + ); - when(mockValueFactory.createStatement(any(), any(), any(), any())) - .thenReturn(canonicalStmtB, canonicalStmtA, canonicalStmtC); - + when(mockCanonicalizer.canonicalize(any(Model.class))).thenReturn(expectedCanonicalStatementsSorted); StringWriter writer = new StringWriter(); - serializer.write(writer); + serializer.serialize(writer); String expectedOutput = """ - _:b1 . - _:b0 "literal1" . - _:b0 . + _:c14n2 . + _:c14n3 . + _:c14n0 _:c14n1 . + _:c14n2 _:c14n1 . + _:c14n3 _:c14n0 . 
"""; assertEquals(expectedOutput, writer.toString()); - verify(mockValueFactory).createBNode("b0"); - verify(mockValueFactory).createBNode("b1"); - verify(mockValueFactory, times(3)).createStatement(any(), any(), any(), any()); + verify(mockCanonicalizer).canonicalize(any(Model.class)); } @Test @@ -251,22 +299,15 @@ void testSerializeWithBlankNodesAndOutputVerification() throws SerializationExce void testSerializeWithContext() throws SerializationException { Statement stmtWithContext = createMockStatement(mockIRI1, mockIRI2, mockLiteral1, mockIRI1); - doAnswer(invocation -> { - ((Consumer) invocation.getArgument(0)).accept(stmtWithContext); - return null; - }).when(mockModel).forEach(any()); - - - when(mockValueFactory.createStatement(mockIRI1, mockIRI2, mockLiteral1, mockIRI1)) - .thenReturn(stmtWithContext); - + when(mockCanonicalizer.canonicalize(any(Model.class))).thenReturn(Collections.singletonList(stmtWithContext)); StringWriter writer = new StringWriter(); - - serializer.write(writer); + serializer.serialize(writer); String expectedOutput = " \"literal1\" .\n"; assertEquals(expectedOutput, writer.toString()); + + verify(mockCanonicalizer).canonicalize(any(Model.class)); } @Test @@ -284,12 +325,10 @@ void testWriteContextWithNullContext() throws IOException { @Test @DisplayName("Test writeContext with non-null context") void testWriteContextWithNonNullContext() throws IOException { - StringWriter writer = new StringWriter(); Statement stmt = mock(Statement.class); when(stmt.getContext()).thenReturn(mockIRI1); - serializer.writeContext(writer, stmt); String expectedOutput = " "; @@ -301,30 +340,134 @@ void testWriteContextWithNonNullContext() throws IOException { @DisplayName("Serialization with blank nodes in context - canonicalization and sorting") void testSerializeWithBlankNodeInContextAndOutputVerification() throws SerializationException { - Statement originalStmt1 = createMockStatement(mockIRI1, mockIRI2, mockLiteral1, mockBNode1); + Statement 
canonicalOutputStmt1 = createMockStatement(mockIRI1, mockIRI2, mockLiteral1, canonicalBNodeC0); + + when(mockCanonicalizer.canonicalize(any(Model.class))).thenReturn(Collections.singletonList(canonicalOutputStmt1)); + + StringWriter writer = new StringWriter(); + + serializer.serialize(writer); + + String expectedOutput = " \"literal1\" _:c14n0 .\n"; + assertEquals(expectedOutput, writer.toString()); + + verify(mockCanonicalizer).canonicalize(any(Model.class)); + } + + @Test + void testSerializeW3CExampleWithDifferentActualOutput() throws SerializationException { + Statement inputStmt1 = createMockStatement(mockIRIP, mockIRIQ, mockBNodeE0, null); + Statement inputStmt2 = createMockStatement(mockIRIP, mockIRIQ, mockBNodeE1, null); + Statement inputStmt3 = createMockStatement(mockBNodeE0, mockIRIP, mockBNodeE2, null); + Statement inputStmt4 = createMockStatement(mockBNodeE1, mockIRIP, mockBNodeE3, null); + Statement inputStmt5 = createMockStatement(mockBNodeE2, mockIRIR, mockBNodeE3, null); + + List originalStatementsFromModel = Arrays.asList(inputStmt1, inputStmt2, inputStmt3, inputStmt4, inputStmt5); + Collections.shuffle(originalStatementsFromModel, new Random(0)); + + Statement actualOutputStmt1 = createMockStatement(actualBNodeB0, mockIRIR, actualBNodeB2, null); + Statement actualOutputStmt2 = createMockStatement(actualBNodeB1, mockIRIP, actualBNodeB0, null); + Statement actualOutputStmt3 = createMockStatement(actualBNodeB3, mockIRIP, actualBNodeB2, null); + Statement actualOutputStmt4 = createMockStatement(mockIRIP, mockIRIQ, actualBNodeB1, null); + Statement actualOutputStmt5 = createMockStatement(mockIRIP, mockIRIQ, actualBNodeB3, null); - List statementsInModel = Collections.singletonList(originalStmt1); - doAnswer(invocation -> { - ((Consumer) invocation.getArgument(0)).accept(statementsInModel.get(0)); - return null; - }).when(mockModel).forEach(any()); - when(mockValueFactory.createBNode("b0")).thenReturn(canonicalBNode1); + List 
actualCanonicalStatementsSorted = Arrays.asList( + actualOutputStmt1, + actualOutputStmt2, + actualOutputStmt3, + actualOutputStmt4, + actualOutputStmt5 + ); - Statement canonicalStmt1 = createMockStatement(mockIRI1, mockIRI2, mockLiteral1, canonicalBNode1); + when(mockCanonicalizer.canonicalize(any(Model.class))).thenReturn(actualCanonicalStatementsSorted); + + StringWriter writer = new StringWriter(); + + serializer.serialize(writer); + + String expectedOutput = """ + _:b0 _:b2 . + _:b1 _:b0 . + _:b3 _:b2 . + _:b1 . + _:b3 . + """; + assertEquals(expectedOutput, writer.toString()); + + verify(mockCanonicalizer).canonicalize(any(Model.class)); + } + + @Test + @DisplayName("Serialization without trailing dot") + void testSerializeNoTrailingDot() throws SerializationException { + CanonicalOption noDotConfig = CanonicalOption.builder().trailingDot(false).build(); + CanonicalSerializer noDotSerializer = new CanonicalSerializer(mockModel, noDotConfig, mockValueFactory, mockCanonicalizer) { + @Override + protected void writeValue(Writer w, Value v) throws IOException { + if (v != null) { + w.write(v.stringValue()); + } + } + }; + + Statement simpleStmt = createMockStatement(mockIRI1, mockIRI2, mockLiteral1, null); - when(mockValueFactory.createStatement(any(), any(), any(), any())) - .thenReturn(canonicalStmt1); + when(mockCanonicalizer.canonicalize(any(Model.class))).thenReturn(Collections.singletonList(simpleStmt)); StringWriter writer = new StringWriter(); + noDotSerializer.serialize(writer); - serializer.write(writer); + String expectedOutput = " \"literal1\"\n"; + assertEquals(expectedOutput, writer.toString()); + + verify(mockCanonicalizer).canonicalize(any(Model.class)); + } + + @Test + @DisplayName("Serialization with different line ending") + void testSerializeDifferentLineEnding() throws SerializationException { + CanonicalOption customLineEndingConfig = CanonicalOption.builder().lineEnding("\r\n").build(); + CanonicalSerializer customLineEndingSerializer = 
new CanonicalSerializer(mockModel, customLineEndingConfig, mockValueFactory, mockCanonicalizer) { + @Override + protected void writeValue(Writer w, Value v) throws IOException { + if (v != null) { + w.write(v.stringValue()); + } + } + }; + + Statement simpleStmt = createMockStatement(mockIRI1, mockIRI2, mockLiteral1, null); + when(mockCanonicalizer.canonicalize(any(Model.class))).thenReturn(Collections.singletonList(simpleStmt)); - String expectedOutput = " \"literal1\" _:b0 .\n"; + StringWriter writer = new StringWriter(); + + customLineEndingSerializer.serialize(writer); + + String expectedOutput = " \"literal1\" .\r\n"; assertEquals(expectedOutput, writer.toString()); + } + + @Test + @DisplayName("Serialization with a mix of statements (with and without context)") + void testSerializeMixedStatements() throws SerializationException { + Statement stmt1 = createMockStatement(mockIRI1, mockIRIP, mockLiteral1, null); + Statement stmt2 = createMockStatement(mockIRI2, mockIRIQ, mockLiteral2, mockIRI1); + Statement stmt3 = createMockStatement(mockIRI1, mockIRIR, mockLiteral2, null); + + List mixedStatements = Arrays.asList(stmt1, stmt2, stmt3); + when(mockCanonicalizer.canonicalize(any(Model.class))).thenReturn(mixedStatements); - verify(mockValueFactory).createBNode("b0"); - verify(mockValueFactory, times(1)).createStatement(any(), any(), any(), any()); + StringWriter writer = new StringWriter(); + serializer.serialize(writer); + + String expectedOutput = """ + "literal1" . + "literal2" . + "literal2" . 
+ """; + assertEquals(expectedOutput, writer.toString()); } + } From d08f2dc4fd6c1e071ac3386db5a1517d4337ec3a Mon Sep 17 00:00:00 2001 From: "AD\\aabdoun" Date: Mon, 8 Sep 2025 10:34:20 +0200 Subject: [PATCH 4/6] #187 Implement RDF canonicalization as serializer --- .../DefaultSerializerFactory.java | 8 +- .../CanonicalOption.java | 20 +- .../canonical/CanonicalSerializer.java | 32 +- .../canonical/Rdfc10Canonicalizer.java | 448 +++++++++++++++++- .../canonical/Rdfc10CanonicalizerImpl.java | 354 -------------- .../util/SerializationConstants.java | 1 - .../io/serialization/util/StatementUtils.java | 2 +- .../canonical/CanonicalOptionTest.java | 1 - .../canonical/CanonicalSerializerTest.java | 68 ++- 9 files changed, 531 insertions(+), 403 deletions(-) rename src/main/java/fr/inria/corese/core/next/impl/io/serialization/{option => canonical}/CanonicalOption.java (79%) delete mode 100644 src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10CanonicalizerImpl.java diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactory.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactory.java index 75ab94cc8..511c45830 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactory.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactory.java @@ -6,10 +6,9 @@ import fr.inria.corese.core.next.api.io.serialization.RDFSerializer; import fr.inria.corese.core.next.api.io.serialization.SerializationOption; import fr.inria.corese.core.next.api.io.serialization.SerializerFactory; -import fr.inria.corese.core.next.impl.io.serialization.option.CanonicalOption; +import fr.inria.corese.core.next.impl.io.serialization.canonical.CanonicalOption; import fr.inria.corese.core.next.impl.io.serialization.canonical.CanonicalSerializer; import fr.inria.corese.core.next.impl.io.serialization.canonical.Rdfc10Canonicalizer; 
-import fr.inria.corese.core.next.impl.io.serialization.canonical.Rdfc10CanonicalizerImpl; import fr.inria.corese.core.next.impl.io.serialization.nquads.NQuadsOption; import fr.inria.corese.core.next.impl.io.serialization.nquads.NQuadsSerializer; import fr.inria.corese.core.next.impl.io.serialization.ntriples.NTriplesOption; @@ -112,7 +111,7 @@ public DefaultSerializerFactory() { tempRegistry.put(RDFFormat.RDFC_1_0, (model, genericConfig) -> { if (genericConfig instanceof CanonicalOption specificConfig) { - Rdfc10Canonicalizer canonicalizer = new Rdfc10CanonicalizerImpl( + Rdfc10Canonicalizer canonicalizer = new Rdfc10Canonicalizer( specificConfig.getHashAlgorithm(), specificConfig.getPermutationLimit(), coreseValueFactory @@ -122,7 +121,7 @@ public DefaultSerializerFactory() { logger.warn("Provided config for RDFC_1_0 is not CanonicalOption (was {}). Using default CanonicalOption.", genericConfig != null ? genericConfig.getClass().getSimpleName() : "null"); CanonicalOption defaultConfig = CanonicalOption.defaultConfig(); - Rdfc10Canonicalizer canonicalizer = new Rdfc10CanonicalizerImpl( + Rdfc10Canonicalizer canonicalizer = new Rdfc10Canonicalizer( defaultConfig.getHashAlgorithm(), defaultConfig.getPermutationLimit(), coreseValueFactory @@ -131,6 +130,7 @@ public DefaultSerializerFactory() { } }); + this.registry = Collections.unmodifiableMap(tempRegistry); } diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/option/CanonicalOption.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOption.java similarity index 79% rename from src/main/java/fr/inria/corese/core/next/impl/io/serialization/option/CanonicalOption.java rename to src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOption.java index 12cd3f8b5..02b57e2d5 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/option/CanonicalOption.java +++ 
b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOption.java @@ -1,9 +1,11 @@ -package fr.inria.corese.core.next.impl.io.serialization.option; +package fr.inria.corese.core.next.impl.io.serialization.canonical; + +import fr.inria.corese.core.next.impl.io.serialization.option.AbstractSerializerOption; /** * Configuration for Canonical RDF serialization format (RDFC-1.0). * This class extends {@link AbstractSerializerOption} and provides specific defaults - * and options tailored for the RDFC-1.0 canonicalization algorithm. + * and options tailored for the RDFC-1.0 canonicalization algorithm. * It includes options relevant to blank node canonicalization, such as the hashing algorithm * to use, the depth factor for graph isomorphism, and the permutation limit. * Use the {@link Builder} class to create instances of {@code CanonicalOption}. @@ -37,6 +39,16 @@ public HashAlgorithm getHashAlgorithm() { return hashAlgorithm; } + /** + * Gets the depth factor for graph isomorphism resolution. + * This value is used to limit the depth of the recursive hashing algorithm. + * + * @return The depth factor. + */ + public int getDepthFactor() { + return depthFactor; + } + public int getPermutationLimit() { return permutationLimit; @@ -53,7 +65,7 @@ public static class Builder extends AbstractSerializerOption.AbstractBuilder * This implementation now acts as a wrapper, preparing the model for a dedicated * RDFC-1.0 canonicalization component and then writing the resulting canonical statements. */ public class CanonicalSerializer extends AbstractLineBasedSerializer implements RDFSerializer { - private final ValueFactory valueFactory; private final CanonicalOption config; private final Rdfc10Canonicalizer canonicalizer; private final Model model; @@ -36,15 +34,14 @@ public class CanonicalSerializer extends AbstractLineBasedSerializer implements * Constructs a new CanonicalSerializer.
* This constructor is now adapted to be used by the DefaultSerializerFactory. * - * @param model The model to be serialized. - * @param config The configuration options for the canonicalization process. - * @param valueFactory The factory for creating RDF values. - * @param canonicalizer The canonicalizer component to use. + * @param model The model to be serialized. + * @param config The configuration options for the canonicalization process. + * @param valueFactory The factory for creating RDF values. + * @param canonicalizer The canonicalizer component to use. */ public CanonicalSerializer(Model model, CanonicalOption config, ValueFactory valueFactory, Rdfc10Canonicalizer canonicalizer) { super(model, config); this.model = Objects.requireNonNull(model); - this.valueFactory = Objects.requireNonNull(valueFactory); this.config = Objects.requireNonNull(config); this.canonicalizer = Objects.requireNonNull(canonicalizer); } @@ -54,7 +51,15 @@ public String getFormatName() { return "RDFC-1.0"; } - public void serialize(Writer writer) { + /** + * Serializes the model into the specified writer using the RDFC-1.0 canonical format. + * The model is first canonicalized by the internal canonicalizer component, and then + * the resulting statements are written line by line to the writer. + * + * @param writer the {@link Writer} to which the serialized model will be written. + * @throws SerializationException if serialization fails due to an I/O error or invalid data. + */ + public void write(Writer writer) { try (BufferedWriter bufferedWriter = new BufferedWriter(writer)) { List canonicalStatements = canonicalizer.canonicalize(model); @@ -69,6 +74,13 @@ public void serialize(Writer writer) { } } + /** + * Writes the context (graph URI) of a statement to the writer. + * + * @param writer the {@link BufferedWriter} to which the context will be written. + * @param stmt the statement whose context will be written. + * @throws IOException if an I/O error occurs. 
+ */ @Override protected void writeContext(Writer writer, Statement stmt) throws IOException { Resource context = stmt.getContext(); @@ -105,4 +117,4 @@ private void writeCanonicalStatement(Writer writer, Statement stmt) throws IOExc } -} \ No newline at end of file +} diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10Canonicalizer.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10Canonicalizer.java index f538f5476..a8caeb858 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10Canonicalizer.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10Canonicalizer.java @@ -1,24 +1,448 @@ package fr.inria.corese.core.next.impl.io.serialization.canonical; -import fr.inria.corese.core.next.api.Statement; import fr.inria.corese.core.next.api.Model; +import fr.inria.corese.core.next.api.Statement; +import fr.inria.corese.core.next.api.ValueFactory; +import fr.inria.corese.core.next.impl.exception.SerializationException; +import fr.inria.corese.core.next.impl.io.serialization.util.SerializationConstants; +import fr.inria.corese.core.next.impl.io.serialization.util.StatementUtils; -import java.util.List; +import java.nio.charset.StandardCharsets; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.*; +import java.util.stream.Stream; /** - * Interface for a component that performs RDFC-1.0 canonicalization. - * This component is responsible for re-labeling blank nodes and sorting statements - * according to the RDFC-1.0 specification. + * Implementation of the RDFC-1.0 canonicalization algorithm as specified by W3C. + * This class is responsible for deterministically re-labeling blank nodes and + * sorting all RDF statements to produce a canonical representation of a dataset. 
*/ -public interface Rdfc10Canonicalizer { +public class Rdfc10Canonicalizer { + + private final CanonicalOption.HashAlgorithm hashAlgorithm; + private final int maxCallsHashNDegreeQuads; + private final StatementUtils statementUtils; + private int callsHashNDegreeQuads = 0; + /** - * Canonicalizes a stream of RDF statements from a given model. - * The implementation will handle all steps of the RDFC-10 algorithm, - * including dataset normalization, blank node identification, and - * deterministic sorting. + * Constructs a new Rdfc10Canonicalizer. + * + * @param hashAlgorithm The hashing algorithm to use, e.g., SHA-256 or SHA-384. + * @param maxCalls The maximum number of recursive calls to the Hash N-Degree Quads algorithm + * to prevent infinite loops on complex cyclic graphs. + * @param valueFactory The factory for creating RDF values, used by StatementUtils. + */ + public Rdfc10Canonicalizer(CanonicalOption.HashAlgorithm hashAlgorithm, int maxCalls, ValueFactory valueFactory) { + this.hashAlgorithm = hashAlgorithm; + this.maxCallsHashNDegreeQuads = maxCalls; + this.statementUtils = new StatementUtils(valueFactory); + } + + + /** + * Canonicalizes all statements within a given {@link Model}. + * This is the main entry point for the canonicalization process. * * @param model The input model to canonicalize. * @return A list of canonicalized and sorted statements. */ - List canonicalize(Model model); -} \ No newline at end of file + public List canonicalize(Model model) { + return canonicalize(model.stream()); + } + + + /** + * Internal canonicalization method that processes a stream of statements. + * This method handles all the steps of the RDFC-1.0 algorithm, including: + *
<ol> + * <li>Creating a map of blank nodes to their associated quads.</li> + * <li>Generating a canonical replacement map for blank nodes using + * the Hash First Degree and Hash N-Degree Quads algorithms.</li> + * <li>Replacing the blank nodes in the statements.</li> + * <li>Sorting the final list of statements.</li> + * </ol>
+ * @param statements A stream of statements to canonicalize. + * @return A list of canonicalized and sorted statements. + */ + private List canonicalize(Stream statements) { + List stmtList = statements.toList(); + + callsHashNDegreeQuads = 0; + + Map> blankNodeToQuads = createBNodeToQuadsMap(stmtList); + + if (blankNodeToQuads.isEmpty()) { + return stmtList.stream() + .sorted((s1, s2) -> StatementUtils.toNQuad(s1).compareTo(StatementUtils.toNQuad(s2))) + .toList(); + } + + Map canonicalReplacementMap = createCanonicalMap(blankNodeToQuads); + + return replaceBlankNodesAndSort(stmtList, canonicalReplacementMap); + } + + /** + * Creates a map where each blank node identifier is a key, and its value is a set + * of all statements (quads) in which the blank node appears as a subject, object, or graph URI. + * + * @param statements The list of statements to process. + * @return A map linking blank node identifiers to their associated quads. + */ + private Map> createBNodeToQuadsMap(List statements) { + Map> blankNodeToQuads = new HashMap<>(); + + for (Statement stmt : statements) { + if (stmt == null) continue; + + if (StatementUtils.isBlankNode(stmt.getSubject())) { + String blankNodeId = StatementUtils.getBlankNodeId(stmt.getSubject()); + blankNodeToQuads.computeIfAbsent(blankNodeId, k -> new HashSet<>()).add(stmt); + } + + if (StatementUtils.isBlankNode(stmt.getObject())) { + String blankNodeId = StatementUtils.getBlankNodeId(stmt.getObject()); + blankNodeToQuads.computeIfAbsent(blankNodeId, k -> new HashSet<>()).add(stmt); + } + + if (stmt.getContext() != null && StatementUtils.isBlankNode(stmt.getContext())) { + String blankNodeId = StatementUtils.getBlankNodeId(stmt.getContext()); + blankNodeToQuads.computeIfAbsent(blankNodeId, k -> new HashSet<>()).add(stmt); + } + } + + return blankNodeToQuads; + } + + /** + * Performs the core canonicalization logic to create a map of blank node + * replacements. 
This method uses the "Hash First Degree Quads" and "Hash N-Degree Quads" + * algorithms to determine a canonical identifier for each blank node. + * + * @param blankNodeToQuads A map of blank nodes to their associated quads. + * @return A map from old blank node identifiers to new canonical ones. + */ + private Map createCanonicalMap(Map> blankNodeToQuads) { + Map canonicalIssuer = new HashMap<>(); + int canonicalCounter = 0; + + Map> hashToBlankNodes = new HashMap<>(); + + for (String blankNode : blankNodeToQuads.keySet()) { + String hash = hashFirstDegreeQuads(blankNode, blankNodeToQuads); + hashToBlankNodes.computeIfAbsent(hash, k -> new HashSet<>()).add(blankNode); + } + + List sortedHashes = new ArrayList<>(hashToBlankNodes.keySet()); + Collections.sort(sortedHashes); + + for (String hash : sortedHashes) { + Set blankNodes = hashToBlankNodes.get(hash); + + if (blankNodes.size() == 1) { + String blankNode = blankNodes.iterator().next(); + canonicalIssuer.put(blankNode, SerializationConstants.C14N + canonicalCounter++); + } else { + Map nDegreeHashes = new HashMap<>(); + + for (String blankNode : blankNodes) { + if (!canonicalIssuer.containsKey(blankNode)) { + TemporaryIssuer temporaryIssuer = new TemporaryIssuer(); + String nDegreeHash = hashNDegreQuads(blankNode, blankNodeToQuads, canonicalIssuer, temporaryIssuer); + nDegreeHashes.put(blankNode, nDegreeHash); + } + } + + List> sortedEntries = nDegreeHashes.entrySet().stream() + .sorted(Map.Entry.comparingByValue()) + .toList(); + + for (Map.Entry entry : sortedEntries) { + if (!canonicalIssuer.containsKey(entry.getKey())) { + canonicalIssuer.put(entry.getKey(), SerializationConstants.C14N + canonicalCounter++); + } + } + } + } + + return canonicalIssuer; + } + + /** + * Implements the "Hash First Degree Quads" algorithm from the RDFC-1.0 specification. 
+ * This method computes a hash for a blank node based on the canonical representation + * of all quads in which it appears, replacing the blank node itself with a placeholder. + * + * @param blankNode The blank node identifier to hash. + * @param blankNodeToQuads The map of blank nodes to their associated quads. + * @return A string representing the hash. + */ + private String hashFirstDegreeQuads(String blankNode, Map> blankNodeToQuads) { + Set quads = blankNodeToQuads.get(blankNode); + List nquads = new ArrayList<>(); + + for (Statement quad : quads) { + String nquad = quadToNQuad(quad, blankNode, SerializationConstants.CANONICAL_BNODE_PLACEHOLDER); + nquads.add(nquad); + } + + Collections.sort(nquads); + String toHash = String.join(SerializationConstants.EMPTY_STRING, nquads); + return hash(toHash); + } + + /** + * Implements the "Hash N-Degree Quads" algorithm from the RDFC-1.0 specification. + * This is a recursive algorithm that resolves permutations of blank nodes with + * the same first-degree hash by considering their related blank nodes and recursively + * hashing their graph contexts. + * + * @param identifier The blank node identifier to hash. + * @param blankNodeToQuads The map of blank nodes to their associated quads. + * @param canonicalIssuer The map of blank nodes that have already been assigned a canonical ID. + * @param issuer The temporary identifier issuer for the current permutation path. + * @return A string representing the hash. + * @throws SerializationException if the maximum number of recursive calls is exceeded. 
+ */ + private String hashNDegreQuads(String identifier, Map> blankNodeToQuads, + Map canonicalIssuer, TemporaryIssuer issuer) { + + if (++callsHashNDegreeQuads > maxCallsHashNDegreeQuads) { + throw new SerializationException("Maximum calls to Hash N-Degree Quads exceeded: " + maxCallsHashNDegreeQuads, "Rdfc10Canonicalizer"); + } + + Map> hashToRelatedBlankNodes = new HashMap<>(); + Set quads = blankNodeToQuads.get(identifier); + + for (Statement quad : quads) { + Set relatedBlankNodes = getRelatedBlankNodes(quad, identifier); + + for (String relatedBlankNode : relatedBlankNodes) { + String hash; + if (canonicalIssuer.containsKey(relatedBlankNode)) { + hash = canonicalIssuer.get(relatedBlankNode); + } else if (issuer.hasIssued(relatedBlankNode)) { + hash = issuer.issue(relatedBlankNode); + } else { + hash = hashFirstDegreeQuads(relatedBlankNode, blankNodeToQuads); + } + hashToRelatedBlankNodes.computeIfAbsent(hash, k -> new HashSet<>()).add(relatedBlankNode); + } + } + + StringBuilder dataToHash = new StringBuilder(); + List sortedHashes = new ArrayList<>(hashToRelatedBlankNodes.keySet()); + Collections.sort(sortedHashes); + + for (String hash : sortedHashes) { + dataToHash.append(hash); + Set blankNodeList = hashToRelatedBlankNodes.get(hash); + + if (blankNodeList.size() > 1) { + List hashPathList = new ArrayList<>(); + + for (String relatedBlankNode : blankNodeList) { + if (canonicalIssuer.containsKey(relatedBlankNode)) { + hashPathList.add(canonicalIssuer.get(relatedBlankNode)); + } else { + TemporaryIssuer tempIssuer = issuer.copy(); + tempIssuer.issue(relatedBlankNode); + String hashPath = hashNDegreQuads(relatedBlankNode, blankNodeToQuads, canonicalIssuer, tempIssuer); + hashPathList.add(hashPath); + } + } + + Collections.sort(hashPathList); + dataToHash.append(String.join(SerializationConstants.EMPTY_STRING, hashPathList)); + } else { + String blankNode = blankNodeList.iterator().next(); + if (canonicalIssuer.containsKey(blankNode)) { + 
dataToHash.append(canonicalIssuer.get(blankNode)); + } else { + dataToHash.append(issuer.issue(blankNode)); + } + } + } + + return hash(dataToHash.toString()); + } + + + /** + * Converts a single quad to a canonical N-Quad string representation for hashing, + * replacing the specified blank node with a placeholder. + * + * @param quad The statement to convert. + * @param blankNode The blank node to replace. + * @param replacement The placeholder string to use for the blank node. + * @return A canonical N-Quad string. + */ + private String quadToNQuad(Statement quad, String blankNode, String replacement) { + StringBuilder sb = new StringBuilder(); + + if (StatementUtils.isBlankNode(quad.getSubject()) && StatementUtils.getBlankNodeId(quad.getSubject()).equals(blankNode)) { + sb.append(replacement); + } else { + sb.append(StatementUtils.serializeForComparison(quad.getSubject())); + } + sb.append(SerializationConstants.SPACE); + + sb.append(StatementUtils.serializeForComparison(quad.getPredicate())); + sb.append(SerializationConstants.SPACE); + + if (StatementUtils.isBlankNode(quad.getObject()) && StatementUtils.getBlankNodeId(quad.getObject()).equals(blankNode)) { + sb.append(replacement); + } else { + sb.append(StatementUtils.serializeForComparison(quad.getObject())); + } + + if (quad.getContext() != null) { + sb.append(SerializationConstants.SPACE); + if (StatementUtils.isBlankNode(quad.getContext()) && StatementUtils.getBlankNodeId(quad.getContext()).equals(blankNode)) { + sb.append(replacement); + } else { + sb.append(StatementUtils.serializeForComparison(quad.getContext())); + } + } + + sb.append(SerializationConstants.SPACE).append(SerializationConstants.POINT); + return sb.toString(); + } + + /** + * Finds all blank nodes in a given quad that are related to (but not the same as) + * the excluded blank node. + * + * @param quad The quad to inspect. + * @param excludeBlankNode The blank node to exclude from the results. 
+ * @return A set of blank node identifiers related to the excluded blank node. + */ + private Set getRelatedBlankNodes(Statement quad, String excludeBlankNode) { + Set relatedBlankNodes = new HashSet<>(); + + if (StatementUtils.isBlankNode(quad.getSubject())) { + String id = StatementUtils.getBlankNodeId(quad.getSubject()); + if (!id.equals(excludeBlankNode)) { + relatedBlankNodes.add(id); + } + } + + if (StatementUtils.isBlankNode(quad.getObject())) { + String id = StatementUtils.getBlankNodeId(quad.getObject()); + if (!id.equals(excludeBlankNode)) { + relatedBlankNodes.add(id); + } + } + + if (quad.getContext() != null && StatementUtils.isBlankNode(quad.getContext())) { + String id = StatementUtils.getBlankNodeId(quad.getContext()); + if (!id.equals(excludeBlankNode)) { + relatedBlankNodes.add(id); + } + } + + return relatedBlankNodes; + } + + /** + * Replaces the old blank node identifiers in a list of statements with their new + * canonical identifiers and then sorts the resulting statements. + * + * @param statements The list of statements to process. + * @param replacementMap The map from old blank node IDs to new canonical IDs. + * @return A sorted list of statements with canonical blank node IDs. + */ + private List replaceBlankNodesAndSort(List statements, Map replacementMap) { + return statements.stream() + .map(stmt -> { + Statement replaced = statementUtils.replaceBlankNodes(stmt, replacementMap); + if (replaced == null) { + throw new IllegalStateException("Failed to replace blank nodes in statement: " + stmt); + } + return replaced; + }) + .sorted(Comparator.comparing(StatementUtils::toNQuad)) + .toList(); + } + + /** + * Computes a cryptographic hash of the given data string using the configured + * hash algorithm (SHA-256 or SHA-384). + * + * @param data The data string to hash. + * @return A hexadecimal string representation of the hash. + * @throws SerializationException if the hash algorithm is not available or if hashing fails. 
+ */ + private String hash(String data) { + try { + String algorithm = hashAlgorithm == CanonicalOption.HashAlgorithm.SHA_384 ? SerializationConstants.SHA_384 : SerializationConstants.SHA_256; + MessageDigest digest = MessageDigest.getInstance(algorithm); + byte[] hash = digest.digest(data.getBytes(StandardCharsets.UTF_8)); + return bytesToHex(hash); + } catch (NoSuchAlgorithmException e) { + throw new SerializationException("Hash algorithm not available: " + e.getMessage(), "Rdfc10Canonicalizer", e); + } catch (Exception e) { + throw new SerializationException("Hash computation failed for data: " + data, "Rdfc10Canonicalizer", e); + } + } + + /** + * Converts a byte array to its hexadecimal string representation. + * + * @param bytes The byte array to convert. + * @return A hexadecimal string. + */ + private String bytesToHex(byte[] bytes) { + StringBuilder result = new StringBuilder(); + for (byte b : bytes) { + result.append(String.format(SerializationConstants.HEX_FORMAT, b)); + } + return result.toString(); + } + + /** + * Helper class for temporary identifier issuing during canonicalization. + * This is used during the recursive "Hash N-Degree Quads" algorithm to + * assign unique, temporary blank node identifiers within a single path. + */ + private static class TemporaryIssuer { + private Map issued = new HashMap<>(); + private int counter = 0; + + /** + * Issues a new temporary identifier for the given blank node identifier. + * If an identifier has already been issued for this blank node, it returns the existing one. + * + * @param identifier The blank node identifier to issue an ID for. + * @return A temporary canonical identifier. + */ + public String issue(String identifier) { + if (!issued.containsKey(identifier)) { + issued.put(identifier, SerializationConstants.CANONICAL_BNODE_PREFIX + counter++); + } + return issued.get(identifier); + } + + /** + * Checks if a temporary identifier has already been issued for the given blank node. 
+ * @param identifier The blank node identifier to check. + * @return {@code true} if an identifier has been issued, {@code false} otherwise. + */ + public boolean hasIssued(String identifier) { + return issued.containsKey(identifier); + } + + /** + * Creates a copy of the current TemporaryIssuer instance. This is crucial for + * the recursive hashing algorithm to explore different permutation paths independently. + * @return A new instance with the same state. + */ + public TemporaryIssuer copy() { + TemporaryIssuer copy = new TemporaryIssuer(); + copy.issued = new HashMap<>(this.issued); + copy.counter = this.counter; + return copy; + } + } +} diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10CanonicalizerImpl.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10CanonicalizerImpl.java deleted file mode 100644 index 8a29fc2f3..000000000 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10CanonicalizerImpl.java +++ /dev/null @@ -1,354 +0,0 @@ -package fr.inria.corese.core.next.impl.io.serialization.canonical; - -import fr.inria.corese.core.next.api.Model; -import fr.inria.corese.core.next.api.Statement; -import fr.inria.corese.core.next.api.ValueFactory; -import fr.inria.corese.core.next.impl.exception.SerializationException; -import fr.inria.corese.core.next.impl.io.serialization.option.CanonicalOption; -import fr.inria.corese.core.next.impl.io.serialization.util.SerializationConstants; -import fr.inria.corese.core.next.impl.io.serialization.util.StatementUtils; - -import java.nio.charset.StandardCharsets; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; -import java.util.*; -import java.util.stream.Stream; - -/** - * Implementation of the RDFC-1.0 canonicalization algorithm as specified by W3C. 
- */ -public class Rdfc10CanonicalizerImpl implements Rdfc10Canonicalizer { - - private final CanonicalOption.HashAlgorithm hashAlgorithm; - private final int maxCallsHashNDegreeQuads; - private final ValueFactory valueFactory; - private final StatementUtils statementUtils; - private int callsHashNDegreeQuads = 0; - - public Rdfc10CanonicalizerImpl(CanonicalOption.HashAlgorithm hashAlgorithm, int maxCalls, ValueFactory valueFactory) { - this.hashAlgorithm = hashAlgorithm; - this.maxCallsHashNDegreeQuads = maxCalls; - this.valueFactory = valueFactory; - this.statementUtils = new StatementUtils(valueFactory); - } - - - @Override - public List canonicalize(Model model) { - return canonicalize(model.stream()); - } - - - /** - * Internal canonicalization method - */ - private List canonicalize(Stream statements) { - List stmtList = statements.toList(); - - callsHashNDegreeQuads = 0; - - Map> blankNodeToQuads = createBNodeToQuadsMap(stmtList); - - if (blankNodeToQuads.isEmpty()) { - return stmtList.stream() - .sorted((s1, s2) -> StatementUtils.toNQuad(s1).compareTo(StatementUtils.toNQuad(s2))) - .toList(); - } - - Map canonicalReplacementMap = createCanonicalMap(blankNodeToQuads); - - return replaceBlankNodesAndSort(stmtList, canonicalReplacementMap); - } - - /** - * Add validation in createBNodeToQuadsMap - */ - private Map> createBNodeToQuadsMap(List statements) { - Map> blankNodeToQuads = new HashMap<>(); - - for (Statement stmt : statements) { - if (stmt == null) continue; - - if (StatementUtils.isBlankNode(stmt.getSubject())) { - String blankNodeId = StatementUtils.getBlankNodeId(stmt.getSubject()); - blankNodeToQuads.computeIfAbsent(blankNodeId, k -> new HashSet<>()).add(stmt); - } - - if (StatementUtils.isBlankNode(stmt.getObject())) { - String blankNodeId = StatementUtils.getBlankNodeId(stmt.getObject()); - blankNodeToQuads.computeIfAbsent(blankNodeId, k -> new HashSet<>()).add(stmt); - } - - if (stmt.getContext() != null && 
StatementUtils.isBlankNode(stmt.getContext())) { - String blankNodeId = StatementUtils.getBlankNodeId(stmt.getContext()); - blankNodeToQuads.computeIfAbsent(blankNodeId, k -> new HashSet<>()).add(stmt); - } - } - - return blankNodeToQuads; - } - - /** - * Create canonical replacement map - */ - private Map createCanonicalMap(Map> blankNodeToQuads) { - Map canonicalIssuer = new HashMap<>(); - int canonicalCounter = 0; - - Map> hashToBlankNodes = new HashMap<>(); - - for (String blankNode : blankNodeToQuads.keySet()) { - String hash = hashFirstDegreeQuads(blankNode, blankNodeToQuads); - hashToBlankNodes.computeIfAbsent(hash, k -> new HashSet<>()).add(blankNode); - } - - List sortedHashes = new ArrayList<>(hashToBlankNodes.keySet()); - Collections.sort(sortedHashes); - - for (String hash : sortedHashes) { - Set blankNodes = hashToBlankNodes.get(hash); - - if (blankNodes.size() == 1) { - String blankNode = blankNodes.iterator().next(); - canonicalIssuer.put(blankNode, SerializationConstants.C14N + canonicalCounter++); - } else { - Map nDegreeHashes = new HashMap<>(); - - for (String blankNode : blankNodes) { - if (!canonicalIssuer.containsKey(blankNode)) { - TemporaryIssuer temporaryIssuer = new TemporaryIssuer(); - String nDegreeHash = hashNDegreQuads(blankNode, blankNodeToQuads, canonicalIssuer, temporaryIssuer); - nDegreeHashes.put(blankNode, nDegreeHash); - } - } - - List> sortedEntries = nDegreeHashes.entrySet().stream() - .sorted(Map.Entry.comparingByValue()) - .toList(); - - for (Map.Entry entry : sortedEntries) { - if (!canonicalIssuer.containsKey(entry.getKey())) { - canonicalIssuer.put(entry.getKey(), SerializationConstants.C14N + canonicalCounter++); - } - } - } - } - - return canonicalIssuer; - } - - /** - * Hash First Degree Quads algorithm - */ - private String hashFirstDegreeQuads(String blankNode, Map> blankNodeToQuads) { - Set quads = blankNodeToQuads.get(blankNode); - List nquads = new ArrayList<>(); - - for (Statement quad : quads) { - String nquad = 
quadToNQuad(quad, blankNode, SerializationConstants.CANONICAL_BNODE_PLACEHOLDER); - nquads.add(nquad); - } - - Collections.sort(nquads); - String toHash = String.join(SerializationConstants.EMPTY_STRING, nquads); - return hash(toHash); - } - - /** - * Hash N-Degree Quads algorithm - */ - private String hashNDegreQuads(String identifier, Map> blankNodeToQuads, - Map canonicalIssuer, TemporaryIssuer issuer) { - - if (++callsHashNDegreeQuads > maxCallsHashNDegreeQuads) { - throw new SerializationException("Maximum calls to Hash N-Degree Quads exceeded: " + maxCallsHashNDegreeQuads, "Rdfc10CanonicalizerImpl"); - } - - Map> hashToRelatedBlankNodes = new HashMap<>(); - Set quads = blankNodeToQuads.get(identifier); - - for (Statement quad : quads) { - Set relatedBlankNodes = getRelatedBlankNodes(quad, identifier); - - for (String relatedBlankNode : relatedBlankNodes) { - String hash; - if (canonicalIssuer.containsKey(relatedBlankNode)) { - hash = canonicalIssuer.get(relatedBlankNode); - } else if (issuer.hasIssued(relatedBlankNode)) { - hash = issuer.issue(relatedBlankNode); - } else { - hash = hashFirstDegreeQuads(relatedBlankNode, blankNodeToQuads); - } - hashToRelatedBlankNodes.computeIfAbsent(hash, k -> new HashSet<>()).add(relatedBlankNode); - } - } - - StringBuilder dataToHash = new StringBuilder(); - List sortedHashes = new ArrayList<>(hashToRelatedBlankNodes.keySet()); - Collections.sort(sortedHashes); - - for (String hash : sortedHashes) { - dataToHash.append(hash); - Set blankNodeList = hashToRelatedBlankNodes.get(hash); - - if (blankNodeList.size() > 1) { - List hashPathList = new ArrayList<>(); - - for (String relatedBlankNode : blankNodeList) { - if (canonicalIssuer.containsKey(relatedBlankNode)) { - hashPathList.add(canonicalIssuer.get(relatedBlankNode)); - } else { - TemporaryIssuer tempIssuer = issuer.copy(); - tempIssuer.issue(relatedBlankNode); - String hashPath = hashNDegreQuads(relatedBlankNode, blankNodeToQuads, canonicalIssuer, tempIssuer); - 
hashPathList.add(hashPath); - } - } - - Collections.sort(hashPathList); - dataToHash.append(String.join(SerializationConstants.EMPTY_STRING, hashPathList)); - } else { - String blankNode = blankNodeList.iterator().next(); - if (canonicalIssuer.containsKey(blankNode)) { - dataToHash.append(canonicalIssuer.get(blankNode)); - } else { - dataToHash.append(issuer.issue(blankNode)); - } - } - } - - return hash(dataToHash.toString()); - } - - - /** - * Convert a quad to N-Quad format for hashing - */ - private String quadToNQuad(Statement quad, String blankNode, String replacement) { - StringBuilder sb = new StringBuilder(); - - if (StatementUtils.isBlankNode(quad.getSubject()) && StatementUtils.getBlankNodeId(quad.getSubject()).equals(blankNode)) { - sb.append(replacement); - } else { - sb.append(StatementUtils.serializeForComparison(quad.getSubject())); - } - sb.append(SerializationConstants.SPACE); - - sb.append(StatementUtils.serializeForComparison(quad.getPredicate())); - sb.append(SerializationConstants.SPACE); - - if (StatementUtils.isBlankNode(quad.getObject()) && StatementUtils.getBlankNodeId(quad.getObject()).equals(blankNode)) { - sb.append(replacement); - } else { - sb.append(StatementUtils.serializeForComparison(quad.getObject())); - } - - if (quad.getContext() != null) { - sb.append(SerializationConstants.SPACE); - if (StatementUtils.isBlankNode(quad.getContext()) && StatementUtils.getBlankNodeId(quad.getContext()).equals(blankNode)) { - sb.append(replacement); - } else { - sb.append(StatementUtils.serializeForComparison(quad.getContext())); - } - } - - sb.append(SerializationConstants.SPACE_POINT); - return sb.toString(); - } - - /** - * Get related blank nodes from a quad - */ - private Set getRelatedBlankNodes(Statement quad, String excludeBlankNode) { - Set relatedBlankNodes = new HashSet<>(); - - if (StatementUtils.isBlankNode(quad.getSubject())) { - String id = StatementUtils.getBlankNodeId(quad.getSubject()); - if (!id.equals(excludeBlankNode)) { - 
relatedBlankNodes.add(id); - } - } - - if (StatementUtils.isBlankNode(quad.getObject())) { - String id = StatementUtils.getBlankNodeId(quad.getObject()); - if (!id.equals(excludeBlankNode)) { - relatedBlankNodes.add(id); - } - } - - if (quad.getContext() != null && StatementUtils.isBlankNode(quad.getContext())) { - String id = StatementUtils.getBlankNodeId(quad.getContext()); - if (!id.equals(excludeBlankNode)) { - relatedBlankNodes.add(id); - } - } - - return relatedBlankNodes; - } - - /** - * Improved blank node replacement with validation - */ - private List replaceBlankNodesAndSort(List statements, Map replacementMap) { - return statements.stream() - .map(stmt -> { - Statement replaced = statementUtils.replaceBlankNodes(stmt, replacementMap); - if (replaced == null) { - throw new IllegalStateException("Failed to replace blank nodes in statement: " + stmt); - } - return replaced; - }) - .sorted(Comparator.comparing(StatementUtils::toNQuad)) - .toList(); - } - - /** - * Utility methods - removed duplicates, now using StatementUtils - */ - private String hash(String data) { - try { - String algorithm = hashAlgorithm == CanonicalOption.HashAlgorithm.SHA_384 ? 
SerializationConstants.SHA_384 : SerializationConstants.SHA_256; - MessageDigest digest = MessageDigest.getInstance(algorithm); - byte[] hash = digest.digest(data.getBytes(StandardCharsets.UTF_8)); - return bytesToHex(hash); - } catch (NoSuchAlgorithmException e) { - throw new SerializationException("Hash algorithm not available: " + e.getMessage(), "Rdfc10CanonicalizerImpl", e); - } catch (Exception e) { - throw new SerializationException("Hash computation failed for data: " + data, "Rdfc10CanonicalizerImpl", e); - } - } - - private String bytesToHex(byte[] bytes) { - StringBuilder result = new StringBuilder(); - for (byte b : bytes) { - result.append(String.format(SerializationConstants.HEX_FORMAT, b)); - } - return result.toString(); - } - - /** - * Helper class for temporary identifier issuing during canonicalization - */ - private static class TemporaryIssuer { - private Map issued = new HashMap<>(); - private int counter = 0; - - public String issue(String identifier) { - if (!issued.containsKey(identifier)) { - issued.put(identifier, SerializationConstants.CANONICAL_BNODE_PREFIX + counter++); - } - return issued.get(identifier); - } - - public boolean hasIssued(String identifier) { - return issued.containsKey(identifier); - } - - public TemporaryIssuer copy() { - TemporaryIssuer copy = new TemporaryIssuer(); - copy.issued = new HashMap<>(this.issued); - copy.counter = this.counter; - return copy; - } - } -} diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/SerializationConstants.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/SerializationConstants.java index d941b2fc6..c3c6f4cd5 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/SerializationConstants.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/SerializationConstants.java @@ -108,7 +108,6 @@ private SerializationConstants() { public static final String DEFAULT_GRAPH_IRI = 
"http://ns.inria.fr/corese/default-graph"; public static final String C14N = "_c14n"; - public static final String SPACE_POINT = " ."; public static final String CANONICAL_BNODE_PLACEHOLDER = "<>"; public static final String HEX_FORMAT = "%02x"; diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java index 6eee9d223..e2f0eee44 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java @@ -111,7 +111,7 @@ public static String toNQuad(Statement statement) { sb.append(SerializationConstants.SPACE).append(serializeForComparison(statement.getContext())); } - sb.append(SerializationConstants.SPACE_POINT); + sb.append(SerializationConstants.SPACE).append(SerializationConstants.POINT); return sb.toString(); } } diff --git a/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOptionTest.java b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOptionTest.java index 80d9892c3..cd128a75a 100644 --- a/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOptionTest.java +++ b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOptionTest.java @@ -1,6 +1,5 @@ package fr.inria.corese.core.next.impl.io.serialization.canonical; -import fr.inria.corese.core.next.impl.io.serialization.option.CanonicalOption; import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Test; diff --git a/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializerTest.java b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializerTest.java index ac78cbe53..423170e3e 100644 --- 
a/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializerTest.java +++ b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializerTest.java @@ -2,7 +2,6 @@ import fr.inria.corese.core.next.api.*; import fr.inria.corese.core.next.impl.exception.SerializationException; -import fr.inria.corese.core.next.impl.io.serialization.option.CanonicalOption; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Test; @@ -191,12 +190,6 @@ void testConstructorNullModel() { new CanonicalSerializer(null, defaultConfig, mockValueFactory, mockCanonicalizer)); } - @Test - @DisplayName("Constructor with null valueFactory should throw NullPointerException") - void testConstructorNullValueFactory() { - assertThrows(NullPointerException.class, () -> - new CanonicalSerializer(mockModel, defaultConfig, null, mockCanonicalizer)); - } @Test @DisplayName("Constructor with null config should throw NullPointerException") @@ -227,7 +220,7 @@ void testSerializeEmptyModel() throws SerializationException { StringWriter writer = new StringWriter(); - serializer.serialize(writer); + serializer.write(writer); assertEquals("", writer.toString()); } @@ -241,7 +234,7 @@ void testSerializeSimpleStatement() throws SerializationException { StringWriter writer = new StringWriter(); - serializer.serialize(writer); + serializer.write(writer); String expectedOutput = " \"literal1\" .\n"; assertEquals(expectedOutput, writer.toString()); @@ -280,7 +273,7 @@ void testSerializeWithBlankNodesAndOutputVerification() throws SerializationExce StringWriter writer = new StringWriter(); - serializer.serialize(writer); + serializer.write(writer); String expectedOutput = """ _:c14n2 . 
@@ -302,7 +295,7 @@ void testSerializeWithContext() throws SerializationException { when(mockCanonicalizer.canonicalize(any(Model.class))).thenReturn(Collections.singletonList(stmtWithContext)); StringWriter writer = new StringWriter(); - serializer.serialize(writer); + serializer.write(writer); String expectedOutput = " \"literal1\" .\n"; assertEquals(expectedOutput, writer.toString()); @@ -346,7 +339,7 @@ void testSerializeWithBlankNodeInContextAndOutputVerification() throws Serializa StringWriter writer = new StringWriter(); - serializer.serialize(writer); + serializer.write(writer); String expectedOutput = " \"literal1\" _:c14n0 .\n"; assertEquals(expectedOutput, writer.toString()); @@ -384,7 +377,7 @@ void testSerializeW3CExampleWithDifferentActualOutput() throws SerializationExce StringWriter writer = new StringWriter(); - serializer.serialize(writer); + serializer.write(writer); String expectedOutput = """ _:b0 _:b2 . @@ -417,7 +410,7 @@ protected void writeValue(Writer w, Value v) throws IOException { StringWriter writer = new StringWriter(); - noDotSerializer.serialize(writer); + noDotSerializer.write(writer); String expectedOutput = " \"literal1\"\n"; assertEquals(expectedOutput, writer.toString()); @@ -443,7 +436,7 @@ protected void writeValue(Writer w, Value v) throws IOException { StringWriter writer = new StringWriter(); - customLineEndingSerializer.serialize(writer); + customLineEndingSerializer.write(writer); String expectedOutput = " \"literal1\" .\r\n"; assertEquals(expectedOutput, writer.toString()); @@ -460,7 +453,7 @@ void testSerializeMixedStatements() throws SerializationException { when(mockCanonicalizer.canonicalize(any(Model.class))).thenReturn(mixedStatements); StringWriter writer = new StringWriter(); - serializer.serialize(writer); + serializer.write(writer); String expectedOutput = """ "literal1" . 
@@ -470,4 +463,47 @@ void testSerializeMixedStatements() throws SerializationException { assertEquals(expectedOutput, writer.toString()); } + @Test + @DisplayName("Serialization of specific N3 input with exact expected output order") + void testSerializeSpecificN3InputWithExactOutputOrder() throws SerializationException { + // Given the specific N3 input: + // @prefix : . + // :p :q _:e0 . + // :p :q _:e1 . + // _:e0 :p _:e2 . + // _:e1 :p _:e3 . + // _:e2 :r _:e3 . + + // Mock the canonicalized output in the EXACT order you expect: + Statement expectedStmt1 = createMockStatement(mockBNodeE0, mockIRIP, mockBNodeE2, null); + Statement expectedStmt2 = createMockStatement(mockBNodeE1, mockIRIP, mockBNodeE3, null); + Statement expectedStmt3 = createMockStatement(mockIRIP, mockIRIQ, mockBNodeE0, null); + Statement expectedStmt4 = createMockStatement(mockIRIP, mockIRIQ, mockBNodeE1, null); + Statement expectedStmt5 = createMockStatement(mockBNodeE2, mockIRIR, mockBNodeE3, null); + + List expectedCanonicalStatements = Arrays.asList( + expectedStmt1, // _:e0 :p _:e2 . + expectedStmt2, // _:e1 :p _:e3 . + expectedStmt3, // :p :q _:e0 . + expectedStmt4, // :p :q _:e1 . + expectedStmt5 // _:e2 :r _:e3 . + ); + + when(mockCanonicalizer.canonicalize(any(Model.class))).thenReturn(expectedCanonicalStatements); + + StringWriter writer = new StringWriter(); + + serializer.write(writer); + + String expectedOutput = """ + _:e0 _:e2 . + _:e1 _:e3 . + _:e0 . + _:e1 . + _:e2 _:e3 . 
+ """; + assertEquals(expectedOutput, writer.toString()); + + verify(mockCanonicalizer).canonicalize(any(Model.class)); + } } From fe7fbd9e71707bed133c1a383f9e121c9a42da8a Mon Sep 17 00:00:00 2001 From: "AD\\aabdoun" Date: Tue, 9 Sep 2025 15:56:59 +0200 Subject: [PATCH 5/6] #187 Implement RDF canonicalization as serializer --- .../DefaultSerializerFactory.java | 4 +- .../canonical/CanonicalOption.java | 42 +- .../canonical/CanonicalSerializer.java | 6 +- .../canonical/Rdfc10Canonicalizer.java | 382 +++++++++--------- .../util/SerializationConstants.java | 2 +- .../io/serialization/util/StatementUtils.java | 185 +++++++-- .../canonical/CanonicalSerializerTest.java | 266 +++--------- src/test/resources/canonical/figure2.ttl | 9 + src/test/resources/canonical/figure3.ttl | 7 + 9 files changed, 463 insertions(+), 440 deletions(-) create mode 100644 src/test/resources/canonical/figure2.ttl create mode 100644 src/test/resources/canonical/figure3.ttl diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactory.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactory.java index 511c45830..6caebca6f 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactory.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactory.java @@ -116,7 +116,7 @@ public DefaultSerializerFactory() { specificConfig.getPermutationLimit(), coreseValueFactory ); - return new CanonicalSerializer(model, specificConfig, coreseValueFactory, canonicalizer); + return new CanonicalSerializer(model, specificConfig, canonicalizer); } else { logger.warn("Provided config for RDFC_1_0 is not CanonicalOption (was {}). Using default CanonicalOption.", genericConfig != null ? 
genericConfig.getClass().getSimpleName() : "null"); @@ -126,7 +126,7 @@ public DefaultSerializerFactory() { defaultConfig.getPermutationLimit(), coreseValueFactory ); - return new CanonicalSerializer(model, defaultConfig, coreseValueFactory, canonicalizer); + return new CanonicalSerializer(model, defaultConfig, canonicalizer); } }); diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOption.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOption.java index 02b57e2d5..92955ac73 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOption.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOption.java @@ -13,6 +13,9 @@ */ public class CanonicalOption extends AbstractSerializerOption { + /** + * Enumeration for the supported hashing algorithms. + */ public enum HashAlgorithm { SHA_256, SHA_384 @@ -24,6 +27,8 @@ public enum HashAlgorithm { /** * Protected constructor to be used by the {@link Builder}. + * It initializes a new instance of {@code CanonicalOption} with the values + * provided by the builder. * * @param builder The builder instance containing the desired configuration values. */ @@ -34,7 +39,11 @@ protected CanonicalOption(Builder builder) { this.permutationLimit = builder.permutationLimit; } - + /** + * Gets the hashing algorithm used for blank node canonicalization. + * + * @return The {@link HashAlgorithm} used. + */ public HashAlgorithm getHashAlgorithm() { return hashAlgorithm; } @@ -49,7 +58,13 @@ public int getDepthFactor() { return depthFactor; } - + /** + * Gets the permutation limit used in the canonicalization algorithm. + * This value is used to limit the number of permutations attempted during blank node canonicalization + * to prevent excessive computation time. + * + * @return The permutation limit. 
+ */ public int getPermutationLimit() { return permutationLimit; } @@ -64,21 +79,40 @@ public static class Builder extends AbstractSerializerOption.AbstractBuilder canonicalStatements = canonicalizer.canonicalize(model); diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10Canonicalizer.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10Canonicalizer.java index a8caeb858..c0956c42f 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10Canonicalizer.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10Canonicalizer.java @@ -15,8 +15,8 @@ /** * Implementation of the RDFC-1.0 canonicalization algorithm as specified by W3C. - * This class is responsible for deterministically re-labeling blank nodes and - * sorting all RDF statements to produce a canonical representation of a dataset. + * This class deterministically re-labels blank nodes and sorts all RDF statements + * to produce a canonical representation of a dataset. */ public class Rdfc10Canonicalizer { @@ -26,69 +26,76 @@ public class Rdfc10Canonicalizer { private int callsHashNDegreeQuads = 0; /** - * Constructs a new Rdfc10Canonicalizer. + * Constructs a new Rdfc10Canonicalizer with specified configuration. * - * @param hashAlgorithm The hashing algorithm to use, e.g., SHA-256 or SHA-384. - * @param maxCalls The maximum number of recursive calls to the Hash N-Degree Quads algorithm + * @param hashAlgorithm The hashing algorithm to use for canonicalization (SHA-256 or SHA-384). + * @param maxCalls The maximum number of recursive calls to the Hash N-Degree Quads algorithm * to prevent infinite loops on complex cyclic graphs. - * @param valueFactory The factory for creating RDF values, used by StatementUtils. + * @param valueFactory The factory for creating RDF values, used by StatementUtils for + * blank node replacement and serialization. 
*/ public Rdfc10Canonicalizer(CanonicalOption.HashAlgorithm hashAlgorithm, int maxCalls, ValueFactory valueFactory) { - this.hashAlgorithm = hashAlgorithm; + this.hashAlgorithm = Objects.requireNonNull(hashAlgorithm, "Hash algorithm cannot be null"); this.maxCallsHashNDegreeQuads = maxCalls; this.statementUtils = new StatementUtils(valueFactory); } - /** - * Canonicalizes all statements within a given {@link Model}. + * Canonicalizes all statements within a given Model. * This is the main entry point for the canonicalization process. + * The process involves: + * 1. Identifying all blank nodes and their associated statements. + * 2. Creating canonical identifiers for blank nodes. + * 3. Replacing original blank node IDs with canonical ones. + * 4. Sorting the resulting statements lexicographically. * - * @param model The input model to canonicalize. - * @return A list of canonicalized and sorted statements. + * @param model The input model to canonicalize. Must not be null. + * @return A list of canonicalized and sorted statements ready for serialization. + * @throws SerializationException if canonicalization fails due to algorithmic constraints + * or invalid input data. */ public List canonicalize(Model model) { + Objects.requireNonNull(model, "Model cannot be null"); return canonicalize(model.stream()); } - /** * Internal canonicalization method that processes a stream of statements. - * This method handles all the steps of the RDFC-1.0 algorithm, including: - *
    - *
  1. Creating a map of blank nodes to their associated quads.
  2. - *
  3. Generating a canonical replacement map for blank nodes using - * the Hash First Degree and Hash N-Degree Quads algorithms.
  4. - *
  5. Replacing the blank nodes in the statements.
  6. - *
  7. Sorting the final list of statements.
  8. - *
+ * This method handles all the steps of the RDFC-1.0 algorithm in sequence. + * * @param statements A stream of statements to canonicalize. * @return A list of canonicalized and sorted statements. */ private List canonicalize(Stream statements) { List stmtList = statements.toList(); + // Reset the recursive call counter for each canonicalization operation callsHashNDegreeQuads = 0; + // Step 1: Create a mapping of blank nodes to their associated statements Map> blankNodeToQuads = createBNodeToQuadsMap(stmtList); + // If no blank nodes are found, simply sort and return the original statements if (blankNodeToQuads.isEmpty()) { return stmtList.stream() - .sorted((s1, s2) -> StatementUtils.toNQuad(s1).compareTo(StatementUtils.toNQuad(s2))) + .sorted(Comparator.comparing(StatementUtils::toNQuad)) .toList(); } + // Step 2: Generate a canonical replacement mapping for blank nodes Map canonicalReplacementMap = createCanonicalMap(blankNodeToQuads); + // Step 3: Apply the replacement and sort the final statements return replaceBlankNodesAndSort(stmtList, canonicalReplacementMap); } /** - * Creates a map where each blank node identifier is a key, and its value is a set - * of all statements (quads) in which the blank node appears as a subject, object, or graph URI. + * Creates a map where each blank node identifier is associated with all statements + * (quads) in which it appears as a subject, object, or graph name. + * This is the foundation for the Hash First Degree Quads algorithm. * * @param statements The list of statements to process. - * @return A map linking blank node identifiers to their associated quads. + * @return A map linking blank node identifiers to their associated statements. */ private Map> createBNodeToQuadsMap(List statements) { Map> blankNodeToQuads = new HashMap<>(); @@ -116,53 +123,69 @@ private Map> createBNodeToQuadsMap(List statem } /** - * Performs the core canonicalization logic to create a map of blank node - * replacements. 
This method uses the "Hash First Degree Quads" and "Hash N-Degree Quads" - * algorithms to determine a canonical identifier for each blank node. + * Performs the core canonicalization logic to create a map of blank node replacements. + * This method implements the main flow of the RDFC-1.0 algorithm. * - * @param blankNodeToQuads A map of blank nodes to their associated quads. - * @return A map from old blank node identifiers to new canonical ones. + * @return A deterministic mapping from original blank node identifiers to canonical ones. */ - private Map createCanonicalMap(Map> blankNodeToQuads) { + private Map createCanonicalMap(Map> bnodeToQuads) { Map canonicalIssuer = new HashMap<>(); - int canonicalCounter = 0; + int counter = 0; - Map> hashToBlankNodes = new HashMap<>(); + // Step 1: Calculate first-degree hashes for all blank nodes + Map firstDegreeHashes = new HashMap<>(); + for (String bnode : bnodeToQuads.keySet()) { + String hash = hashFirstDegreeQuads(bnode, bnodeToQuads); + firstDegreeHashes.put(bnode, hash); + } - for (String blankNode : blankNodeToQuads.keySet()) { - String hash = hashFirstDegreeQuads(blankNode, blankNodeToQuads); - hashToBlankNodes.computeIfAbsent(hash, k -> new HashSet<>()).add(blankNode); + // Step 2: Create hash groups + Map> hashToNodes = new HashMap<>(); + for (String node : bnodeToQuads.keySet()) { + String hash = firstDegreeHashes.get(node); + hashToNodes.computeIfAbsent(hash, k -> new ArrayList<>()).add(node); } - List sortedHashes = new ArrayList<>(hashToBlankNodes.keySet()); - Collections.sort(sortedHashes); + // Step 3: Separate into single-node and multi-node groups + List singleNodeHashes = new ArrayList<>(); + List multiNodeHashes = new ArrayList<>(); + for (Map.Entry> entry : hashToNodes.entrySet()) { + if (entry.getValue().size() == 1) { + singleNodeHashes.add(entry.getKey()); + } else { + multiNodeHashes.add(entry.getKey()); + } + } - for (String hash : sortedHashes) { - Set blankNodes = hashToBlankNodes.get(hash); 
+ // Sort hashes within their groups + Collections.sort(singleNodeHashes); + Collections.sort(multiNodeHashes); - if (blankNodes.size() == 1) { - String blankNode = blankNodes.iterator().next(); - canonicalIssuer.put(blankNode, SerializationConstants.C14N + canonicalCounter++); - } else { - Map nDegreeHashes = new HashMap<>(); - - for (String blankNode : blankNodes) { - if (!canonicalIssuer.containsKey(blankNode)) { - TemporaryIssuer temporaryIssuer = new TemporaryIssuer(); - String nDegreeHash = hashNDegreQuads(blankNode, blankNodeToQuads, canonicalIssuer, temporaryIssuer); - nDegreeHashes.put(blankNode, nDegreeHash); - } - } - - List> sortedEntries = nDegreeHashes.entrySet().stream() - .sorted(Map.Entry.comparingByValue()) - .toList(); - - for (Map.Entry entry : sortedEntries) { - if (!canonicalIssuer.containsKey(entry.getKey())) { - canonicalIssuer.put(entry.getKey(), SerializationConstants.C14N + canonicalCounter++); - } - } + // Step 4: Process single-node groups first + for (String hash : singleNodeHashes) { + String node = hashToNodes.get(hash).get(0); + canonicalIssuer.put(node, SerializationConstants.C14N + counter++); + } + + // Step 5: Process multi-node groups using N-degree hashing + for (String hash : multiNodeHashes) { + List nodes = hashToNodes.get(hash); + + Map nDegreeHashes = new HashMap<>(); + for (String node : nodes) { + TemporaryIssuer tempIssuer = new TemporaryIssuer(); + String nDegreeHash = hashNDegreeQuads(node, bnodeToQuads, canonicalIssuer, tempIssuer); + nDegreeHashes.put(node, nDegreeHash); + } + + nodes.sort((n1, n2) -> { + int cmp = nDegreeHashes.get(n1).compareTo(nDegreeHashes.get(n2)); + if (cmp != 0) return cmp; + return n1.compareTo(n2); + }); + + for (String node : nodes) { + canonicalIssuer.put(node, SerializationConstants.C14N + counter++); } } @@ -171,12 +194,12 @@ private Map createCanonicalMap(Map> blank /** * Implements the "Hash First Degree Quads" algorithm from the RDFC-1.0 specification. 
- * This method computes a hash for a blank node based on the canonical representation - * of all quads in which it appears, replacing the blank node itself with a placeholder. + * It computes a hash for a blank node based on canonical representations of all statements + * in which it appears. It replaces the blank node itself with a standardized placeholder. * - * @param blankNode The blank node identifier to hash. - * @param blankNodeToQuads The map of blank nodes to their associated quads. - * @return A string representing the hash. + * @param blankNode The blank node identifier to hash. + * @param blankNodeToQuads The map of blank nodes to their associated statements. + * @return A cryptographic hash representing the blank node's first-degree context. */ private String hashFirstDegreeQuads(String blankNode, Map> blankNodeToQuads) { Set quads = blankNodeToQuads.get(blankNode); @@ -189,118 +212,109 @@ private String hashFirstDegreeQuads(String blankNode, Map Collections.sort(nquads); String toHash = String.join(SerializationConstants.EMPTY_STRING, nquads); + return hash(toHash); } /** - * Implements the "Hash N-Degree Quads" algorithm from the RDFC-1.0 specification. - * This is a recursive algorithm that resolves permutations of blank nodes with - * the same first-degree hash by considering their related blank nodes and recursively - * hashing their graph contexts. + * Implements the "Hash N-Degree Quads" algorithm for resolving blank node permutations. + * This recursive method handles cases where multiple blank nodes have identical + * first-degree hashes by considering their relationships to other blank nodes. * - * @param identifier The blank node identifier to hash. - * @param blankNodeToQuads The map of blank nodes to their associated quads. - * @param canonicalIssuer The map of blank nodes that have already been assigned a canonical ID. - * @param issuer The temporary identifier issuer for the current permutation path. 
- * @return A string representing the hash. - * @throws SerializationException if the maximum number of recursive calls is exceeded. + * @param identifier The blank node identifier currently being processed. + * @param blankNodeToQuads The map of blank nodes to their associated statements. + * @param canonicalIssuer Map of already-assigned canonical identifiers. + * @param issuer Temporary identifier issuer for the current recursion path. + * @return A hash representing the N-degree context of the blank node. + * @throws SerializationException if the maximum recursion depth is exceeded. */ - private String hashNDegreQuads(String identifier, Map> blankNodeToQuads, - Map canonicalIssuer, TemporaryIssuer issuer) { + private String hashNDegreeQuads(String identifier, Map> blankNodeToQuads, + Map canonicalIssuer, TemporaryIssuer issuer) { if (++callsHashNDegreeQuads > maxCallsHashNDegreeQuads) { - throw new SerializationException("Maximum calls to Hash N-Degree Quads exceeded: " + maxCallsHashNDegreeQuads, "Rdfc10Canonicalizer"); + throw new SerializationException( + "Maximum calls to Hash N-Degree Quads exceeded: " + maxCallsHashNDegreeQuads, + "Rdfc10Canonicalizer" + ); } - Map> hashToRelatedBlankNodes = new HashMap<>(); - Set quads = blankNodeToQuads.get(identifier); - - for (Statement quad : quads) { - Set relatedBlankNodes = getRelatedBlankNodes(quad, identifier); - - for (String relatedBlankNode : relatedBlankNodes) { - String hash; - if (canonicalIssuer.containsKey(relatedBlankNode)) { - hash = canonicalIssuer.get(relatedBlankNode); - } else if (issuer.hasIssued(relatedBlankNode)) { - hash = issuer.issue(relatedBlankNode); - } else { - hash = hashFirstDegreeQuads(relatedBlankNode, blankNodeToQuads); - } - hashToRelatedBlankNodes.computeIfAbsent(hash, k -> new HashSet<>()).add(relatedBlankNode); - } + // Collect all related blank nodes from all quads containing this node + Set relatedBlankNodes = new HashSet<>(); + for (Statement quad : 
blankNodeToQuads.get(identifier)) { + relatedBlankNodes.addAll(getRelatedBlankNodes(quad, identifier)); } - StringBuilder dataToHash = new StringBuilder(); - List sortedHashes = new ArrayList<>(hashToRelatedBlankNodes.keySet()); - Collections.sort(sortedHashes); - - for (String hash : sortedHashes) { - dataToHash.append(hash); - Set blankNodeList = hashToRelatedBlankNodes.get(hash); - - if (blankNodeList.size() > 1) { - List hashPathList = new ArrayList<>(); - - for (String relatedBlankNode : blankNodeList) { - if (canonicalIssuer.containsKey(relatedBlankNode)) { - hashPathList.add(canonicalIssuer.get(relatedBlankNode)); - } else { - TemporaryIssuer tempIssuer = issuer.copy(); - tempIssuer.issue(relatedBlankNode); - String hashPath = hashNDegreQuads(relatedBlankNode, blankNodeToQuads, canonicalIssuer, tempIssuer); - hashPathList.add(hashPath); - } - } - - Collections.sort(hashPathList); - dataToHash.append(String.join(SerializationConstants.EMPTY_STRING, hashPathList)); + // Calculate hashes for each related blank node + List relatedHashes = new ArrayList<>(); + for (String relatedNode : relatedBlankNodes) { + String relatedHash; + + if (canonicalIssuer.containsKey(relatedNode)) { + // Use canonical ID if already assigned + relatedHash = canonicalIssuer.get(relatedNode); + } else if (issuer.hasIssued(relatedNode)) { + // Use temporary ID if already issued + relatedHash = issuer.issue(relatedNode); } else { - String blankNode = blankNodeList.iterator().next(); - if (canonicalIssuer.containsKey(blankNode)) { - dataToHash.append(canonicalIssuer.get(blankNode)); - } else { - dataToHash.append(issuer.issue(blankNode)); - } + // Recursively calculate N-degree hash + TemporaryIssuer newIssuer = issuer.copy(); + relatedHash = hashNDegreeQuads(relatedNode, blankNodeToQuads, canonicalIssuer, newIssuer); } + + relatedHashes.add(relatedHash); } - return hash(dataToHash.toString()); - } + // Sort the related hashes + Collections.sort(relatedHashes); + // Build the final hash 
input + StringBuilder hashInput = new StringBuilder(); + hashInput.append(hashFirstDegreeQuads(identifier, blankNodeToQuads)); + for (String relatedHash : relatedHashes) { + hashInput.append(relatedHash); + } + + return hash(hashInput.toString()); + } /** - * Converts a single quad to a canonical N-Quad string representation for hashing, - * replacing the specified blank node with a placeholder. + * Converts a statement to canonical N-Quad format for hashing, replacing + * a specific blank node with a placeholder string. * - * @param quad The statement to convert. - * @param blankNode The blank node to replace. - * @param replacement The placeholder string to use for the blank node. - * @return A canonical N-Quad string. + * @param quad The statement to convert. + * @param blankNodeToReplace The blank node identifier to replace. + * @param replacement The placeholder string to use for replacement. + * @return A canonical N-Quad string with placeholder substitution. */ - private String quadToNQuad(Statement quad, String blankNode, String replacement) { + private String quadToNQuad(Statement quad, String blankNodeToReplace, String replacement) { StringBuilder sb = new StringBuilder(); - if (StatementUtils.isBlankNode(quad.getSubject()) && StatementUtils.getBlankNodeId(quad.getSubject()).equals(blankNode)) { - sb.append(replacement); + // Handle subject + if (StatementUtils.isBlankNode(quad.getSubject())) { + String bnodeId = StatementUtils.getBlankNodeId(quad.getSubject()); + sb.append(bnodeId.equals(blankNodeToReplace) ? 
replacement : SerializationConstants.CANONICAL_BNODE_PREFIX); } else { sb.append(StatementUtils.serializeForComparison(quad.getSubject())); } sb.append(SerializationConstants.SPACE); - sb.append(StatementUtils.serializeForComparison(quad.getPredicate())); - sb.append(SerializationConstants.SPACE); + // Predicate + sb.append(StatementUtils.serializeForComparison(quad.getPredicate())).append(SerializationConstants.SPACE); - if (StatementUtils.isBlankNode(quad.getObject()) && StatementUtils.getBlankNodeId(quad.getObject()).equals(blankNode)) { - sb.append(replacement); + // Handle object + if (StatementUtils.isBlankNode(quad.getObject())) { + String bnodeId = StatementUtils.getBlankNodeId(quad.getObject()); + sb.append(bnodeId.equals(blankNodeToReplace) ? replacement : SerializationConstants.CANONICAL_BNODE_PREFIX); } else { sb.append(StatementUtils.serializeForComparison(quad.getObject())); } + // Handle context if (quad.getContext() != null) { sb.append(SerializationConstants.SPACE); - if (StatementUtils.isBlankNode(quad.getContext()) && StatementUtils.getBlankNodeId(quad.getContext()).equals(blankNode)) { - sb.append(replacement); + if (StatementUtils.isBlankNode(quad.getContext())) { + String bnodeId = StatementUtils.getBlankNodeId(quad.getContext()); + sb.append(bnodeId.equals(blankNodeToReplace) ? replacement : SerializationConstants.CANONICAL_BNODE_PREFIX); } else { sb.append(StatementUtils.serializeForComparison(quad.getContext())); } @@ -311,16 +325,17 @@ private String quadToNQuad(Statement quad, String blankNode, String replacement) } /** - * Finds all blank nodes in a given quad that are related to (but not the same as) - * the excluded blank node. + * Identifies all blank nodes in a statement that are related to but different from + * a specified blank node. This is used to explore the graph context during N-degree hashing. * - * @param quad The quad to inspect. + * @param quad The statement to examine. 
* @param excludeBlankNode The blank node to exclude from the results. - * @return A set of blank node identifiers related to the excluded blank node. + * @return A set of blank node identifiers related to the excluded node. */ private Set getRelatedBlankNodes(Statement quad, String excludeBlankNode) { Set relatedBlankNodes = new HashSet<>(); + // Check subject position if (StatementUtils.isBlankNode(quad.getSubject())) { String id = StatementUtils.getBlankNodeId(quad.getSubject()); if (!id.equals(excludeBlankNode)) { @@ -328,6 +343,7 @@ private Set getRelatedBlankNodes(Statement quad, String excludeBlankNode } } + // Check object position if (StatementUtils.isBlankNode(quad.getObject())) { String id = StatementUtils.getBlankNodeId(quad.getObject()); if (!id.equals(excludeBlankNode)) { @@ -335,6 +351,7 @@ private Set getRelatedBlankNodes(Statement quad, String excludeBlankNode } } + // Check context position if (quad.getContext() != null && StatementUtils.isBlankNode(quad.getContext())) { String id = StatementUtils.getBlankNodeId(quad.getContext()); if (!id.equals(excludeBlankNode)) { @@ -346,44 +363,44 @@ private Set getRelatedBlankNodes(Statement quad, String excludeBlankNode } /** - * Replaces the old blank node identifiers in a list of statements with their new - * canonical identifiers and then sorts the resulting statements. + * Replaces blank node identifiers in statements and sorts them lexicographically. + * This is the final step of the canonicalization process. * - * @param statements The list of statements to process. - * @param replacementMap The map from old blank node IDs to new canonical IDs. - * @return A sorted list of statements with canonical blank node IDs. + * @param statements The original statements to process. + * @param canonicalMap The map of blank node replacements. + * @return A sorted list of statements with canonical blank node identifiers. 
*/ - private List replaceBlankNodesAndSort(List statements, Map replacementMap) { - return statements.stream() - .map(stmt -> { - Statement replaced = statementUtils.replaceBlankNodes(stmt, replacementMap); - if (replaced == null) { - throw new IllegalStateException("Failed to replace blank nodes in statement: " + stmt); - } - return replaced; - }) + private List replaceBlankNodesAndSort(List statements, Map canonicalMap) { + + List replaced = statements.stream() + .map(stmt -> statementUtils.replaceBlankNodes(stmt, canonicalMap)) + .toList(); + + return replaced.stream() .sorted(Comparator.comparing(StatementUtils::toNQuad)) .toList(); } /** - * Computes a cryptographic hash of the given data string using the configured - * hash algorithm (SHA-256 or SHA-384). + * Computes a cryptographic hash of the input data using the configured algorithm. * - * @param data The data string to hash. + * @param data The string data to hash. * @return A hexadecimal string representation of the hash. - * @throws SerializationException if the hash algorithm is not available or if hashing fails. + * @throws SerializationException if the hash algorithm is unavailable. */ private String hash(String data) { try { - String algorithm = hashAlgorithm == CanonicalOption.HashAlgorithm.SHA_384 ? SerializationConstants.SHA_384 : SerializationConstants.SHA_256; + String algorithm = hashAlgorithm == CanonicalOption.HashAlgorithm.SHA_384 ? 
+ SerializationConstants.SHA_384 : SerializationConstants.SHA_256; MessageDigest digest = MessageDigest.getInstance(algorithm); byte[] hash = digest.digest(data.getBytes(StandardCharsets.UTF_8)); return bytesToHex(hash); } catch (NoSuchAlgorithmException e) { - throw new SerializationException("Hash algorithm not available: " + e.getMessage(), "Rdfc10Canonicalizer", e); + throw new SerializationException("Hash algorithm not available: " + e.getMessage(), + "Rdfc10Canonicalizer", e); } catch (Exception e) { - throw new SerializationException("Hash computation failed for data: " + data, "Rdfc10Canonicalizer", e); + throw new SerializationException("Hash computation failed for data: " + data, + "Rdfc10Canonicalizer", e); } } @@ -402,41 +419,40 @@ private String bytesToHex(byte[] bytes) { } /** - * Helper class for temporary identifier issuing during canonicalization. - * This is used during the recursive "Hash N-Degree Quads" algorithm to - * assign unique, temporary blank node identifiers within a single path. + * Helper class for managing temporary identifiers during recursive hashing. + * It ensures that each exploration path maintains independent temporary labeling + * to avoid contamination between different permutation explorations. */ private static class TemporaryIssuer { private Map issued = new HashMap<>(); private int counter = 0; /** - * Issues a new temporary identifier for the given blank node identifier. - * If an identifier has already been issued for this blank node, it returns the existing one. + * Issues a temporary identifier for a blank node. + * If the node already has a temporary ID, it returns the existing one. * * @param identifier The blank node identifier to issue an ID for. * @return A temporary canonical identifier. 
*/ public String issue(String identifier) { - if (!issued.containsKey(identifier)) { - issued.put(identifier, SerializationConstants.CANONICAL_BNODE_PREFIX + counter++); - } - return issued.get(identifier); + return issued.computeIfAbsent(identifier, k -> SerializationConstants.CANONICAL_BNODE_PREFIX + counter++); } /** - * Checks if a temporary identifier has already been issued for the given blank node. + * Checks if a temporary identifier has been issued for a blank node. + * * @param identifier The blank node identifier to check. - * @return {@code true} if an identifier has been issued, {@code false} otherwise. + * @return true if a temporary ID exists, false otherwise. */ public boolean hasIssued(String identifier) { return issued.containsKey(identifier); } /** - * Creates a copy of the current TemporaryIssuer instance. This is crucial for - * the recursive hashing algorithm to explore different permutation paths independently. - * @return A new instance with the same state. + * Creates an independent copy of this TemporaryIssuer. + * This is crucial for maintaining path isolation during recursive exploration. + * + * @return A new TemporaryIssuer instance with the same state. 
*/ public TemporaryIssuer copy() { TemporaryIssuer copy = new TemporaryIssuer(); diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/SerializationConstants.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/SerializationConstants.java index c3c6f4cd5..6e4ae0098 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/SerializationConstants.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/SerializationConstants.java @@ -107,7 +107,7 @@ private SerializationConstants() { public static final String DEFAULT_GRAPH_IRI = "http://ns.inria.fr/corese/default-graph"; - public static final String C14N = "_c14n"; + public static final String C14N = "c14n"; public static final String CANONICAL_BNODE_PLACEHOLDER = "<>"; public static final String HEX_FORMAT = "%02x"; diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java index e2f0eee44..6d385f238 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java @@ -7,25 +7,38 @@ /** * Utility class for handling Statement manipulation during RDFC-1.0 canonicalization. * This class provides methods to create new statements with replaced blank node identifiers - * and to serialize them for comparison and hashing. + * and to serialize them for comparison and hashing according to the RDFC-1.0 specification. + *

+ * Key functionalities: + * - Replacement of blank node identifiers with canonical IDs + * - Serialization of RDF values for lexicographic comparison + * - Conversion of statements to N-Quads format + * - Blank node identification and manipulation */ public class StatementUtils { private final ValueFactory valueFactory; + /** + * Constructs a new StatementUtils instance. + * + * @param valueFactory The factory for creating RDF values, used for creating new statements + * with replaced blank node identifiers. + */ public StatementUtils(ValueFactory valueFactory) { this.valueFactory = valueFactory; } /** * Creates a new statement with blank nodes replaced according to the canonical mapping. + * This method is used during the canonicalization process to replace original blank node + * identifiers with their canonical counterparts. * - * @param originalStatement The original statement - * @param canonicalMapping Map from original blank node IDs to canonical IDs - * @return A new statement with replaced blank node identifiers + * @param originalStatement The original statement containing blank nodes to be replaced. + * @param canonicalMapping A map from original blank node IDs to canonical IDs. + * @return A new statement with blank node identifiers replaced according to the mapping. 
*/ public Statement replaceBlankNodes(Statement originalStatement, Map canonicalMapping) { - Resource newSubject = replaceIfBlankNodeResource(originalStatement.getSubject(), canonicalMapping); IRI newPredicate = originalStatement.getPredicate(); Value newObject = replaceIfBlankNodeValue(originalStatement.getObject(), canonicalMapping); @@ -34,84 +47,182 @@ public Statement replaceBlankNodes(Statement originalStatement, Map mapping) { - if (original != null && isBlankNode(original)) { - String canonicalId = mapping.getOrDefault(getBlankNodeId(original), getBlankNodeId(original)); - return valueFactory.createBNode(canonicalId); + String originalId = getBlankNodeId(original); + String canonicalId = mapping.get(originalId); + if (canonicalId != null) { + return valueFactory.createBNode(canonicalId); + } } return original; } + /** + * Replaces a blank node Value with its canonical identifier if it exists in the mapping. + * If the value is not a blank node or has no mapping, returns the original value unchanged. + * + * @param original The original Value to potentially replace. + * @param mapping The canonical mapping from original to canonical blank node IDs. + * @return The replaced Value or the original if no replacement is needed. + */ private Value replaceIfBlankNodeValue(Value original, Map mapping) { - if (original != null && isBlankNode(original)) { - String canonicalId = mapping.getOrDefault(getBlankNodeId(original), getBlankNodeId(original)); - return valueFactory.createBNode(canonicalId); + String originalId = getBlankNodeId(original); + String canonicalId = mapping.get(originalId); + if (canonicalId != null) { + return valueFactory.createBNode(canonicalId); + } } return original; } /** - * Checks if a value is a blank node. + * Checks if a given Value is a blank node. + * Blank nodes are anonymous resources that don't have a URI identifier. * - * @param value The value to check. - * @return True if the value is a blank node, false otherwise. 
+ * @param value The Value to check. + * @return true if the value is a blank node, false otherwise. */ public static boolean isBlankNode(Value value) { return value != null && value.isBNode(); } /** - * Gets the identifier string for a blank node. + * Extracts the identifier string from a blank node Value. + * For blank nodes, this returns the local identifier without the "_:" prefix. * - * @param value The blank node value. - * @return The string identifier. + * @param value The blank node Value from which to extract the identifier. + * @return The blank node identifier string, or null if the value is not a blank node. */ public static String getBlankNodeId(Value value) { - return value.stringValue(); + if (value == null) return null; + if (isBlankNode(value)) { + String str = value.stringValue(); + if (str.startsWith(SerializationConstants.BNODE_PREFIX)) { + return str.substring(2); + } + return str; + } + return null; } /** - * Converts a value to a string for lexicographic comparison, as defined by RDFC-1.0. + * Serializes a Value for lexicographic comparison according to RDFC-1.0 specifications. + * This method produces a string representation suitable for deterministic sorting and hashing. * - * @param value The value to convert. - * @return The N-Quads representation for comparison. + * @param value The Value to serialize. + * @return A string representation of the value for comparison purposes. 
*/ public static String serializeForComparison(Value value) { - if (value == null) return SerializationConstants.EMPTY_STRING; - String valueStr = value.stringValue(); + if (value == null) { + return SerializationConstants.EMPTY_STRING; + } + + if (value instanceof IRI) { + IRI iri = (IRI) value; + String uri = iri.stringValue(); - if (value.isBNode()) { - return valueStr; + return SerializationConstants.LT + uri + SerializationConstants.GT; } - if (value.isIRI()) { - return SerializationConstants.LT + valueStr + SerializationConstants.GT; + + if (value instanceof BNode) { + return serializeBNode((BNode) value); + } + + if (value instanceof Literal) { + return serializeLiteral((Literal) value); } - return SerializationConstants.QUOTE + valueStr + SerializationConstants.QUOTE; + return value.toString(); } + /** - * Converts a statement to N-Quads format for lexicographic comparison. - * This uses a simplified serialization for comparison purposes only. + * Serializes a blank node for comparison. + * Blank nodes are serialized with the "_:" prefix followed by their identifier. * - * @param statement The statement to convert - * @return The N-Quads representation + * @param bnode The blank node to serialize. + * @return The serialized blank node string. + */ + private static String serializeBNode(BNode bnode) { + return SerializationConstants.BNODE_PREFIX + bnode.getID(); + } + + /** + * Serializes a literal for comparison according to RDFC-1.0 specifications. + * Handles string escaping, datatypes, and language tags appropriately. + * + * @param literal The literal to serialize. + * @return The serialized literal string. 
+ */ + private static String serializeLiteral(Literal literal) { + StringBuilder sb = new StringBuilder(); + + // Escape special characters in the literal label + String escapedLabel = literal.getLabel() + .replace(SerializationConstants.BACK_SLASH, "\\\\") + .replace(SerializationConstants.QUOTE, "\\\""); + + sb.append('"').append(escapedLabel).append('"'); + + // Handle datatype or language tag + if (literal.getDatatype() != null) { + String datatypeUri = literal.getDatatype().stringValue(); + // Omit xsd:string datatype for brevity (implied by default) + if (!"http://www.w3.org/2001/XMLSchema#string".equals(datatypeUri)) { + sb.append(SerializationConstants.DATATYPE_SEPARATOR).append(serializeForComparison(literal.getDatatype())); + } + } else if (literal.getLanguage() != null) { + sb.append(SerializationConstants.AT_SIGN).append(literal.getLanguage()); + } + + return sb.toString(); + } + + /** + * Converts a Statement to N-Quads format for lexicographic comparison. + * This produces a canonical string representation suitable for sorting and hashing + * according to the RDFC-1.0 specification. + * + * @param statement The statement to convert. + * @return The N-Quads representation of the statement. 
*/ public static String toNQuad(Statement statement) { + if (statement == null) { + return SerializationConstants.EMPTY_STRING; + } + StringBuilder sb = new StringBuilder(); - sb.append(serializeForComparison(statement.getSubject())).append(SerializationConstants.SPACE); - sb.append(serializeForComparison(statement.getPredicate())).append(SerializationConstants.SPACE); + // Serialize subject, predicate, and object + sb.append(serializeForComparison(statement.getSubject())) + .append(SerializationConstants.SPACE); + sb.append(serializeForComparison(statement.getPredicate())) + .append(SerializationConstants.SPACE); sb.append(serializeForComparison(statement.getObject())); + // Serialize context (graph) if present if (statement.getContext() != null) { - sb.append(SerializationConstants.SPACE).append(serializeForComparison(statement.getContext())); + sb.append(SerializationConstants.SPACE) + .append(serializeForComparison(statement.getContext())); } - sb.append(SerializationConstants.SPACE).append(SerializationConstants.POINT); + // Terminate with space and period + sb.append(SerializationConstants.SPACE) + .append(SerializationConstants.POINT); + return sb.toString(); } -} + + +} \ No newline at end of file diff --git a/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializerTest.java b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializerTest.java index 423170e3e..0e843f84d 100644 --- a/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializerTest.java +++ b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializerTest.java @@ -1,7 +1,14 @@ package fr.inria.corese.core.next.impl.io.serialization.canonical; import fr.inria.corese.core.next.api.*; +import fr.inria.corese.core.next.api.base.io.RDFFormat; +import fr.inria.corese.core.next.api.io.parser.RDFParser; +import fr.inria.corese.core.next.api.io.serialization.RDFSerializer; 
import fr.inria.corese.core.next.impl.exception.SerializationException; +import fr.inria.corese.core.next.impl.io.parser.ParserFactory; +import fr.inria.corese.core.next.impl.io.serialization.DefaultSerializerFactory; +import fr.inria.corese.core.next.impl.temp.CoreseAdaptedValueFactory; +import fr.inria.corese.core.next.impl.temp.CoreseModel; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Test; @@ -9,12 +16,10 @@ import org.mockito.MockitoAnnotations; import java.io.IOException; +import java.io.InputStream; import java.io.StringWriter; import java.io.Writer; -import java.util.Arrays; import java.util.Collections; -import java.util.List; -import java.util.Random; import static org.junit.jupiter.api.Assertions.*; import static org.mockito.Mockito.*; @@ -29,8 +34,6 @@ class CanonicalSerializerTest { @Mock private Model mockModel; @Mock - private ValueFactory mockValueFactory; - @Mock private Rdfc10Canonicalizer mockCanonicalizer; @Mock private BNode mockBNodeE0; @@ -88,7 +91,7 @@ void setUp() { setupBasicMocks(); - serializer = new CanonicalSerializer(mockModel, defaultConfig, mockValueFactory, mockCanonicalizer) { + serializer = new CanonicalSerializer(mockModel, defaultConfig, mockCanonicalizer) { @Override protected void writeValue(Writer w, Value v) throws IOException { if (v != null) { @@ -187,7 +190,7 @@ void testConstructorWithValidParameters() { @DisplayName("Constructor with null model should throw NullPointerException") void testConstructorNullModel() { assertThrows(NullPointerException.class, () -> - new CanonicalSerializer(null, defaultConfig, mockValueFactory, mockCanonicalizer)); + new CanonicalSerializer(null, defaultConfig, mockCanonicalizer)); } @@ -195,20 +198,20 @@ void testConstructorNullModel() { @DisplayName("Constructor with null config should throw NullPointerException") void testConstructorNullConfig() { assertThrows(NullPointerException.class, () -> - new 
CanonicalSerializer(mockModel, null, mockValueFactory, mockCanonicalizer)); + new CanonicalSerializer(mockModel, null, mockCanonicalizer)); } @Test @DisplayName("Constructor with null canonicalizer should throw NullPointerException") void testConstructorNullCanonicalizer() { assertThrows(NullPointerException.class, () -> - new CanonicalSerializer(mockModel, defaultConfig, mockValueFactory, null)); + new CanonicalSerializer(mockModel, defaultConfig, null)); } @Test @DisplayName("Constructor with default configuration") void testConstructorWithDefaultConfig() { - CanonicalSerializer defaultSerializer = new CanonicalSerializer(mockModel, defaultConfig, mockValueFactory, mockCanonicalizer); + CanonicalSerializer defaultSerializer = new CanonicalSerializer(mockModel, defaultConfig, mockCanonicalizer); assertNotNull(defaultSerializer); assertEquals("RDFC-1.0", defaultSerializer.getFormatName()); } @@ -242,50 +245,6 @@ void testSerializeSimpleStatement() throws SerializationException { verify(mockCanonicalizer).canonicalize(any(Model.class)); } - @Test - @DisplayName("Serialization with blank nodes - W3C canonicalization and output sorting") - void testSerializeWithBlankNodesAndOutputVerification() throws SerializationException { - - Statement inputStmt1 = createMockStatement(mockIRIP, mockIRIQ, mockBNodeE0, null); - Statement inputStmt2 = createMockStatement(mockIRIP, mockIRIQ, mockBNodeE1, null); - Statement inputStmt3 = createMockStatement(mockBNodeE0, mockIRIP, mockBNodeE2, null); - Statement inputStmt4 = createMockStatement(mockBNodeE1, mockIRIP, mockBNodeE3, null); - Statement inputStmt5 = createMockStatement(mockBNodeE2, mockIRIR, mockBNodeE3, null); - - List originalStatementsFromModel = Arrays.asList(inputStmt1, inputStmt2, inputStmt3, inputStmt4, inputStmt5); - Collections.shuffle(originalStatementsFromModel, new Random(0)); - - Statement canonicalOutputStmt1 = createMockStatement(mockIRIP, mockIRIQ, canonicalBNodeC2, null); - Statement canonicalOutputStmt2 = 
createMockStatement(mockIRIP, mockIRIQ, canonicalBNodeC3, null); - Statement canonicalOutputStmt3 = createMockStatement(canonicalBNodeC0, mockIRIR, canonicalBNodeC1, null); - Statement canonicalOutputStmt4 = createMockStatement(canonicalBNodeC2, mockIRIP, canonicalBNodeC1, null); - Statement canonicalOutputStmt5 = createMockStatement(canonicalBNodeC3, mockIRIP, canonicalBNodeC0, null); - - List expectedCanonicalStatementsSorted = Arrays.asList( - canonicalOutputStmt1, - canonicalOutputStmt2, - canonicalOutputStmt3, - canonicalOutputStmt4, - canonicalOutputStmt5 - ); - - when(mockCanonicalizer.canonicalize(any(Model.class))).thenReturn(expectedCanonicalStatementsSorted); - - StringWriter writer = new StringWriter(); - - serializer.write(writer); - - String expectedOutput = """ - _:c14n2 . - _:c14n3 . - _:c14n0 _:c14n1 . - _:c14n2 _:c14n1 . - _:c14n3 _:c14n0 . - """; - assertEquals(expectedOutput, writer.toString()); - - verify(mockCanonicalizer).canonicalize(any(Model.class)); - } @Test @DisplayName("Serialization with context (named graph)") @@ -328,182 +287,69 @@ void testWriteContextWithNonNullContext() throws IOException { assertEquals(expectedOutput, writer.toString()); } - @Test - @DisplayName("Serialization with blank nodes in context - canonicalization and sorting") - void testSerializeWithBlankNodeInContextAndOutputVerification() throws SerializationException { - - Statement canonicalOutputStmt1 = createMockStatement(mockIRI1, mockIRI2, mockLiteral1, canonicalBNodeC0); + @DisplayName("Test serialization with figure3.ttl") + void testSerializeFigure3() { + String canonicalOutput = serializeToRdfCanonical("/canonical/figure3.ttl"); - when(mockCanonicalizer.canonicalize(any(Model.class))).thenReturn(Collections.singletonList(canonicalOutputStmt1)); + assertNotNull(canonicalOutput, "Canonical output should not be null"); + assertFalse(canonicalOutput.isEmpty(), "Canonical output should not be empty"); + String actual = canonicalOutput.trim().replace("\r\n", 
"\n"); + String expected = " _:c14n2 .\n" + + " _:c14n3 .\n" + + "_:c14n0 _:c14n1 .\n" + + "_:c14n2 _:c14n1 .\n" + + "_:c14n3 _:c14n0 ."; - StringWriter writer = new StringWriter(); - - serializer.write(writer); - - String expectedOutput = " \"literal1\" _:c14n0 .\n"; - assertEquals(expectedOutput, writer.toString()); + assertEquals(expected, actual, "Canonical output should match expected format"); - verify(mockCanonicalizer).canonicalize(any(Model.class)); } - @Test - void testSerializeW3CExampleWithDifferentActualOutput() throws SerializationException { - Statement inputStmt1 = createMockStatement(mockIRIP, mockIRIQ, mockBNodeE0, null); - Statement inputStmt2 = createMockStatement(mockIRIP, mockIRIQ, mockBNodeE1, null); - Statement inputStmt3 = createMockStatement(mockBNodeE0, mockIRIP, mockBNodeE2, null); - Statement inputStmt4 = createMockStatement(mockBNodeE1, mockIRIP, mockBNodeE3, null); - Statement inputStmt5 = createMockStatement(mockBNodeE2, mockIRIR, mockBNodeE3, null); - - List originalStatementsFromModel = Arrays.asList(inputStmt1, inputStmt2, inputStmt3, inputStmt4, inputStmt5); - Collections.shuffle(originalStatementsFromModel, new Random(0)); - - Statement actualOutputStmt1 = createMockStatement(actualBNodeB0, mockIRIR, actualBNodeB2, null); - Statement actualOutputStmt2 = createMockStatement(actualBNodeB1, mockIRIP, actualBNodeB0, null); - Statement actualOutputStmt3 = createMockStatement(actualBNodeB3, mockIRIP, actualBNodeB2, null); - Statement actualOutputStmt4 = createMockStatement(mockIRIP, mockIRIQ, actualBNodeB1, null); - Statement actualOutputStmt5 = createMockStatement(mockIRIP, mockIRIQ, actualBNodeB3, null); - - - List actualCanonicalStatementsSorted = Arrays.asList( - actualOutputStmt1, - actualOutputStmt2, - actualOutputStmt3, - actualOutputStmt4, - actualOutputStmt5 - ); - - when(mockCanonicalizer.canonicalize(any(Model.class))).thenReturn(actualCanonicalStatementsSorted); - - StringWriter writer = new StringWriter(); - - 
serializer.write(writer); - - String expectedOutput = """ - _:b0 _:b2 . - _:b1 _:b0 . - _:b3 _:b2 . - _:b1 . - _:b3 . - """; - assertEquals(expectedOutput, writer.toString()); - - verify(mockCanonicalizer).canonicalize(any(Model.class)); - } @Test - @DisplayName("Serialization without trailing dot") - void testSerializeNoTrailingDot() throws SerializationException { - CanonicalOption noDotConfig = CanonicalOption.builder().trailingDot(false).build(); - CanonicalSerializer noDotSerializer = new CanonicalSerializer(mockModel, noDotConfig, mockValueFactory, mockCanonicalizer) { - @Override - protected void writeValue(Writer w, Value v) throws IOException { - if (v != null) { - w.write(v.stringValue()); - } - } - }; + @DisplayName("Test serialization with figure2.ttl") + void testSerializeFigure2() { + String canonicalOutput = serializeToRdfCanonical("/canonical/figure2.ttl"); - Statement simpleStmt = createMockStatement(mockIRI1, mockIRI2, mockLiteral1, null); + assertNotNull(canonicalOutput, "Canonical output should not be null"); + assertFalse(canonicalOutput.isEmpty(), "Canonical output should not be empty"); - when(mockCanonicalizer.canonicalize(any(Model.class))).thenReturn(Collections.singletonList(simpleStmt)); + String actual = canonicalOutput.trim().replace("\r\n", "\n"); - StringWriter writer = new StringWriter(); + String expected = " _:c14n0 .\n" + + " _:c14n1 .\n" + + "_:c14n0 .\n" + + "_:c14n1 ."; - noDotSerializer.write(writer); - - String expectedOutput = " \"literal1\"\n"; - assertEquals(expectedOutput, writer.toString()); - verify(mockCanonicalizer).canonicalize(any(Model.class)); + assertEquals(expected, actual, "Canonical output should match RDFC-1.0 specification"); } - @Test - @DisplayName("Serialization with different line ending") - void testSerializeDifferentLineEnding() throws SerializationException { - CanonicalOption customLineEndingConfig = CanonicalOption.builder().lineEnding("\r\n").build(); - CanonicalSerializer 
customLineEndingSerializer = new CanonicalSerializer(mockModel, customLineEndingConfig, mockValueFactory, mockCanonicalizer) { - @Override - protected void writeValue(Writer w, Value v) throws IOException { - if (v != null) { - w.write(v.stringValue()); - } - } - }; - - Statement simpleStmt = createMockStatement(mockIRI1, mockIRI2, mockLiteral1, null); - when(mockCanonicalizer.canonicalize(any(Model.class))).thenReturn(Collections.singletonList(simpleStmt)); + private String serializeToRdfCanonical(String resourcePath) { + Model model = new CoreseModel(); + ValueFactory valueFactory = new CoreseAdaptedValueFactory(); - StringWriter writer = new StringWriter(); + ParserFactory parserFactory = new ParserFactory(); + RDFParser parser = parserFactory.createRDFParser(RDFFormat.TURTLE, model, valueFactory); - customLineEndingSerializer.write(writer); - - String expectedOutput = " \"literal1\" .\r\n"; - assertEquals(expectedOutput, writer.toString()); - } - - @Test - @DisplayName("Serialization with a mix of statements (with and without context)") - void testSerializeMixedStatements() throws SerializationException { - Statement stmt1 = createMockStatement(mockIRI1, mockIRIP, mockLiteral1, null); - Statement stmt2 = createMockStatement(mockIRI2, mockIRIQ, mockLiteral2, mockIRI1); - Statement stmt3 = createMockStatement(mockIRI1, mockIRIR, mockLiteral2, null); - - List mixedStatements = Arrays.asList(stmt1, stmt2, stmt3); - when(mockCanonicalizer.canonicalize(any(Model.class))).thenReturn(mixedStatements); - - StringWriter writer = new StringWriter(); - serializer.write(writer); - - String expectedOutput = """ - "literal1" . - "literal2" . - "literal2" . - """; - assertEquals(expectedOutput, writer.toString()); - } - - @Test - @DisplayName("Serialization of specific N3 input with exact expected output order") - void testSerializeSpecificN3InputWithExactOutputOrder() throws SerializationException { - // Given the specific N3 input: - // @prefix : . - // :p :q _:e0 . 
- // :p :q _:e1 . - // _:e0 :p _:e2 . - // _:e1 :p _:e3 . - // _:e2 :r _:e3 . - - // Mock the canonicalized output in the EXACT order you expect: - Statement expectedStmt1 = createMockStatement(mockBNodeE0, mockIRIP, mockBNodeE2, null); - Statement expectedStmt2 = createMockStatement(mockBNodeE1, mockIRIP, mockBNodeE3, null); - Statement expectedStmt3 = createMockStatement(mockIRIP, mockIRIQ, mockBNodeE0, null); - Statement expectedStmt4 = createMockStatement(mockIRIP, mockIRIQ, mockBNodeE1, null); - Statement expectedStmt5 = createMockStatement(mockBNodeE2, mockIRIR, mockBNodeE3, null); - - List expectedCanonicalStatements = Arrays.asList( - expectedStmt1, // _:e0 :p _:e2 . - expectedStmt2, // _:e1 :p _:e3 . - expectedStmt3, // :p :q _:e0 . - expectedStmt4, // :p :q _:e1 . - expectedStmt5 // _:e2 :r _:e3 . + try (InputStream inputStream = getClass().getResourceAsStream(resourcePath)) { + if (inputStream == null) { + fail("Resource not found: " + resourcePath); + } + parser.parse(inputStream); + } catch (IOException e) { + fail("Failed to parse resource: " + resourcePath + " - " + e.getMessage()); + } + + DefaultSerializerFactory serializerFactory = new DefaultSerializerFactory(); + RDFSerializer serializer = serializerFactory.createSerializer( + RDFFormat.RDFC_1_0, + model, + CanonicalOption.defaultConfig() ); - when(mockCanonicalizer.canonicalize(any(Model.class))).thenReturn(expectedCanonicalStatements); - StringWriter writer = new StringWriter(); - serializer.write(writer); - - String expectedOutput = """ - _:e0 _:e2 . - _:e1 _:e3 . - _:e0 . - _:e1 . - _:e2 _:e3 . - """; - assertEquals(expectedOutput, writer.toString()); - - verify(mockCanonicalizer).canonicalize(any(Model.class)); + return writer.toString(); } } diff --git a/src/test/resources/canonical/figure2.ttl b/src/test/resources/canonical/figure2.ttl new file mode 100644 index 000000000..874449ecd --- /dev/null +++ b/src/test/resources/canonical/figure2.ttl @@ -0,0 +1,9 @@ +@prefix ns1: . 
+ +ns1:p ns1:q _:b0 ; + ns1:r _:b1 . + +_:b1 ns1:t ns1:u . + +_:b0 ns1:s ns1:u . + diff --git a/src/test/resources/canonical/figure3.ttl b/src/test/resources/canonical/figure3.ttl new file mode 100644 index 000000000..04e449b2a --- /dev/null +++ b/src/test/resources/canonical/figure3.ttl @@ -0,0 +1,7 @@ +@prefix : . + +:p :q _:e0 . +:p :q _:e1 . +_:e0 :p _:e2 . +_:e1 :p _:e3 . +_:e2 :r _:e3 . From 4bf4317fa50d490550833163bff9d57a4a3e8388 Mon Sep 17 00:00:00 2001 From: "AD\\aabdoun" Date: Wed, 10 Sep 2025 13:42:53 +0200 Subject: [PATCH 6/6] #187 Implement RDF canonicalization as serializer --- .../DefaultSerializerFactory.java | 12 +++++------ .../canonical/Rdfc10Canonicalizer.java | 6 +++--- ...anonicalOption.java => Rdfc10Options.java} | 18 ++++++++--------- ...lSerializer.java => Rdfc10Serializer.java} | 8 +++----- .../DefaultSerializerFactoryTest.java | 6 +++--- ...OptionTest.java => Rdfc10OptionsTest.java} | 10 +++++----- ...zerTest.java => Rdfc10SerializerTest.java} | 20 +++++++++---------- 7 files changed, 39 insertions(+), 41 deletions(-) rename src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/{CanonicalOption.java => Rdfc10Options.java} (87%) rename src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/{CanonicalSerializer.java => Rdfc10Serializer.java} (93%) rename src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/{CanonicalOptionTest.java => Rdfc10OptionsTest.java} (90%) rename src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/{CanonicalSerializerTest.java => Rdfc10SerializerTest.java} (94%) diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactory.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactory.java index 6caebca6f..c7d918c03 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactory.java +++ 
b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactory.java @@ -6,9 +6,9 @@ import fr.inria.corese.core.next.api.io.serialization.RDFSerializer; import fr.inria.corese.core.next.api.io.serialization.SerializationOption; import fr.inria.corese.core.next.api.io.serialization.SerializerFactory; -import fr.inria.corese.core.next.impl.io.serialization.canonical.CanonicalOption; -import fr.inria.corese.core.next.impl.io.serialization.canonical.CanonicalSerializer; import fr.inria.corese.core.next.impl.io.serialization.canonical.Rdfc10Canonicalizer; +import fr.inria.corese.core.next.impl.io.serialization.canonical.Rdfc10Options; +import fr.inria.corese.core.next.impl.io.serialization.canonical.Rdfc10Serializer; import fr.inria.corese.core.next.impl.io.serialization.nquads.NQuadsOption; import fr.inria.corese.core.next.impl.io.serialization.nquads.NQuadsSerializer; import fr.inria.corese.core.next.impl.io.serialization.ntriples.NTriplesOption; @@ -110,23 +110,23 @@ public DefaultSerializerFactory() { }); tempRegistry.put(RDFFormat.RDFC_1_0, (model, genericConfig) -> { - if (genericConfig instanceof CanonicalOption specificConfig) { + if (genericConfig instanceof Rdfc10Options specificConfig) { Rdfc10Canonicalizer canonicalizer = new Rdfc10Canonicalizer( specificConfig.getHashAlgorithm(), specificConfig.getPermutationLimit(), coreseValueFactory ); - return new CanonicalSerializer(model, specificConfig, canonicalizer); + return new Rdfc10Serializer(model, specificConfig, canonicalizer); } else { logger.warn("Provided config for RDFC_1_0 is not CanonicalOption (was {}). Using default CanonicalOption.", genericConfig != null ? 
genericConfig.getClass().getSimpleName() : "null"); - CanonicalOption defaultConfig = CanonicalOption.defaultConfig(); + Rdfc10Options defaultConfig = Rdfc10Options.defaultConfig(); Rdfc10Canonicalizer canonicalizer = new Rdfc10Canonicalizer( defaultConfig.getHashAlgorithm(), defaultConfig.getPermutationLimit(), coreseValueFactory ); - return new CanonicalSerializer(model, defaultConfig, canonicalizer); + return new Rdfc10Serializer(model, defaultConfig, canonicalizer); } }); diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10Canonicalizer.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10Canonicalizer.java index c0956c42f..f534d9b38 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10Canonicalizer.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10Canonicalizer.java @@ -20,7 +20,7 @@ */ public class Rdfc10Canonicalizer { - private final CanonicalOption.HashAlgorithm hashAlgorithm; + private final Rdfc10Options.HashAlgorithm hashAlgorithm; private final int maxCallsHashNDegreeQuads; private final StatementUtils statementUtils; private int callsHashNDegreeQuads = 0; @@ -34,7 +34,7 @@ public class Rdfc10Canonicalizer { * @param valueFactory The factory for creating RDF values, used by StatementUtils for * blank node replacement and serialization. 
*/ - public Rdfc10Canonicalizer(CanonicalOption.HashAlgorithm hashAlgorithm, int maxCalls, ValueFactory valueFactory) { + public Rdfc10Canonicalizer(Rdfc10Options.HashAlgorithm hashAlgorithm, int maxCalls, ValueFactory valueFactory) { this.hashAlgorithm = Objects.requireNonNull(hashAlgorithm, "Hash algorithm cannot be null"); this.maxCallsHashNDegreeQuads = maxCalls; this.statementUtils = new StatementUtils(valueFactory); @@ -390,7 +390,7 @@ private List replaceBlankNodesAndSort(List statements, Map */ private String hash(String data) { try { - String algorithm = hashAlgorithm == CanonicalOption.HashAlgorithm.SHA_384 ? + String algorithm = hashAlgorithm == Rdfc10Options.HashAlgorithm.SHA_384 ? SerializationConstants.SHA_384 : SerializationConstants.SHA_256; MessageDigest digest = MessageDigest.getInstance(algorithm); byte[] hash = digest.digest(data.getBytes(StandardCharsets.UTF_8)); diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOption.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10Options.java similarity index 87% rename from src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOption.java rename to src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10Options.java index 92955ac73..6936fa988 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOption.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10Options.java @@ -11,7 +11,7 @@ * Use the {@link Builder} class to create instances of {@code CanonicalOption}. * A predefined default configuration is available via {@link #defaultConfig()}. */ -public class CanonicalOption extends AbstractSerializerOption { +public class Rdfc10Options extends AbstractSerializerOption { /** * Enumeration for the supported hashing algorithms. 
@@ -32,7 +32,7 @@ public enum HashAlgorithm { * * @param builder The builder instance containing the desired configuration values. */ - protected CanonicalOption(Builder builder) { + protected Rdfc10Options(Builder builder) { super(builder); this.hashAlgorithm = builder.hashAlgorithm; this.depthFactor = builder.depthFactor; @@ -70,7 +70,7 @@ public int getPermutationLimit() { } /** - * Public Builder for {@link CanonicalOption}. + * Public Builder for {@link Rdfc10Options}. * Provides a fluent API for constructing {@code CanonicalOption} instances with default values * specific to the Canonical RDF format. */ @@ -88,13 +88,13 @@ public Builder() { } /** - * Builds a new {@link CanonicalOption} instance with the configured values. + * Builds a new {@link Rdfc10Options} instance with the configured values. * * @return A new instance of {@code CanonicalOption}. */ @Override - public CanonicalOption build() { - return new CanonicalOption(this); + public Rdfc10Options build() { + return new Rdfc10Options(this); } } @@ -103,7 +103,7 @@ public CanonicalOption build() { * * @return A new {@code CanonicalOption} with default settings. */ - public static CanonicalOption defaultConfig() { + public static Rdfc10Options defaultConfig() { return new Builder().build(); } @@ -113,7 +113,7 @@ public static CanonicalOption defaultConfig() { * * @return A new {@code Builder} instance. 
*/ - public static CanonicalOption.Builder builder() { - return new CanonicalOption.Builder(); + public static Rdfc10Options.Builder builder() { + return new Rdfc10Options.Builder(); } } diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializer.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10Serializer.java similarity index 93% rename from src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializer.java rename to src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10Serializer.java index 42012071e..5487f890a 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializer.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10Serializer.java @@ -3,8 +3,6 @@ import fr.inria.corese.core.next.api.Model; import fr.inria.corese.core.next.api.Resource; import fr.inria.corese.core.next.api.Statement; -import fr.inria.corese.core.next.api.ValueFactory; -import fr.inria.corese.core.next.api.io.serialization.RDFSerializer; import fr.inria.corese.core.next.impl.exception.SerializationException; import fr.inria.corese.core.next.impl.io.serialization.base.AbstractLineBasedSerializer; import fr.inria.corese.core.next.impl.io.serialization.util.SerializationConstants; @@ -24,9 +22,9 @@ * This implementation now acts as a wrapper, preparing the model for a dedicated * RDFC-1.0 canonicalization component and then writing the resulting canonical statements. 
*/ -public class CanonicalSerializer extends AbstractLineBasedSerializer { +public class Rdfc10Serializer extends AbstractLineBasedSerializer { - private final CanonicalOption config; + private final Rdfc10Options config; private final Rdfc10Canonicalizer canonicalizer; private final Model model; @@ -38,7 +36,7 @@ public class CanonicalSerializer extends AbstractLineBasedSerializer { * @param config The configuration options for the canonicalization process. * @param canonicalizer The canonicalizer component to use. */ - public CanonicalSerializer(Model model, CanonicalOption config, Rdfc10Canonicalizer canonicalizer) { + public Rdfc10Serializer(Model model, Rdfc10Options config, Rdfc10Canonicalizer canonicalizer) { super(model, config); this.model = Objects.requireNonNull(model); this.config = Objects.requireNonNull(config); diff --git a/src/test/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactoryTest.java b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactoryTest.java index 111a7b998..598900ad6 100644 --- a/src/test/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactoryTest.java +++ b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/DefaultSerializerFactoryTest.java @@ -4,7 +4,7 @@ import fr.inria.corese.core.next.api.base.io.RDFFormat; import fr.inria.corese.core.next.api.io.serialization.RDFSerializer; import fr.inria.corese.core.next.api.io.serialization.SerializationOption; -import fr.inria.corese.core.next.impl.io.serialization.canonical.CanonicalSerializer; +import fr.inria.corese.core.next.impl.io.serialization.canonical.Rdfc10Serializer; import fr.inria.corese.core.next.impl.io.serialization.nquads.NQuadsSerializer; import fr.inria.corese.core.next.impl.io.serialization.ntriples.NTriplesSerializer; import fr.inria.corese.core.next.impl.io.serialization.rdfxml.XmlSerializer; @@ -102,11 +102,11 @@ void createSerializer_shouldReturnXmlSerializer_forRdfXmlFormat() 
{ @Test @DisplayName("createSerializer should return CanonicalSerializer for CANONICAL_RDF format") void createSerializer_shouldReturnCanonicalSerializer_forCanonicalRdfFormat() { - try (MockedConstruction mockedConstruction = mockConstruction(CanonicalSerializer.class)) { + try (MockedConstruction mockedConstruction = mockConstruction(Rdfc10Serializer.class)) { RDFSerializer serializer = factory.createSerializer(RDFFormat.RDFC_1_0, mockModel, mockConfig); assertNotNull(serializer); - assertTrue(serializer instanceof CanonicalSerializer); + assertTrue(serializer instanceof Rdfc10Serializer); assertEquals(1, mockedConstruction.constructed().size(), "CanonicalSerializer constructor should be called once"); } } diff --git a/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOptionTest.java b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10OptionsTest.java similarity index 90% rename from src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOptionTest.java rename to src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10OptionsTest.java index cd128a75a..a59214f30 100644 --- a/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalOptionTest.java +++ b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10OptionsTest.java @@ -6,16 +6,16 @@ import static org.junit.jupiter.api.Assertions.*; /** - * Unit tests for the {@link CanonicalOption} class. + * Unit tests for the {@link Rdfc10Options} class. * This class verifies the default configuration and the builder functionality * for the Canonical RDF serialization options. 
*/ -class CanonicalOptionTest { +class Rdfc10OptionsTest { @Test @DisplayName("defaultConfig should return an instance with expected default values") void defaultConfig_shouldReturnExpectedValues() { - CanonicalOption config = CanonicalOption.defaultConfig(); + Rdfc10Options config = Rdfc10Options.defaultConfig(); assertNotNull(config, "Default config should not be null"); assertTrue(config.isStrictMode(), "Default strictMode should be true for canonicalization"); @@ -28,7 +28,7 @@ void defaultConfig_shouldReturnExpectedValues() { @Test @DisplayName("builder should allow setting custom options") void builder_shouldAllowCustomOptions() { - CanonicalOption customConfig = CanonicalOption.builder() + Rdfc10Options customConfig = Rdfc10Options.builder() .strictMode(false) .validateURIs(false) .escapeUnicode(false) @@ -47,7 +47,7 @@ void builder_shouldAllowCustomOptions() { @Test @DisplayName("builder should use default values for un-set options") void builder_shouldUseDefaultValues_forUnsetOptions() { - CanonicalOption config = CanonicalOption.builder() + Rdfc10Options config = Rdfc10Options.builder() .strictMode(false) .build(); diff --git a/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializerTest.java b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10SerializerTest.java similarity index 94% rename from src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializerTest.java rename to src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10SerializerTest.java index 0e843f84d..b80d08651 100644 --- a/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/CanonicalSerializerTest.java +++ b/src/test/java/fr/inria/corese/core/next/impl/io/serialization/canonical/Rdfc10SerializerTest.java @@ -29,7 +29,7 @@ * These tests verify that the serializer correctly delegates to an RDFC-1.0 canonicalization * component and formats the resulting 
canonical statements. */ -class CanonicalSerializerTest { +class Rdfc10SerializerTest { @Mock private Model mockModel; @@ -81,17 +81,17 @@ class CanonicalSerializerTest { private Literal mockLiteral2; - private CanonicalSerializer serializer; - private CanonicalOption defaultConfig; + private Rdfc10Serializer serializer; + private Rdfc10Options defaultConfig; @BeforeEach void setUp() { MockitoAnnotations.openMocks(this); - defaultConfig = CanonicalOption.defaultConfig(); + defaultConfig = Rdfc10Options.defaultConfig(); setupBasicMocks(); - serializer = new CanonicalSerializer(mockModel, defaultConfig, mockCanonicalizer) { + serializer = new Rdfc10Serializer(mockModel, defaultConfig, mockCanonicalizer) { @Override protected void writeValue(Writer w, Value v) throws IOException { if (v != null) { @@ -190,7 +190,7 @@ void testConstructorWithValidParameters() { @DisplayName("Constructor with null model should throw NullPointerException") void testConstructorNullModel() { assertThrows(NullPointerException.class, () -> - new CanonicalSerializer(null, defaultConfig, mockCanonicalizer)); + new Rdfc10Serializer(null, defaultConfig, mockCanonicalizer)); } @@ -198,20 +198,20 @@ void testConstructorNullModel() { @DisplayName("Constructor with null config should throw NullPointerException") void testConstructorNullConfig() { assertThrows(NullPointerException.class, () -> - new CanonicalSerializer(mockModel, null, mockCanonicalizer)); + new Rdfc10Serializer(mockModel, null, mockCanonicalizer)); } @Test @DisplayName("Constructor with null canonicalizer should throw NullPointerException") void testConstructorNullCanonicalizer() { assertThrows(NullPointerException.class, () -> - new CanonicalSerializer(mockModel, defaultConfig, null)); + new Rdfc10Serializer(mockModel, defaultConfig, null)); } @Test @DisplayName("Constructor with default configuration") void testConstructorWithDefaultConfig() { - CanonicalSerializer defaultSerializer = new CanonicalSerializer(mockModel, 
defaultConfig, mockCanonicalizer); + Rdfc10Serializer defaultSerializer = new Rdfc10Serializer(mockModel, defaultConfig, mockCanonicalizer); assertNotNull(defaultSerializer); assertEquals("RDFC-1.0", defaultSerializer.getFormatName()); } @@ -345,7 +345,7 @@ private String serializeToRdfCanonical(String resourcePath) { RDFSerializer serializer = serializerFactory.createSerializer( RDFFormat.RDFC_1_0, model, - CanonicalOption.defaultConfig() + Rdfc10Options.defaultConfig() ); StringWriter writer = new StringWriter();