Skip to content

Commit ee566fc

Browse files
authored
Merge pull request #229 from corese-stack/feature/155_RDFa_parser
Feature/155 rdfa parser
2 parents 8d3a732 + 35fd285 commit ee566fc

20 files changed

Lines changed: 1152 additions & 23 deletions

File tree

build.gradle.kts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,9 @@ dependencies {
139139
implementation("com.typesafe.akka:akka-stream_2.13:2.6.20") // Akka Streams for reactive streams processing
140140
implementation("com.lightbend.akka:akka-stream-alpakka-xml_2.13:3.0.4") // Alpakka XML for XML processing with Akka Streams
141141

142+
// HTML parsing for RDFa
143+
implementation("org.jsoup:jsoup:1.21.2")
144+
142145
// === Utilities ===
143146
implementation("org.apache.commons:commons-text:1.13.1") // Text manipulation utilities (Commons Text)
144147
implementation("org.json:json:20250517") // JSON processing
@@ -150,6 +153,9 @@ dependencies {
150153
testRuntimeOnly("org.junit.platform:junit-platform-launcher:1.13.2") // JUnit platform launcher (runtime)
151154
testImplementation("org.mockito:mockito-core:5.18.0") // Mockito core for mocking in tests
152155
testImplementation("org.mockito:mockito-junit-jupiter:5.18.0") // Mockito integration with JUnit Jupiter
156+
testRuntimeOnly("org.apache.logging.log4j:log4j-core:2.25.0") // Log4j2 core for internal logging
157+
testRuntimeOnly("org.apache.logging.log4j:log4j-slf4j2-impl:2.25.0") // SLF4J binding for Log4j2 (runtime)
158+
153159
}
154160

155161
/////////////////////////

src/main/java/fr/inria/corese/core/next/api/base/io/RDFFormat.java

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ public class RDFFormat extends FileFormat {
2323
true,
2424
false);
2525

26-
2726
public static final RDFFormat NTRIPLES = new RDFFormat(
2827
"N-Triples",
2928
List.of("nt"),
@@ -66,6 +65,13 @@ public class RDFFormat extends FileFormat {
6665
false,
6766
true);
6867

68+
/**
 * RDFa format descriptor, registered under the name "RDFa" with the file
 * extensions {@code html} and {@code xhtml} and the MIME types
 * {@code text/html} and {@code application/xhtml+xml}.
 * NOTE(review): the meaning of the two trailing boolean flags is defined by
 * the RDFFormat constructor, which is not visible here - confirm against it.
 */
public static final RDFFormat RDFa = new RDFFormat(
69+
"RDFa",
70+
List.of("html", "xhtml"),
71+
List.of("text/html", "application/xhtml+xml"),
72+
true,
73+
false);
74+
6975
/**
7076
* Constructs a new RDF format.
7177
*
@@ -152,7 +158,7 @@ public static Optional<RDFFormat> byMimeType(String mimeType) {
152158
* @return An unmodifiable List of all RdfFormat constants.
153159
*/
154160
public static List<RDFFormat> all() {
155-
return List.of(TURTLE, NTRIPLES, NQUADS, JSONLD, RDFXML, TRIG);
161+
return List.of(TURTLE, NTRIPLES, NQUADS, JSONLD, RDFXML, TRIG, RDFC_1_0, RDFa);
156162
}
157163

158164
@Override

src/main/java/fr/inria/corese/core/next/api/base/model/AbstractIRI.java

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,15 @@
44
import fr.inria.corese.core.next.impl.common.util.IRIUtils;
55
import fr.inria.corese.core.next.impl.exception.IncorrectFormatException;
66

7+
import java.io.Serial;
8+
79
/**
810
* Base class for IRI implementations. Includes base functionality for IRI
911
* handling.
1012
*/
1113
public abstract class AbstractIRI implements IRI, Comparable<IRI> {
1214

15+
@Serial
1316
private static final long serialVersionUID = -1005683238501772511L;
1417

1518
private final String namespace;
@@ -44,11 +47,6 @@ protected AbstractIRI(String namespace, String localName) {
4447
this.localName = localName;
4548
}
4649

47-
@Override
48-
public boolean isIRI() {
49-
return true;
50-
}
51-
5250
@Override
5351
public String getNamespace() {
5452
return this.namespace;
@@ -86,4 +84,9 @@ public int hashCode() {
8684
hash = 31 * hash + (this.localName == null ? 0 : this.localName.hashCode());
8785
return hash;
8886
}
87+
88+
@Override
89+
public String toString() {
90+
return this.stringValue();
91+
}
8992
}

src/main/java/fr/inria/corese/core/next/api/base/model/AbstractModel.java

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,8 @@ public boolean containsAll(Collection<?> collection) {
334334
Iterator<?> iterator = collection.iterator();
335335
try {
336336
while (iterator.hasNext()) {
337-
if (!contains(iterator.next())) {
337+
Object currentObject = iterator.next();
338+
if (! (currentObject instanceof Statement) && ! this.contains(currentObject)) {
338339
return false;
339340
}
340341
}
@@ -610,4 +611,18 @@ private void closeIterator(Collection<?> collection, Iterator<?> iterator) {
610611
}
611612
}
612613

614+
@Override
615+
public boolean equals(Object o) {
616+
return o instanceof Model model && this.size() == model.size() && model.containsAll(this);
617+
}
618+
619+
@Override
620+
public int hashCode() {
621+
int hash = 13;
622+
for (Statement stat : this) {
623+
hash = 31 * hash + stat.hashCode();
624+
}
625+
return hash;
626+
}
627+
613628
}

src/main/java/fr/inria/corese/core/next/api/base/model/literal/AbstractLiteral.java

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,4 +165,35 @@ public TemporalAmount temporalAmountValue() {
165165
public XMLGregorianCalendar calendarValue() {
166166
throw new IncorrectOperationException("Cannot convert to XML calendar");
167167
}
168+
169+
/**
170+
* Check if two temporal literals are equal.
171+
* @param obj the object to compare with
172+
* @return true if compareTo returns 0, false otherwise
173+
*/
174+
@Override
175+
public boolean equals(Object obj) {
176+
if(obj == this) {
177+
return true;
178+
}
179+
if(!(obj instanceof Literal)) {
180+
return false;
181+
}
182+
183+
return ((Literal) obj).getLabel().equals(this.getLabel()) && ((Literal) obj).getDatatype().equals(this.datatype);
184+
}
185+
186+
@Override
187+
public int hashCode() {
188+
int hash = 7;
189+
hash = 31 * hash + (this.getLabel() == null ? 0 : this.getLabel().hashCode());
190+
hash = 31 * hash + (this.getDatatype() == null ? 0 : this.getDatatype().hashCode());
191+
hash = 31 * hash + (this.getLanguage().isEmpty() ? 0 : this.getLanguage().get().hashCode());
192+
return hash;
193+
}
194+
195+
@Override
196+
public String toString() {
197+
return this.stringValue();
198+
}
168199
}

src/main/java/fr/inria/corese/core/next/impl/common/util/IRIUtils.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,14 @@
1414
*/
1515
public class IRIUtils {
1616

17-
private static final Pattern IRI_PATTERN = Pattern.compile("^(?<namespace>(?<protocol>[\\w\\-]+):(?<dblSlashes>\\/\\/)?(?<domain>([\\w\\-_:@]+\\.)*[\\w\\-_:]*))((?<path>\\/([\\w\\-\\._\\:]+\\/)*)(?<finalPath>[\\w\\-\\._\\:]+)?(?<query>\\?[\\w\\-_\\:\\?\\=]+)?(\\#)?(?<fragment>([\\w\\-_]+))?)?$");
17+
// Pattern used to split an IRI into named groups:
//   namespace  - protocol, optional "//" and domain taken together
//   protocol   - scheme name before the first ':'
//   dblSlashes - the optional "//" after the scheme
//   domain     - dot-separated host part (may also contain ':' and '@')
//   path       - leading '/' followed by zero or more "segment/" parts
//   finalPath  - last path segment, without a trailing '/'
//   query      - optional part starting with '?'
//   anchor     - the optional literal '#'
//   fragment   - optional word characters, '-' or '_' at the end of the IRI
private static final Pattern IRI_PATTERN = Pattern.compile("^(?<namespace>" +
18+
"(?<protocol>[\\w\\-]+):(?<dblSlashes>\\/\\/)?" +
19+
"(?<domain>([\\w\\-_:@]+\\.)*[\\w\\-_:]*))" +
20+
"((?<path>\\/([\\w\\-\\._\\:]+\\/)*)" +
21+
"(?<finalPath>[\\w\\-\\._\\:]+)?" +
22+
"(?<query>\\?[\\w\\-_\\:\\?\\=]+)?" +
23+
"(?<anchor>(\\#))?" +
24+
"(?<fragment>([\\w\\-_]+))?)?$");
1825
private static final Pattern STANDARD_IRI_PATTERN = Pattern.compile("^(([^:/?#\\s]+):)(\\/\\/([^/?#\\s]*))?([^?#\\s]*)(\\?([^#\\s]*))?(#(.*))?");
1926
private static final int MAX_IRI_LENGTH = 2048;
2027
private static final long REGEX_TIMEOUT_MS = 100;
@@ -52,9 +59,10 @@ public static String guessNamespace(String iri) {
5259
if(matcher.group("path") != null) {
5360
namespace.append(matcher.group("path"));
5461
}
55-
if(matcher.group("fragment") != null && matcher.group("finalPath") != null) {
62+
if((matcher.group("fragment") != null || matcher.group("anchor") != null) && matcher.group("finalPath") != null) {
5663
namespace.append(matcher.group("finalPath")).append("#");
5764
}
65+
5866
return namespace.toString();
5967
} else {
6068
throw new IllegalStateException("No namespace found for the given IRI: " + iri + ".");

src/main/java/fr/inria/corese/core/next/impl/io/parser/ParserFactory.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import fr.inria.corese.core.next.impl.io.parser.jsonld.JSONLDParser;
1010
import fr.inria.corese.core.next.impl.io.parser.nquads.NQuadsParser;
1111
import fr.inria.corese.core.next.impl.io.parser.ntriples.NTriplesParser;
12+
import fr.inria.corese.core.next.impl.io.parser.rdfa.RDFaParser;
1213
import fr.inria.corese.core.next.impl.io.parser.rdfxml.RDFXMLParser;
1314
import fr.inria.corese.core.next.impl.io.parser.turtle.TurtleParser;
1415
import fr.inria.corese.core.next.impl.io.parser.trig.TriGParser;
@@ -52,6 +53,8 @@ public RDFParser createRDFParser(RDFFormat format, Model model, ValueFactory fac
5253
return new TriGParser(model, factory, config);
5354
} else if(format == RDFFormat.RDFC_1_0) {
5455
return new NQuadsParser(model, factory, config);
56+
} else if (format == RDFFormat.RDFa) {
57+
return new RDFaParser(model, factory, config);
5558
}
5659
throw new IllegalArgumentException("Unsupported format: " + format);
5760
}
@@ -77,6 +80,8 @@ public RDFParser createRDFParser(RDFFormat format, Model model, ValueFactory fac
7780
return new RDFXMLParser(model, factory);
7881
} else if (format == RDFFormat.TRIG) {
7982
return new TriGParser(model, factory);
83+
} else if (format == RDFFormat.RDFa) {
84+
return new RDFaParser(model, factory);
8085
}
8186
throw new IllegalArgumentException("Unsupported format: " + format);
8287
}
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
package fr.inria.corese.core.next.impl.io.parser.rdfa;
2+
3+
import fr.inria.corese.core.next.api.IRI;
4+
import fr.inria.corese.core.next.api.Resource;
5+
import fr.inria.corese.core.next.api.Value;
6+
import fr.inria.corese.core.next.impl.io.parser.rdfa.model.RDFaIncompleteStatement;
7+
8+
import java.util.*;
9+
10+
/**
11+
* This class is to be used during the evaluation of an HTML file to generate triples during the DOM traversal.
12+
* @see <a href="https://www.w3.org/TR/rdfa-syntax/#sec_5.2.">RDFa recommandation<a/>
13+
*/
14+
public class RDFaEvaluationContext {
15+
16+
/**
17+
* This will usually be the URL of the document being processed, but it could be some other URL, set by some other mechanism, such as the XHTML base element. The important thing is that it establishes a URL against which relative paths can be resolved.
18+
*/
19+
private IRI baseIri;
20+
21+
/**
22+
* The initial value will be the same as the initial value of [base], but it will usually change during the course of processing.
23+
*/
24+
private Resource parentSubjectResource ;
25+
26+
/**
27+
* In some situations the object of a statement becomes the subject of any nested statements, and this property is used to convey this value. Note that this value may be a bnode, since in some situations a number of nested statements are grouped together on one bnode. This means that the bnode must be set in the containing statement and passed down, and this property is used to convey this value.
28+
*/
29+
private Resource parentObjectResource = null;
30+
31+
/**
32+
* An index of locally defined IRI prefixes
33+
*/
34+
private Map<String, IRI> uriMappings = new HashMap<>();
35+
36+
/**
37+
* Set of statement in the process of building.
38+
*/
39+
private Set<RDFaIncompleteStatement> incompleteStatement = new HashSet<>();
40+
41+
/**
42+
* The language of the document. Note that there is no default language.
43+
*/
44+
private String language = null;
45+
46+
public RDFaEvaluationContext(IRI baseIri) {
47+
this.baseIri = baseIri;
48+
this.parentSubjectResource = baseIri;
49+
}
50+
51+
public RDFaEvaluationContext(IRI baseIri, IRI parentSubjectResource) {
52+
this.baseIri = baseIri;
53+
this.parentSubjectResource = parentSubjectResource;
54+
}
55+
56+
public RDFaEvaluationContext(RDFaEvaluationContext context) {
57+
this.baseIri = context.baseIri;
58+
this.parentSubjectResource = context.parentSubjectResource;
59+
this.parentObjectResource = context.parentObjectResource;
60+
this.uriMappings = new HashMap<>(context.uriMappings);
61+
this.incompleteStatement = new HashSet<>(context.incompleteStatement);
62+
this.language = context.language;
63+
}
64+
65+
public IRI baseIri() {
66+
return baseIri;
67+
}
68+
69+
public RDFaEvaluationContext baseIri(IRI baseIri) {
70+
this.baseIri = baseIri;
71+
return this;
72+
}
73+
74+
public RDFaEvaluationContext incompleteStatements(Set<RDFaIncompleteStatement> incompleteStatement) {
75+
this.incompleteStatement = new HashSet<>(incompleteStatement);
76+
return this;
77+
}
78+
79+
public Iterator<RDFaIncompleteStatement> getIncompleteStatementIterator() {
80+
return this.incompleteStatement.iterator();
81+
}
82+
83+
public RDFaEvaluationContext addStatementWithoutSubject(IRI property, Value object) {
84+
RDFaIncompleteStatement newStatement = new RDFaIncompleteStatement(property);
85+
newStatement.setObject(object);
86+
this.incompleteStatement.add(newStatement);
87+
return this;
88+
}
89+
90+
public RDFaEvaluationContext addStatementWithoutObject(Resource subject, IRI property) {
91+
RDFaIncompleteStatement newStatement = new RDFaIncompleteStatement(property);
92+
newStatement.setSubject(subject);
93+
this.incompleteStatement.add(newStatement);
94+
return this;
95+
}
96+
97+
public void clearIncompleteStatements() {
98+
this.incompleteStatement.clear();
99+
}
100+
101+
public Resource parentSubjectResource() {
102+
return parentSubjectResource;
103+
}
104+
105+
public RDFaEvaluationContext parentSubjectResource(Resource parentSubjectResource) {
106+
this.parentSubjectResource = parentSubjectResource;
107+
return this;
108+
}
109+
110+
public Resource parentObjectResource() {
111+
return parentObjectResource;
112+
}
113+
114+
public RDFaEvaluationContext parentObjectResource(Resource parentObjectResource) {
115+
this.parentObjectResource = parentObjectResource;
116+
return this;
117+
}
118+
119+
public Map<String, IRI> uriMappings() {
120+
return uriMappings;
121+
}
122+
123+
public RDFaEvaluationContext uriMappings(Map<String, IRI> uriMappings) {
124+
this.uriMappings = uriMappings;
125+
return this;
126+
}
127+
128+
public boolean hasUriMapping(String prefix) {
129+
return this.uriMappings.containsKey(prefix);
130+
}
131+
132+
/**
133+
* @param prefix the prefix WITHOUT ":"
134+
* @return the IRI associated to the prefix in this context
135+
*/
136+
public IRI uriMapping(String prefix) {
137+
return this.uriMappings.get(prefix);
138+
}
139+
140+
public void addUriMapping(String prefix, IRI prefixIri) {
141+
this.uriMappings.put(prefix, prefixIri);
142+
}
143+
144+
@Override
145+
public String toString() {
146+
StringBuilder sb = new StringBuilder();
147+
148+
sb.append("BaseURI: ").append(this.baseIri.stringValue()).append(" ");
149+
sb.append("Mappings: [");
150+
this.uriMappings.forEach((key, value) -> sb.append("(").append(key).append(", ").append(value.stringValue()).append(") "));
151+
sb.append("] ");
152+
if(this.parentSubjectResource != null) {
153+
sb.append("Subject:").append(this.parentSubjectResource.stringValue()).append(" ");
154+
} else {
155+
sb.append("Subject:").append((Object) null).append(" ");
156+
}
157+
if(this.parentObjectResource != null) {
158+
sb.append("Object: ").append(this.parentObjectResource.stringValue()).append(" ");
159+
} else {
160+
sb.append("Object: ").append((Object) null).append(" ");
161+
}
162+
if(! this.incompleteStatement.isEmpty()) {
163+
sb.append(this.incompleteStatement.size()).append(" incomplete statements.");
164+
}
165+
166+
return sb.toString();
167+
}
168+
169+
public String getLanguage() {
170+
return language;
171+
}
172+
173+
public void setLanguage(String language) {
174+
this.language = language;
175+
}
176+
}

0 commit comments

Comments
 (0)