Skip to content

Commit f3d8efe

Browse files
abdessamad-abdoun MaillPierre
authored and committed
[#212-REVUE] Implement standard tests for Canonical RDF into Corese-W3C
1 parent 7adc61a commit f3d8efe

File tree

4 files changed

+85
-15
lines changed

4 files changed

+85
-15
lines changed

src/main/java/fr/inria/corese/core/next/api/base/model/AbstractIRI.java

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,6 @@ public abstract class AbstractIRI implements IRI, Comparable<IRI> {
2222
* @throws IncorrectFormatException if the IRI format is incorrect
2323
*/
2424
protected AbstractIRI(String fullIRI) {
25-
if (fullIRI == null) {
26-
throw new IllegalArgumentException("fullIRI cannot be null");
27-
}
2825
if (!IRIUtils.isStandardIRI(fullIRI)) {
2926
throw new IncorrectFormatException("IRI '" + fullIRI + "' must be a valid IRI");
3027
}

src/main/java/fr/inria/corese/core/next/impl/common/util/IRIUtils.java

Lines changed: 57 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,14 @@
99

1010
/**
1111
* Utility class for IRI.
12-
*
12+
* <p>
1313
* Intended to facilitate string manipulation related to IRI.
1414
*/
1515
public class IRIUtils {
1616

1717
private static final Pattern IRI_PATTERN = Pattern.compile("^(?<namespace>(?<protocol>[\\w\\-]+):(?<dblSlashes>\\/\\/)?(?<domain>([\\w\\-_:@]+\\.)*[\\w\\-_:]*))((?<path>\\/([\\w\\-\\._\\:]+\\/)*)(?<finalPath>[\\w\\-\\._\\:]+)?(?<query>\\?[\\w\\-_\\:\\?\\=]+)?(\\#)?(?<fragment>([\\w\\-_]+))?)?$");
1818
private static final Pattern STANDARD_IRI_PATTERN = Pattern.compile("^(([^:/?#\\s]+):)(\\/\\/([^/?#\\s]*))?([^?#\\s]*)(\\?([^#\\s]*))?(#(.*))?");
19+
private static final Pattern RELATIVE_IRI_PATTERN = Pattern.compile("^[^\\s\\p{Cc}]+$");
1920
private static final int MAX_IRI_LENGTH = 2048;
2021
private static final long REGEX_TIMEOUT_MS = 100;
2122

@@ -28,6 +29,7 @@ private IRIUtils() {
2829

2930
/**
3031
* Guesses the namespace of an IRI using a regex pattern.
32+
*
3133
* @param iri The IRI string to be processed.
3234
* @return the guessed namespace of the IRI or an empty string if no match is found.
3335
*/
@@ -38,21 +40,22 @@ public static String guessNamespace(String iri) {
3840
try {
3941
Matcher matcher = matchWithTimeout(IRI_PATTERN, iri);
4042
if (matcher == null || !matcher.matches()) {
41-
return "";
43+
return iri.endsWith("#") ? iri : (iri.contains("#") ? iri.substring(0, iri.lastIndexOf("#") + 1) : iri);
44+
4245
} else if (matcher.matches()) {
4346
if (matcher.group("protocol") != null && matcher.group("protocol").equals("_")) {
4447
return "";
4548
}
4649
StringBuilder namespace = new StringBuilder();
4750
namespace.append(matcher.group("protocol")).append(":");
48-
if(matcher.group("dblSlashes") != null) {
51+
if (matcher.group("dblSlashes") != null) {
4952
namespace.append(matcher.group("dblSlashes"));
5053
}
5154
namespace.append(matcher.group("domain"));
52-
if(matcher.group("path") != null) {
55+
if (matcher.group("path") != null) {
5356
namespace.append(matcher.group("path"));
5457
}
55-
if(matcher.group("fragment") != null && matcher.group("finalPath") != null) {
58+
if (matcher.group("fragment") != null && matcher.group("finalPath") != null) {
5659
namespace.append(matcher.group("finalPath")).append("#");
5760
}
5861
return namespace.toString();
@@ -66,6 +69,7 @@ public static String guessNamespace(String iri) {
6669

6770
/**
6871
* Guesses the local name of an IRI using a regex pattern.
72+
*
6973
* @param iri The IRI string to be processed.
7074
* @return the guessed local name of the IRI or an empty string if no match is found.
7175
*/
@@ -76,11 +80,11 @@ public static String guessLocalName(String iri) {
7680
try {
7781
Matcher matcher = matchWithTimeout(IRI_PATTERN, iri);
7882
if (matcher == null || !matcher.matches()) {
79-
return "";
83+
return iri;
8084
} else if (matcher.matches()) {
81-
if(matcher.group("fragment") != null){ // If the IRI has a fragment
85+
if (matcher.group("fragment") != null) { // If the IRI has a fragment
8286
return matcher.group("fragment");
83-
} else if(matcher.group("finalPath") != null ) { // If the IRI has no fragment but does not end with a slash
87+
} else if (matcher.group("finalPath") != null) { // If the IRI has no fragment but does not end with a slash
8488
return matcher.group("finalPath");
8589
} else { // If the URI ends with a slash
8690
return "";
@@ -95,6 +99,8 @@ public static String guessLocalName(String iri) {
9599

96100
/**
97101
* Checks if the given string is a valid IRI using a regex pattern extracted from the W3C standards.
102+
* Removes leading/trailing whitespace and non-breaking spaces before validation.
103+
*
98104
* @param iriString The string to be checked.
99105
* @return true if the string is a valid IRI, false otherwise.
100106
*/
@@ -103,7 +109,49 @@ public static boolean isStandardIRI(String iriString) {
103109
return false;
104110
}
105111

112+
// Remove leading whitespace and U+00A0 (non-breaking space)
113+
int start = 0;
114+
while (start < iriString.length()) {
115+
char c = iriString.charAt(start);
116+
if (Character.isWhitespace(c) || c == '\u00A0' || c == 160) {
117+
start++;
118+
} else {
119+
break;
120+
}
121+
}
122+
123+
// Remove trailing whitespace and U+00A0 (non-breaking space)
124+
int end = iriString.length();
125+
while (end > start) {
126+
char c = iriString.charAt(end - 1);
127+
if (Character.isWhitespace(c) || c == '\u00A0' || c == 160) {
128+
end--;
129+
} else {
130+
break;
131+
}
132+
}
133+
134+
iriString = iriString.substring(start, end);
135+
136+
if (iriString.isEmpty()) {
137+
return false;
138+
}
139+
140+
// Reject IRIs with internal whitespace
141+
for (char c : iriString.toCharArray()) {
142+
if (Character.isWhitespace(c) || c == '\u00A0' || c == 160) {
143+
return false;
144+
}
145+
}
146+
106147
try {
148+
// If no scheme (no :), treat as relative IRI
149+
if (!iriString.contains(":") || iriString.startsWith("#")) {
150+
Matcher matcher = matchWithTimeout(RELATIVE_IRI_PATTERN, iriString);
151+
return matcher != null && matcher.matches();
152+
}
153+
154+
// If scheme present, validate as absolute IRI
107155
Matcher matcher = matchWithTimeout(STANDARD_IRI_PATTERN, iriString);
108156
if (matcher != null && matcher.matches()) {
109157
return isValidURI(iriString);
@@ -114,6 +162,7 @@ public static boolean isStandardIRI(String iriString) {
114162
}
115163
}
116164

165+
117166
/**
118167
* Executes regex matching with timeout protection.
119168
*/

src/test/java/fr/inria/corese/core/next/impl/common/util/IRIUtilsTest.java

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ public class IRIUtilsTest {
2323

2424
// Array of strings that should be recognized as correct IRIs. Some of them taken from the official IRI documentation.
2525
private static final String[] correctARIs = { uriSchema, uriWithFragment, uriWithQuery, uriWithPort, uriWithPortAndQuery, uriWithPortAndQueryAndFragment, uriWithPortAndFragment, uriToHTMLPage, uriToHTMLPageWithQuery, uriToHTMLPageWithQueryAndFragment, uriToHTMLPageWithFragment, "ftp://ftp.is.co.za/rfc/rfc1808.txt", "http://www.ietf.org/rfc/rfc2396.txt", "ldap://[2001:db8::7]/c=GB?objectClass?one", "mailto:John.Doe@example.com", "news:comp.infosystems.www.servers.unix", "tel:+1-816-555-1212", "telnet://192.0.2.16:80/", "urn:oasis:names:specification:docbook:dtd:xml:4.1.2", "http://foo.co.uk/", "http://regexr.com/foo.html?q=bar" };
26-
private static final String[] incorrectIRIs = { "0123456789 +-.,!@#$%^&*();\\\\/|<>\\\"\\'", "12345 -98.7 3.141 .6180 9,000 +42", "555.123.4567\t+1-(800)-555-2468", "foodemo.net", "bar.ba.test.co.uk", "www.demo.com", "g.com", "g-.com", "com.g", "-g.com", "xn--d1ai6ai.xn--p1ai", "xn-fsqu00a.xn-0zwm56d", "xn--stackoverflow.com", "stackoverflow.xn--com", "stackoverflow.co.uk", "google.com.au", "-0-0o.com", "0-0o_.com" };
27-
26+
private static final String[] incorrectIRIs = {"0123456789 +-.,!@#$%^&*()","12345 -98.7 3.141","555.123.4567\t+1-(800)","test\nstring","test\rstring","test\u0000string"," ","\u00A0",""," \t ", // Only whitespace
27+
};
2828
@Test
2929
public void guessNamespaceTest() {
3030
assertEquals("http://schema.org/test/test/", IRIUtils.guessNamespace(uriSchema));
@@ -63,8 +63,24 @@ public void isStandardIRITest() {
6363
assertTrue(IRIUtils.isStandardIRI(iri));
6464
}
6565
for (String iri : incorrectIRIs) {
66-
assertFalse(IRIUtils.isStandardIRI(iri));
66+
assertFalse(IRIUtils.isStandardIRI(iri), "Expected '" + escapeForDisplay(iri) + "' to be an invalid IRI");
6767
}
6868
}
6969

70+
/**
71+
* Helper method to escape strings for display in test failure messages
72+
*/
73+
private static String escapeForDisplay(String str) {
74+
StringBuilder sb = new StringBuilder();
75+
for (char c : str.toCharArray()) {
76+
if (c < 0x20 || (c >= 0x7F && c <= 0x9F)) {
77+
sb.append(String.format("\\u%04X", (int) c));
78+
} else {
79+
sb.append(c);
80+
}
81+
}
82+
return sb.toString();
83+
}
84+
85+
7086
}

src/test/java/fr/inria/corese/core/next/impl/temp/CoreseIRITest.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,15 @@ public void constructorCoreseNodeTest() {
6767

6868
@Test
6969
public void constructorStringException() {
70-
assertThrows(IncorrectFormatException.class, () -> new CoreseIRI("test"));
70+
71+
assertThrows(IncorrectFormatException.class, () -> new CoreseIRI(" "));
72+
73+
assertThrows(IncorrectFormatException.class, () -> new CoreseIRI("\u00A0"));
74+
75+
assertThrows(IncorrectFormatException.class, () -> new CoreseIRI(""));
76+
77+
assertThrows(IncorrectFormatException.class, () -> new CoreseIRI("test string"));
78+
7179
}
7280

7381
}

0 commit comments

Comments
 (0)