99
1010/**
1111 * Utility class for IRI.
12- *
12+ * <p>
1313 * Intended to facilitate string manipulation related to IRI.
1414 */
1515public class IRIUtils {
1616
1717 private static final Pattern IRI_PATTERN = Pattern .compile ("^(?<namespace>(?<protocol>[\\ w\\ -]+):(?<dblSlashes>\\ /\\ /)?(?<domain>([\\ w\\ -_:@]+\\ .)*[\\ w\\ -_:]*))((?<path>\\ /([\\ w\\ -\\ ._\\ :]+\\ /)*)(?<finalPath>[\\ w\\ -\\ ._\\ :]+)?(?<query>\\ ?[\\ w\\ -_\\ :\\ ?\\ =]+)?(\\ #)?(?<fragment>([\\ w\\ -_]+))?)?$" );
1818 private static final Pattern STANDARD_IRI_PATTERN = Pattern .compile ("^(([^:/?#\\ s]+):)(\\ /\\ /([^/?#\\ s]*))?([^?#\\ s]*)(\\ ?([^#\\ s]*))?(#(.*))?" );
19+ private static final Pattern RELATIVE_IRI_PATTERN = Pattern .compile ("^[^\\ s\\ p{Cc}]+$" );
1920 private static final int MAX_IRI_LENGTH = 2048 ;
2021 private static final long REGEX_TIMEOUT_MS = 100 ;
2122
@@ -28,6 +29,7 @@ private IRIUtils() {
2829
2930 /**
3031 * Guesses the namespace of an IRI using a regex pattern.
32+ *
3133 * @param iri The IRI string to be processed.
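3234 * @return the guessed namespace of the IRI; if the pattern does not match, the IRI up to and including its last '#' (or the whole IRI when it contains no '#'); an empty string when the protocol is "_".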
3335 */
@@ -38,21 +40,22 @@ public static String guessNamespace(String iri) {
3840 try {
3941 Matcher matcher = matchWithTimeout (IRI_PATTERN , iri );
4042 if (matcher == null || !matcher .matches ()) {
41- return "" ;
43+ return iri .endsWith ("#" ) ? iri : (iri .contains ("#" ) ? iri .substring (0 , iri .lastIndexOf ("#" ) + 1 ) : iri );
44+
4245 } else if (matcher .matches ()) {
4346 if (matcher .group ("protocol" ) != null && matcher .group ("protocol" ).equals ("_" )) {
4447 return "" ;
4548 }
4649 StringBuilder namespace = new StringBuilder ();
4750 namespace .append (matcher .group ("protocol" )).append (":" );
48- if (matcher .group ("dblSlashes" ) != null ) {
51+ if (matcher .group ("dblSlashes" ) != null ) {
4952 namespace .append (matcher .group ("dblSlashes" ));
5053 }
5154 namespace .append (matcher .group ("domain" ));
52- if (matcher .group ("path" ) != null ) {
55+ if (matcher .group ("path" ) != null ) {
5356 namespace .append (matcher .group ("path" ));
5457 }
55- if (matcher .group ("fragment" ) != null && matcher .group ("finalPath" ) != null ) {
58+ if (matcher .group ("fragment" ) != null && matcher .group ("finalPath" ) != null ) {
5659 namespace .append (matcher .group ("finalPath" )).append ("#" );
5760 }
5861 return namespace .toString ();
@@ -66,6 +69,7 @@ public static String guessNamespace(String iri) {
6669
6770 /**
6871 * Guesses the local name of an IRI using a regex pattern.
72+ *
6973 * @param iri The IRI string to be processed.
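7074 * @return the fragment if the IRI has one, otherwise its final path segment, or an empty string if it has neither (e.g. it ends with a slash); if the pattern does not match, the IRI itself.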
7175 */
@@ -76,11 +80,11 @@ public static String guessLocalName(String iri) {
7680 try {
7781 Matcher matcher = matchWithTimeout (IRI_PATTERN , iri );
7882 if (matcher == null || !matcher .matches ()) {
79- return "" ;
83+ return iri ;
8084 } else if (matcher .matches ()) {
81- if (matcher .group ("fragment" ) != null ){ // If the IRI has a fragment
85+ if (matcher .group ("fragment" ) != null ) { // If the IRI has a fragment
8286 return matcher .group ("fragment" );
83- } else if (matcher .group ("finalPath" ) != null ) { // If the IRI has no fragment but do not ends with a slash
87+ } else if (matcher .group ("finalPath" ) != null ) { // If the IRI has no fragment but do not ends with a slash
8488 return matcher .group ("finalPath" );
8589 } else { // If the URI ends with a slash
8690 return "" ;
@@ -95,6 +99,8 @@ public static String guessLocalName(String iri) {
9599
96100 /**
97101 * Checks if the given string is a valid IRI using a regex pattern extracted from the W3C standards.
102+ * Removes leading/trailing whitespace and non-breaking spaces before validation.
103+ *
98104 * @param iriString The string to be checked.
99105 * @return true if the string is a valid IRI, false otherwise.
100106 */
@@ -103,7 +109,49 @@ public static boolean isStandardIRI(String iriString) {
103109 return false ;
104110 }
105111
112+ // Remove leading whitespace and U+00A0 (non-breaking space)
113+ int start = 0 ;
114+ while (start < iriString .length ()) {
115+ char c = iriString .charAt (start );
116+ if (Character .isWhitespace (c ) || c == '\u00A0' || c == 160 ) {
117+ start ++;
118+ } else {
119+ break ;
120+ }
121+ }
122+
123+ // Remove trailing whitespace and U+00A0 (non-breaking space)
124+ int end = iriString .length ();
125+ while (end > start ) {
126+ char c = iriString .charAt (end - 1 );
127+ if (Character .isWhitespace (c ) || c == '\u00A0' || c == 160 ) {
128+ end --;
129+ } else {
130+ break ;
131+ }
132+ }
133+
134+ iriString = iriString .substring (start , end );
135+
136+ if (iriString .isEmpty ()) {
137+ return false ;
138+ }
139+
140+ // Reject IRIs with internal whitespace
141+ for (char c : iriString .toCharArray ()) {
142+ if (Character .isWhitespace (c ) || c == '\u00A0' || c == 160 ) {
143+ return false ;
144+ }
145+ }
146+
106147 try {
148+ // If no scheme (no :), treat as relative IRI
149+ if (!iriString .contains (":" ) || iriString .startsWith ("#" )) {
150+ Matcher matcher = matchWithTimeout (RELATIVE_IRI_PATTERN , iriString );
151+ return matcher != null && matcher .matches ();
152+ }
153+
154+ // If scheme present, validate as absolute IRI
107155 Matcher matcher = matchWithTimeout (STANDARD_IRI_PATTERN , iriString );
108156 if (matcher != null && matcher .matches ()) {
109157 return isValidURI (iriString );
@@ -114,6 +162,7 @@ public static boolean isStandardIRI(String iriString) {
114162 }
115163 }
116164
165+
117166 /**
118167 * Executes regex matching with timeout protection.
119168 */
0 commit comments