5353 (all) = (all) && !CHARSET_PARTIAL_SUPPORT((charset)) && ((doctype) != ENT_HTML_DOC_XML1); \
5454} while (0)
5555
/* Report a decoding failure without touching `*status`: store
 * (pos) + (advance) through `cursor` and return 0 from the enclosing
 * function. A `cursor` pointer must be in scope where the macro is used. */
#define MB_FAILURE_NO_STATUS(pos, advance) do { \
	*cursor = (pos) + (advance); \
	return 0; \
} while (0)

/* Same as MB_FAILURE_NO_STATUS, but sets `*status` to FAILURE first.
 * Both `status` and `cursor` must be in scope where the macro is used. */
#define MB_FAILURE(pos, advance) do { \
	*status = FAILURE; \
	MB_FAILURE_NO_STATUS(pos, advance); \
} while (0)

/* Non-zero when `str_len - pos` is at least `chars_need`.
 * Reads `str_len` from the enclosing scope. */
#define CHECK_LEN(pos, chars_need) ((str_len - (pos)) >= (chars_need))
6367
6468/* valid as single byte character or leading byte */
@@ -85,6 +89,87 @@ static char *get_default_charset(void) {
8589}
8690/* }}} */
8791
92+ /* Decodes the next UTF-8 multibyte codepoint (i.e. >= 2 bytes).
93+ * Uses `c` as the leading byte. */
94+ PHPAPI unsigned int php_next_utf8_char_mb (
95+ const unsigned char * str ,
96+ unsigned char c ,
97+ size_t str_len ,
98+ size_t * cursor )
99+ {
100+ size_t pos = * cursor ;
101+ unsigned int this_char = 0 ;
102+
103+ /* We'll follow strategy 2. from section 3.6.1 of UTR #36:
104+ * "In a reported illegal byte sequence, do not include any
105+ * non-initial byte that encodes a valid character or is a leading
106+ * byte for a valid sequence." */
107+
108+ ZEND_ASSERT (c >= 0x80 );
109+
110+ if (UNEXPECTED (c < 0xc2 )) {
111+ MB_FAILURE_NO_STATUS (pos , 1 );
112+ } else if (c < 0xe0 ) {
113+ if (UNEXPECTED (!CHECK_LEN (pos , 2 )))
114+ MB_FAILURE_NO_STATUS (pos , 1 );
115+
116+ if (UNEXPECTED (!utf8_trail (str [pos + 1 ]))) {
117+ MB_FAILURE_NO_STATUS (pos , utf8_lead (str [pos + 1 ]) ? 1 : 2 );
118+ }
119+ this_char = ((c & 0x1f ) << 6 ) | (str [pos + 1 ] & 0x3f );
120+ if (UNEXPECTED (this_char < 0x80 )) { /* non-shortest form */
121+ MB_FAILURE_NO_STATUS (pos , 2 );
122+ }
123+ pos += 2 ;
124+ } else if (c < 0xf0 ) {
125+ size_t avail = str_len - pos ;
126+
127+ if (UNEXPECTED (avail < 3 ||
128+ !utf8_trail (str [pos + 1 ]) || !utf8_trail (str [pos + 2 ]))) {
129+ if (avail < 2 || utf8_lead (str [pos + 1 ]))
130+ MB_FAILURE_NO_STATUS (pos , 1 );
131+ else if (avail < 3 || utf8_lead (str [pos + 2 ]))
132+ MB_FAILURE_NO_STATUS (pos , 2 );
133+ else
134+ MB_FAILURE_NO_STATUS (pos , 3 );
135+ }
136+
137+ this_char = ((c & 0x0f ) << 12 ) | ((str [pos + 1 ] & 0x3f ) << 6 ) | (str [pos + 2 ] & 0x3f );
138+ if (UNEXPECTED (this_char < 0x800 )) { /* non-shortest form */
139+ MB_FAILURE_NO_STATUS (pos , 3 );
140+ } else if (UNEXPECTED (this_char >= 0xd800 && this_char <= 0xdfff )) { /* surrogate */
141+ MB_FAILURE_NO_STATUS (pos , 3 );
142+ }
143+ pos += 3 ;
144+ } else if (c < 0xf5 ) {
145+ size_t avail = str_len - pos ;
146+
147+ if (UNEXPECTED (avail < 4 ||
148+ !utf8_trail (str [pos + 1 ]) || !utf8_trail (str [pos + 2 ]) ||
149+ !utf8_trail (str [pos + 3 ]))) {
150+ if (avail < 2 || utf8_lead (str [pos + 1 ]))
151+ MB_FAILURE_NO_STATUS (pos , 1 );
152+ else if (avail < 3 || utf8_lead (str [pos + 2 ]))
153+ MB_FAILURE_NO_STATUS (pos , 2 );
154+ else if (avail < 4 || utf8_lead (str [pos + 3 ]))
155+ MB_FAILURE_NO_STATUS (pos , 3 );
156+ else
157+ MB_FAILURE_NO_STATUS (pos , 4 );
158+ }
159+
160+ this_char = ((c & 0x07 ) << 18 ) | ((str [pos + 1 ] & 0x3f ) << 12 ) | ((str [pos + 2 ] & 0x3f ) << 6 ) | (str [pos + 3 ] & 0x3f );
161+ if (UNEXPECTED (this_char < 0x10000 || this_char > 0x10FFFF )) { /* non-shortest form or outside range */
162+ MB_FAILURE_NO_STATUS (pos , 4 );
163+ }
164+ pos += 4 ;
165+ } else {
166+ MB_FAILURE_NO_STATUS (pos , 1 );
167+ }
168+
169+ * cursor = pos ;
170+ return this_char ;
171+ }
172+
88173/* {{{ get_next_char */
89174static inline unsigned int get_next_char (
90175 enum entity_charset charset ,
@@ -105,72 +190,17 @@ static inline unsigned int get_next_char(
105190 switch (charset ) {
106191 case cs_utf_8 :
107192 {
108- /* We'll follow strategy 2. from section 3.6.1 of UTR #36:
109- * "In a reported illegal byte sequence, do not include any
110- * non-initial byte that encodes a valid character or is a leading
111- * byte for a valid sequence." */
112193 unsigned char c ;
113194 c = str [pos ];
114195 if (c < 0x80 ) {
115196 this_char = c ;
116197 pos ++ ;
117- } else if (c < 0xc2 ) {
118- MB_FAILURE (pos , 1 );
119- } else if (c < 0xe0 ) {
120- if (!CHECK_LEN (pos , 2 ))
121- MB_FAILURE (pos , 1 );
122-
123- if (!utf8_trail (str [pos + 1 ])) {
124- MB_FAILURE (pos , utf8_lead (str [pos + 1 ]) ? 1 : 2 );
125- }
126- this_char = ((c & 0x1f ) << 6 ) | (str [pos + 1 ] & 0x3f );
127- if (this_char < 0x80 ) { /* non-shortest form */
128- MB_FAILURE (pos , 2 );
129- }
130- pos += 2 ;
131- } else if (c < 0xf0 ) {
132- size_t avail = str_len - pos ;
133-
134- if (avail < 3 ||
135- !utf8_trail (str [pos + 1 ]) || !utf8_trail (str [pos + 2 ])) {
136- if (avail < 2 || utf8_lead (str [pos + 1 ]))
137- MB_FAILURE (pos , 1 );
138- else if (avail < 3 || utf8_lead (str [pos + 2 ]))
139- MB_FAILURE (pos , 2 );
140- else
141- MB_FAILURE (pos , 3 );
142- }
143-
144- this_char = ((c & 0x0f ) << 12 ) | ((str [pos + 1 ] & 0x3f ) << 6 ) | (str [pos + 2 ] & 0x3f );
145- if (this_char < 0x800 ) { /* non-shortest form */
146- MB_FAILURE (pos , 3 );
147- } else if (this_char >= 0xd800 && this_char <= 0xdfff ) { /* surrogate */
148- MB_FAILURE (pos , 3 );
149- }
150- pos += 3 ;
151- } else if (c < 0xf5 ) {
152- size_t avail = str_len - pos ;
153-
154- if (avail < 4 ||
155- !utf8_trail (str [pos + 1 ]) || !utf8_trail (str [pos + 2 ]) ||
156- !utf8_trail (str [pos + 3 ])) {
157- if (avail < 2 || utf8_lead (str [pos + 1 ]))
158- MB_FAILURE (pos , 1 );
159- else if (avail < 3 || utf8_lead (str [pos + 2 ]))
160- MB_FAILURE (pos , 2 );
161- else if (avail < 4 || utf8_lead (str [pos + 3 ]))
162- MB_FAILURE (pos , 3 );
163- else
164- MB_FAILURE (pos , 4 );
165- }
166-
167- this_char = ((c & 0x07 ) << 18 ) | ((str [pos + 1 ] & 0x3f ) << 12 ) | ((str [pos + 2 ] & 0x3f ) << 6 ) | (str [pos + 3 ] & 0x3f );
168- if (this_char < 0x10000 || this_char > 0x10FFFF ) { /* non-shortest form or outside range */
169- MB_FAILURE (pos , 4 );
170- }
171- pos += 4 ;
172198 } else {
173- MB_FAILURE (pos , 1 );
199+ this_char = php_next_utf8_char_mb (str , c , str_len , cursor );
200+ if (UNEXPECTED (this_char == 0 )) {
201+ * status = FAILURE ;
202+ }
203+ return this_char ;
174204 }
175205 }
176206 break ;
0 commit comments