1+ /*
2+ * patch for luajit by CppCXY
3+ */
4+
5+ #include <lua.h>
6+ #include <lauxlib.h>
7+ #include <string.h>
8+
9+ // Returns the number of characters in the UTF-8 string `s`
10+ // that start between byte position `i` and `j` (both included).
11+ // The default for `i` and `j` is to consider all characters in the string.
12+ // For negative indices, it starts counting from the end of the string.
13+ // If `lax` is true, the function returns the number of characters in the string,
14+ // even if some of them are invalid.
15+ // Invalid characters are always counted as one character.
16+ // signature: utf8.len(s [, i [, j [, lax]]])
17+ // signature (s, [i], [j], [lax])
18+ int luajit_utf8_len (lua_State * L )
19+ {
20+ size_t len ;
21+ const char * s = luaL_checklstring (L , 1 , & len );
22+ lua_Integer i = luaL_optinteger (L , 2 , 1 );
23+ lua_Integer j = luaL_optinteger (L , 3 , len );
24+ int lax = lua_toboolean (L , 4 );
25+
26+ // Adjust negative indices
27+ if (i < 0 )
28+ i += len + 1 ;
29+ if (j < 0 )
30+ j += len + 1 ;
31+
32+ // Clamp indices to the string boundaries
33+ if (i < 1 )
34+ i = 1 ;
35+ if (j > (lua_Integer )len )
36+ j = len ;
37+ if (i > j )
38+ {
39+ lua_pushinteger (L , 0 );
40+ return 1 ;
41+ }
42+
43+ size_t start = i - 1 ;
44+ size_t end = j - 1 ;
45+ size_t count = 0 ;
46+
47+ // Traverse the string to count characters
48+ for (size_t p = start ; p <= end ;)
49+ {
50+ if ((s [p ] & 0xC0 ) != 0x80 )
51+ {
52+ count ++ ;
53+ }
54+ if (!lax && (s [p ] & 0xC0 ) == 0x80 )
55+ {
56+ // Invalid UTF-8 sequence
57+ p ++ ;
58+ continue ;
59+ }
60+ p ++ ;
61+ }
62+
63+ lua_pushinteger (L , count );
64+ return 1 ;
65+ }
66+
67+ // signature (s, n, [i])
68+ int luajit_utf8_offset (lua_State * L )
69+ {
70+ // Get the string and the integer n from the Lua stack
71+ size_t len ;
72+ const char * s = luaL_checklstring (L , 1 , & len );
73+ lua_Integer n = luaL_checkinteger (L , 2 );
74+ lua_Integer i = luaL_optinteger (L , 3 , 1 );
75+
76+ // Adjust the starting index to be 0-based
77+ if (i < 1 )
78+ i = 1 ;
79+ size_t p = i - 1 ;
80+
81+ // Traverse the string to find the byte offset of the nth UTF-8 character
82+ lua_Integer count = 0 ;
83+ while (p < len )
84+ {
85+ // Check if the current byte is the start of a UTF-8 character
86+ if ((s [p ] & 0xC0 ) != 0x80 )
87+ {
88+ count ++ ;
89+ if (count == n )
90+ {
91+ lua_pushinteger (L , p + 1 ); // Lua uses 1-based indexing
92+ return 1 ;
93+ }
94+ }
95+ p ++ ;
96+ }
97+
98+ // If we reach here, it means the nth character was not found
99+ lua_pushnil (L );
100+ }
101+
102+ // Receives zero or more integers,
103+ // converts each one to its corresponding UTF-8 byte sequence and returns a string with the concatenation of
104+ // all these sequences.
105+ int luajit_utf8_char (lua_State * L )
106+ {
107+ int n = lua_gettop (L ); // Number of arguments
108+ luaL_Buffer b ;
109+ luaL_buffinit (L , & b );
110+
111+ for (int i = 1 ; i <= n ; i ++ )
112+ {
113+ lua_Integer code = luaL_checkinteger (L , i );
114+ if (code < 0x80 )
115+ {
116+ // 1-byte sequence
117+ luaL_addchar (& b , (char )code );
118+ }
119+ else if (code < 0x800 )
120+ {
121+ // 2-byte sequence
122+ luaL_addchar (& b , (char )(0xC0 | (code >> 6 )));
123+ luaL_addchar (& b , (char )(0x80 | (code & 0x3F )));
124+ }
125+ else if (code < 0x10000 )
126+ {
127+ // 3-byte sequence
128+ luaL_addchar (& b , (char )(0xE0 | (code >> 12 )));
129+ luaL_addchar (& b , (char )(0x80 | ((code >> 6 ) & 0x3F )));
130+ luaL_addchar (& b , (char )(0x80 | (code & 0x3F )));
131+ }
132+ else if (code < 0x110000 )
133+ {
134+ // 4-byte sequence
135+ luaL_addchar (& b , (char )(0xF0 | (code >> 18 )));
136+ luaL_addchar (& b , (char )(0x80 | ((code >> 12 ) & 0x3F )));
137+ luaL_addchar (& b , (char )(0x80 | ((code >> 6 ) & 0x3F )));
138+ luaL_addchar (& b , (char )(0x80 | (code & 0x3F )));
139+ }
140+ else
141+ {
142+ return luaL_error (L , "invalid UTF-8 code point" );
143+ }
144+ }
145+
146+ luaL_pushresult (& b );
147+ return 1 ;
148+ }
149+
// Decodes the single UTF-8 character that starts at `s`.
//
// On success stores the sequence length in bytes (1..4) in `*len` and
// returns the decoded code point.
//
// On malformed input stores 1 in `*len` and returns the first byte
// unchanged, so callers can recognise the failure as "length 1 with the
// high bit set". Input is malformed when the first byte is a continuation
// byte (0x80..0xBF), when it is 0xF8 or above, or when any expected
// continuation byte is missing. Overlong encodings and surrogates are not
// rejected.
//
// Each continuation byte is checked before the next one is read, so the
// function never reads beyond the first byte that is not a continuation
// byte. `s` must therefore be readable up to such a byte, e.g. the NUL
// terminator of a C string.
static int luajit_utf8_decode(const char *s, int *len)
{
    const unsigned char *bytes = (const unsigned char *)s;
    unsigned char lead = bytes[0];
    int extra; // number of continuation bytes that must follow the lead byte
    int code;  // payload bits accumulated so far

    if (lead < 0x80)
    {
        // Plain ASCII.
        *len = 1;
        return lead;
    }
    else if (lead >= 0xC0 && lead < 0xE0)
    {
        extra = 1;
        code = lead & 0x1F;
    }
    else if (lead >= 0xE0 && lead < 0xF0)
    {
        extra = 2;
        code = lead & 0x0F;
    }
    else if (lead >= 0xF0 && lead < 0xF8)
    {
        extra = 3;
        code = lead & 0x07;
    }
    else
    {
        // Stray continuation byte or a lead byte that UTF-8 does not define.
        *len = 1;
        return lead;
    }

    for (int k = 1; k <= extra; k++)
    {
        if ((bytes[k] & 0xC0) != 0x80)
        {
            // Truncated sequence: report only the lead byte as invalid.
            *len = 1;
            return lead;
        }
        code = (code << 6) | (bytes[k] & 0x3F);
    }

    *len = extra + 1;
    return code;
}
175+
176+ // Returns the codepoints (as integers) from all characters
177+ // in `s` that start between byte position `i` and `j` (both included).
178+ // signature (s [i], [j], [lax]) -> multiple integer values
179+ int luajit_utf8_codepoint (lua_State * L )
180+ {
181+ size_t len ;
182+ const char * s = luaL_checklstring (L , 1 , & len );
183+ lua_Integer i = luaL_optinteger (L , 2 , 1 );
184+ lua_Integer j = luaL_optinteger (L , 3 , len );
185+ int lax = lua_toboolean (L , 4 );
186+
187+ // Adjust negative indices
188+ if (i < 0 )
189+ i += len + 1 ;
190+ if (j < 0 )
191+ j += len + 1 ;
192+
193+ // Clamp indices to the string boundaries
194+ if (i < 1 )
195+ i = 1 ;
196+ if (j > (lua_Integer )len )
197+ j = len ;
198+ if (i > j )
199+ {
200+ lua_pushnil (L );
201+ return 1 ;
202+ }
203+
204+ size_t pos = i - 1 ;
205+ int char_len ;
206+ int codepoint = luajit_utf8_decode (s + pos , & char_len );
207+
208+ if (!lax && (char_len == 1 && (s [pos ] & 0x80 ) != 0 ))
209+ {
210+ lua_pushnil (L );
211+ return 1 ;
212+ }
213+
214+ lua_pushinteger (L , codepoint ); // Push the first code point
215+
216+ if (i == j )
217+ {
218+ return 1 ; // Return the single code point
219+ }
220+
221+ int count = 1 ;
222+ pos += char_len ;
223+ while (pos < (size_t )j )
224+ {
225+ codepoint = luajit_utf8_decode (s + pos , & char_len );
226+ if (!lax && (char_len == 1 && (s [pos ] & 0x80 ) != 0 ))
227+ {
228+ lua_pushnil (L );
229+ return 1 ;
230+ }
231+ lua_pushinteger (L , codepoint ); // Push the code point
232+ count ++ ;
233+ pos += char_len ;
234+ }
235+
236+ return count ; // Return the number of code points
237+ }
238+
239+ // Iterator function
240+ static int utf8_codes_iter (lua_State * L , int lax )
241+ {
242+ size_t len ;
243+ const char * s = luaL_checklstring (L , 1 , & len );
244+ int pos = luaL_checkinteger (L , 2 );
245+
246+ if (pos >= (int )len )
247+ {
248+ return 0 ; // End of iteration
249+ }
250+
251+ int char_len ;
252+ int codepoint = luajit_utf8_decode (s + pos , & char_len );
253+
254+ if (!lax && (char_len == 1 && (s [pos ] & 0x80 ) != 0 ))
255+ {
256+ return luaL_error (L , "invalid UTF-8 byte sequence" );
257+ }
258+
259+ lua_pushinteger (L , pos + 1 ); // Next position
260+ lua_pushinteger (L , codepoint ); // Code point
261+ return 2 ;
262+ }
263+
264+ static int iter_codes_strict (lua_State * L )
265+ {
266+ return utf8_codes_iter (L , 0 );
267+ }
268+
269+ static int iter_codes_lax (lua_State * L )
270+ {
271+ return utf8_codes_iter (L , 1 );
272+ }
273+
274+ // Returns values so that the construction
275+ // ```lua
276+ // for p, c in utf8.codes(s) do
277+ // body
278+ // end
279+ // ```
280+ // will iterate over all UTF-8 characters in string s, with p being the position (in bytes) and c the code point of each character. It raises an error if it meets any invalid byte sequence.
281+ // signature (s [, lax]) -> fun(s: string, p: integer):integer, integer
282+ int luajit_utf8_codes (lua_State * L )
283+ {
284+ int lax = lua_toboolean (L , 2 );
285+ const char * s = luaL_checkstring (L , 1 );
286+ lua_pushcfunction (L , lax ? iter_codes_lax : iter_codes_strict );
287+ lua_pushvalue (L , 1 );
288+ lua_pushinteger (L , 0 );
289+ return 3 ;
290+ }
0 commit comments