@@ -720,11 +720,13 @@ final class Tokenizer
720720
721721 // Regular expressions for tokenizing
722722
723- private readonly string $ regexBoundaries ;
724- private readonly string $ regexReserved ;
725- private readonly string $ regexReservedNewline ;
726- private readonly string $ regexReservedToplevel ;
727- private readonly string $ regexFunction ;
723+ private readonly string $ nextTokenRegexNumber ;
724+ private readonly string $ nextTokenRegexBoundaryCharacter ;
725+ private readonly string $ nextTokenRegexReservedToplevel ;
726+ private readonly string $ nextTokenRegexReservedNewline ;
727+ private readonly string $ nextTokenRegexReserved ;
728+ private readonly string $ nextTokenRegexFunction ;
729+ private readonly string $ nextTokenRegexNonReserved ;
728730
729731 /**
730732 * Punctuation that can be used as a boundary between other tokens
@@ -769,25 +771,30 @@ public function __construct()
769771 return array_keys ($ valuesMap );
770772 };
771773
772- // Set up regular expressions
773- $ this ->regexBoundaries = '( ' . implode (
774- '| ' ,
775- $ this ->quoteRegex ($ this ->boundaries ),
776- ) . ') ' ;
777- $ this ->regexReserved = '( ' . implode (
778- '| ' ,
779- $ this ->quoteRegex ($ sortByLengthFx ($ this ->reserved )),
780- ) . ') ' ;
781- $ this ->regexReservedToplevel = str_replace (' ' , '\s+ ' , '( ' . implode (
782- '| ' ,
783- $ this ->quoteRegex ($ sortByLengthFx ($ this ->reservedToplevel )),
784- ) . ') ' );
785- $ this ->regexReservedNewline = str_replace (' ' , '\s+ ' , '( ' . implode (
786- '| ' ,
787- $ this ->quoteRegex ($ sortByLengthFx ($ this ->reservedNewline )),
788- ) . ') ' );
774+ $ buildRegexFromListFx = static function ($ values ) use ($ sortByLengthFx ) {
775+ return '(?> ' . implode (
776+ '| ' ,
777+ array_map (
778+ static fn ($ v ) => preg_quote ($ v , '/ ' ),
779+ $ sortByLengthFx ($ values ),
780+ ),
781+ ) . ') ' ;
782+ };
789783
790- $ this ->regexFunction = '( ' . implode ('| ' , $ this ->quoteRegex ($ sortByLengthFx ($ this ->functions ))) . ') ' ;
784+ // Set up regular expressions
785+ $ regexBoundaries = $ buildRegexFromListFx ($ this ->boundaries );
786+ $ regexReserved = $ buildRegexFromListFx ($ this ->reserved );
787+ $ regexReservedToplevel = str_replace (' ' , '\s+ ' , $ buildRegexFromListFx ($ this ->reservedToplevel ));
788+ $ regexReservedNewline = str_replace (' ' , '\s+ ' , $ buildRegexFromListFx ($ this ->reservedNewline ));
789+ $ regexFunction = $ buildRegexFromListFx ($ this ->functions );
790+
791+ $ this ->nextTokenRegexNumber = '/\G(?:\d+(?:\.\d+)?|0x[\da-fA-F]+|0b[01]+)(?=$|\s|" \'`| ' . $ regexBoundaries . ')/ ' ;
792+ $ this ->nextTokenRegexBoundaryCharacter = '/\G ' . $ regexBoundaries . '/ ' ;
793+ $ this ->nextTokenRegexReservedToplevel = '/\G ' . $ regexReservedToplevel . '(?=$|\s| ' . $ regexBoundaries . ')/ ' ;
794+ $ this ->nextTokenRegexReservedNewline = '/\G ' . $ regexReservedNewline . '(?=$|\s| ' . $ regexBoundaries . ')/ ' ;
795+ $ this ->nextTokenRegexReserved = '/\G ' . $ regexReserved . '(?=$|\s| ' . $ regexBoundaries . ')/ ' ;
796+ $ this ->nextTokenRegexFunction = '/\G ' . $ regexFunction . '(?=\s*\()/ ' ;
797+ $ this ->nextTokenRegexNonReserved = '/\G.*?(?=$|\s|[" \'`]| ' . $ regexBoundaries . ')/ ' ;
791798 }
792799
793800 /**
@@ -829,7 +836,6 @@ public function tokenize(string $string): Cursor
829836 */
830837 private function createNextToken (string $ string , string $ upper , int $ offset , Token |null $ previous = null ): Token
831838 {
832- $ matches = [];
833839 // Whitespace
834840 if (preg_match ('/\G\s+/ ' , $ string , $ matches , 0 , $ offset )) {
835841 return new Token (Token::TOKEN_TYPE_WHITESPACE , $ matches [0 ]);
@@ -883,9 +889,9 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
883889 $ value = $ firstChar . $ this ->getNextQuotedString ($ string , $ offset + 1 );
884890 } else {
885891 // Non-quoted variable name
886- preg_match ('/\G( ' . $ firstChar . ' [ \w.$]+) / ' , $ string , $ matches , 0 , $ offset );
892+ preg_match ('/\G[@:][ \w.$]+/ ' , $ string , $ matches , 0 , $ offset );
887893 if ($ matches ) {
888- $ value = $ matches [1 ];
894+ $ value = $ matches [0 ];
889895 }
890896 }
891897
@@ -897,19 +903,19 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
897903 // Number (decimal, binary, or hex)
898904 if (
899905 preg_match (
900- ' /\G(\d+(\.\d+)?|0x[\da-fA-F]+|0b[01]+)($|\s|" \' `| ' . $ this ->regexBoundaries . ' )/ ' ,
906+ $ this ->nextTokenRegexNumber ,
901907 $ string ,
902908 $ matches ,
903909 0 ,
904910 $ offset ,
905911 )
906912 ) {
907- return new Token (Token::TOKEN_TYPE_NUMBER , $ matches [1 ]);
913+ return new Token (Token::TOKEN_TYPE_NUMBER , $ matches [0 ]);
908914 }
909915
910916 // Boundary Character (punctuation and symbols)
911- if (preg_match (' /\G( ' . $ this ->regexBoundaries . ' )/ ' , $ string , $ matches , 0 , $ offset )) {
912- return new Token (Token::TOKEN_TYPE_BOUNDARY , $ matches [1 ]);
917+ if (preg_match ($ this ->nextTokenRegexBoundaryCharacter , $ string , $ matches , 0 , $ offset )) {
918+ return new Token (Token::TOKEN_TYPE_BOUNDARY , $ matches [0 ]);
913919 }
914920
915921 // A reserved word cannot be preceded by a '.'
@@ -918,7 +924,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
918924 // Top Level Reserved Word
919925 if (
920926 preg_match (
921- ' /\G( ' . $ this ->regexReservedToplevel . ' )($|\s| ' . $ this -> regexBoundaries . ' )/ ' ,
927+ $ this ->nextTokenRegexReservedToplevel ,
922928 $ upper ,
923929 $ matches ,
924930 0 ,
@@ -927,14 +933,14 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
927933 ) {
928934 return new Token (
929935 Token::TOKEN_TYPE_RESERVED_TOPLEVEL ,
930- substr ($ string , $ offset , strlen ($ matches [1 ])),
936+ substr ($ string , $ offset , strlen ($ matches [0 ])),
931937 );
932938 }
933939
934940 // Newline Reserved Word
935941 if (
936942 preg_match (
937- ' /\G( ' . $ this ->regexReservedNewline . ' )($|\s| ' . $ this -> regexBoundaries . ' )/ ' ,
943+ $ this ->nextTokenRegexReservedNewline ,
938944 $ upper ,
939945 $ matches ,
940946 0 ,
@@ -943,14 +949,14 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
943949 ) {
944950 return new Token (
945951 Token::TOKEN_TYPE_RESERVED_NEWLINE ,
946- substr ($ string , $ offset , strlen ($ matches [1 ])),
952+ substr ($ string , $ offset , strlen ($ matches [0 ])),
947953 );
948954 }
949955
950956 // Other Reserved Word
951957 if (
952958 preg_match (
953- ' /\G( ' . $ this ->regexReserved . ' )($|\s| ' . $ this -> regexBoundaries . ' )/ ' ,
959+ $ this ->nextTokenRegexReserved ,
954960 $ upper ,
955961 $ matches ,
956962 0 ,
@@ -959,40 +965,24 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
959965 ) {
960966 return new Token (
961967 Token::TOKEN_TYPE_RESERVED ,
962- substr ($ string , $ offset , strlen ($ matches [1 ])),
968+ substr ($ string , $ offset , strlen ($ matches [0 ])),
963969 );
964970 }
965971 }
966972
967973 // A function must be succeeded by '('
968- // this makes it so "count(" is considered a function, but "count" alone is not
969- // function
970- if (preg_match ('/\G( ' . $ this ->regexFunction . '[(]|\s|[)])/ ' , $ upper , $ matches , 0 , $ offset )) {
974+ // this makes it so "count(" is considered a function, but "count" alone is not a function
975+ if (preg_match ($ this ->nextTokenRegexFunction , $ upper , $ matches , 0 , $ offset )) {
971976 return new Token (
972977 Token::TOKEN_TYPE_RESERVED ,
973- substr ($ string , $ offset , strlen ($ matches [1 ]) - 1 ),
978+ substr ($ string , $ offset , strlen ($ matches [0 ]) ),
974979 );
975980 }
976981
977982 // Non reserved word
978- preg_match ('/\G(.*?)($|\s|[" \'`]| ' . $ this ->regexBoundaries . ')/ ' , $ string , $ matches , 0 , $ offset );
979-
980- return new Token (Token::TOKEN_TYPE_WORD , $ matches [1 ]);
981- }
983+ preg_match ($ this ->nextTokenRegexNonReserved , $ string , $ matches , 0 , $ offset );
982984
983- /**
984- * Helper function for building regular expressions for reserved words and boundary characters
985- *
986- * @param string[] $strings The strings to be quoted
987- *
988- * @return string[] The quoted strings
989- */
990- private function quoteRegex (array $ strings ): array
991- {
992- return array_map (
993- static fn (string $ string ): string => preg_quote ($ string , '/ ' ),
994- $ strings ,
995- );
985+ return new Token (Token::TOKEN_TYPE_WORD , $ matches [0 ]);
996986 }
997987
998988 private function getNextQuotedString (string $ string , int $ offset ): string
0 commit comments