44
55namespace Doctrine \SqlFormatter ;
66
7- use function array_combine ;
8- use function array_keys ;
97use function array_map ;
10- use function arsort ;
11- use function assert ;
12- use function implode ;
8+ use function count ;
9+ use function is_int ;
1310use function preg_match ;
1411use function preg_quote ;
12+ use function reset ;
1513use function str_replace ;
14+ use function str_starts_with ;
1615use function strlen ;
1716use function strpos ;
1817use function strtoupper ;
1918use function substr ;
19+ use function usort ;
2020
2121/** @internal */
2222final class Tokenizer
@@ -762,31 +762,12 @@ final class Tokenizer
762762 */
763763 public function __construct ()
764764 {
765- // Sort list from longest word to shortest, 3x faster than usort
766- $ sortByLengthFx = static function ($ values ) {
767- $ valuesMap = array_combine ($ values , array_map (strlen (...), $ values ));
768- assert ($ valuesMap !== false );
769- arsort ($ valuesMap );
770-
771- return array_keys ($ valuesMap );
772- };
773-
774- $ buildRegexFromListFx = static function ($ values ) use ($ sortByLengthFx ) {
775- return '(?> ' . implode (
776- '| ' ,
777- array_map (
778- static fn ($ v ) => preg_quote ($ v , '/ ' ),
779- $ sortByLengthFx ($ values ),
780- ),
781- ) . ') ' ;
782- };
783-
784765 // Set up regular expressions
785- $ regexBoundaries = $ buildRegexFromListFx ($ this ->boundaries );
786- $ regexReserved = $ buildRegexFromListFx ($ this ->reserved );
787- $ regexReservedToplevel = str_replace (' ' , '\s+ ' , $ buildRegexFromListFx ($ this ->reservedToplevel ));
788- $ regexReservedNewline = str_replace (' ' , '\s+ ' , $ buildRegexFromListFx ($ this ->reservedNewline ));
789- $ regexFunction = $ buildRegexFromListFx ($ this ->functions );
766+ $ regexBoundaries = $ this -> makeRegexFromList ($ this ->boundaries );
767+ $ regexReserved = $ this -> makeRegexFromList ($ this ->reserved );
768+ $ regexReservedToplevel = str_replace (' ' , '\s+ ' , $ this -> makeRegexFromList ($ this ->reservedToplevel ));
769+ $ regexReservedNewline = str_replace (' ' , '\s+ ' , $ this -> makeRegexFromList ($ this ->reservedNewline ));
770+ $ regexFunction = $ this -> makeRegexFromList ($ this ->functions );
790771
791772 $ this ->nextTokenRegexNumber = '/\G(?:\d+(?:\.\d+)?|0x[\da-fA-F]+|0b[01]+)(?=$|\s|" \'`| ' . $ regexBoundaries . ')/ ' ;
792773 $ this ->nextTokenRegexBoundaryCharacter = '/\G ' . $ regexBoundaries . '/ ' ;
@@ -797,6 +778,75 @@ public function __construct()
797778 $ this ->nextTokenRegexNonReserved = '/\G.*?(?=$|\s|[" \'`]| ' . $ regexBoundaries . ')/ ' ;
798779 }
799780
/**
 * Make regex from a list of values matching longest value first.
 *
 * Values that start with the same character are merged into a single branch:
 * their longest shared prefix is emitted once, followed by a nested atomic
 * group built recursively from the remaining suffixes. For example
 * ['SELECT', 'SET', 'AS'] becomes "(?>AS|SE(?>LECT|T))".
 *
 * Optimized for speed by matching alternative branch only once
 * https://github.com/PCRE2Project/pcre2/issues/411 .
 *
 * Duplicate values are emitted only once. Without that, two equal values
 * would end up as two empty suffixes sharing the empty prefix and the
 * recursion below would never terminate.
 *
 * NOTE(review): the top-level sort uses `<=>`, which compares two numeric
 * strings by numeric value rather than byte-wise. If a list ever contains
 * several numeric strings, confirm the ordering still keeps values with the
 * same first character adjacent.
 *
 * @param list<string> $values values to match literally (quoted for the "/" delimiter)
 * @param bool         $sorted true when $values is already ordered as required
 *                             (used by the recursive calls, whose input inherits
 *                             the order of the parent list)
 *
 * @return string an atomic group "(?>...)"; "(?>)" for an empty list
 */
private function makeRegexFromList(array $values, bool $sorted = false): string
{
    // sort list alphabetically and from longest word to shortest
    if (! $sorted) {
        usort($values, static function (string $a, string $b) {
            // When one value is a prefix of the other, the longer one must be
            // tried first; otherwise order by plain comparison so that values
            // sharing a first character end up next to each other.
            return str_starts_with($a, $b) || str_starts_with($b, $a)
                ? strlen($b) <=> strlen($a)
                : $a <=> $b;
        });
    }

    /** @var array<int|string, list<string>> $valuesBySharedPrefix */
    $valuesBySharedPrefix = [];
    $items                = [];
    $prefix               = null;
    $previous             = null;

    foreach ($values as $v) {
        // Equal values are adjacent in the ordered list; keep only the first
        // one, otherwise the suffix recursion would loop forever.
        if ($v === $previous) {
            continue;
        }

        $previous = $v;

        // A different first character starts a new group.
        if ($prefix !== null && ! str_starts_with($v, substr($prefix, 0, 1))) {
            $valuesBySharedPrefix[$prefix] = $items;
            $items                         = [];
            $prefix                        = null;
        }

        $items[] = $v;

        if ($prefix === null) {
            $prefix = $v;
        } else {
            // Shrink the prefix until it is shared by the new value as well.
            while (! str_starts_with($v, $prefix)) {
                $prefix = substr($prefix, 0, -1);
            }
        }
    }

    // Flush the last group.
    if ($items !== []) {
        $valuesBySharedPrefix[$prefix] = $items;
    }

    $regex = '(?>';

    foreach ($valuesBySharedPrefix as $prefix => $items) {
        if ($regex !== '(?>') {
            $regex .= '|';
        }

        // PHP turns numeric-string array keys into integers; restore the string.
        if (is_int($prefix)) {
            $prefix = (string) $prefix;
        }

        $regex .= preg_quote($prefix, '/');

        // A single value needs no nested group, just the rest of the literal.
        $regex .= count($items) === 1
            ? preg_quote(substr(reset($items), strlen($prefix)), '/')
            : $this->makeRegexFromList(array_map(static fn ($v) => substr($v, strlen($prefix)), $items), true);
    }

    return $regex . ')';
}
849+
800850 /**
801851 * Takes a SQL string and breaks it into tokens.
802852 * Each token is an associative array with type and value.
0 commit comments