Skip to content

Commit 7d3ff7b

Browse files
committed
Build regexes only once
Reduces runtime by 63%
1 parent 24d857e commit 7d3ff7b

File tree

1 file changed

+30
-22
lines changed

1 file changed

+30
-22
lines changed

src/Tokenizer.php

Lines changed: 30 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -720,11 +720,13 @@ final class Tokenizer
720720

721721
// Regular expressions for tokenizing
722722

723-
private readonly string $regexBoundaries;
724-
private readonly string $regexReserved;
725-
private readonly string $regexReservedNewline;
726-
private readonly string $regexReservedToplevel;
727-
private readonly string $regexFunction;
723+
private readonly string $nextTokenRegexNumber;
724+
private readonly string $nextTokenRegexBoundaryCharacter;
725+
private readonly string $nextTokenRegexReservedToplevel;
726+
private readonly string $nextTokenRegexReservedNewline;
727+
private readonly string $nextTokenRegexReserved;
728+
private readonly string $nextTokenRegexFunction;
729+
private readonly string $nextTokenRegexNonReserved;
728730

729731
/**
730732
* Punctuation that can be used as a boundary between other tokens
@@ -770,24 +772,31 @@ public function __construct()
770772
};
771773

772774
// Set up regular expressions
773-
$this->regexBoundaries = '(?>' . implode(
775+
$regexBoundaries = '(?>' . implode(
774776
'|',
775777
$this->quoteRegex($this->boundaries),
776778
) . ')';
777-
$this->regexReserved = '(?>' . implode(
779+
$regexReserved = '(?>' . implode(
778780
'|',
779781
$this->quoteRegex($sortByLengthFx($this->reserved)),
780782
) . ')';
781-
$this->regexReservedToplevel = str_replace(' ', '\s+', '(?>' . implode(
783+
$regexReservedToplevel = '(?>' . str_replace(' ', '\s+', implode(
782784
'|',
783785
$this->quoteRegex($sortByLengthFx($this->reservedToplevel)),
784-
) . ')');
785-
$this->regexReservedNewline = str_replace(' ', '\s+', '(?>' . implode(
786+
)) . ')';
787+
$regexReservedNewline = '(?>' . str_replace(' ', '\s+', implode(
786788
'|',
787789
$this->quoteRegex($sortByLengthFx($this->reservedNewline)),
788-
) . ')');
790+
)) . ')';
791+
$regexFunction = '(?>' . implode('|', $this->quoteRegex($sortByLengthFx($this->functions))) . ')';
789792

790-
$this->regexFunction = '(?>' . implode('|', $this->quoteRegex($sortByLengthFx($this->functions))) . ')';
793+
$this->nextTokenRegexNumber = '/\G(\d+(\.\d+)?|0x[\da-fA-F]+|0b[01]+)($|\s|"\'`|' . $regexBoundaries . ')/';
794+
$this->nextTokenRegexBoundaryCharacter = '/\G(' . $regexBoundaries . ')/';
795+
$this->nextTokenRegexReservedToplevel = '/\G(' . $regexReservedToplevel . ')($|\s|' . $regexBoundaries . ')/';
796+
$this->nextTokenRegexReservedNewline = '/\G(' . $regexReservedNewline . ')($|\s|' . $regexBoundaries . ')/';
797+
$this->nextTokenRegexReserved = '/\G(' . $regexReserved . ')($|\s|' . $regexBoundaries . ')/';
798+
$this->nextTokenRegexFunction = '/\G(' . $regexFunction . '[(]|\s|[)])/';
799+
$this->nextTokenRegexNonReserved = '/\G(.*?)($|\s|["\'`]|' . $regexBoundaries . ')/';
791800
}
792801

793802
/**
@@ -883,7 +892,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
883892
$value = $firstChar . $this->getNextQuotedString($string, $offset + 1);
884893
} else {
885894
// Non-quoted variable name
886-
preg_match('/\G(' . $firstChar . '[\w.$]+)/', $string, $matches, 0, $offset);
895+
preg_match('/\G([@:][\w.$]+)/', $string, $matches, 0, $offset);
887896
if ($matches) {
888897
$value = $matches[1];
889898
}
@@ -897,7 +906,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
897906
// Number (decimal, binary, or hex)
898907
if (
899908
preg_match(
900-
'/\G(\d+(\.\d+)?|0x[\da-fA-F]+|0b[01]+)($|\s|"\'`|' . $this->regexBoundaries . ')/',
909+
$this->nextTokenRegexNumber,
901910
$string,
902911
$matches,
903912
0,
@@ -908,7 +917,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
908917
}
909918

910919
// Boundary Character (punctuation and symbols)
911-
if (preg_match('/\G(' . $this->regexBoundaries . ')/', $string, $matches, 0, $offset)) {
920+
if (preg_match($this->nextTokenRegexBoundaryCharacter, $string, $matches, 0, $offset)) {
912921
return new Token(Token::TOKEN_TYPE_BOUNDARY, $matches[1]);
913922
}
914923

@@ -918,7 +927,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
918927
// Top Level Reserved Word
919928
if (
920929
preg_match(
921-
'/\G(' . $this->regexReservedToplevel . ')($|\s|' . $this->regexBoundaries . ')/',
930+
$this->nextTokenRegexReservedToplevel,
922931
$upper,
923932
$matches,
924933
0,
@@ -934,7 +943,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
934943
// Newline Reserved Word
935944
if (
936945
preg_match(
937-
'/\G(' . $this->regexReservedNewline . ')($|\s|' . $this->regexBoundaries . ')/',
946+
$this->nextTokenRegexReservedNewline,
938947
$upper,
939948
$matches,
940949
0,
@@ -950,7 +959,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
950959
// Other Reserved Word
951960
if (
952961
preg_match(
953-
'/\G(' . $this->regexReserved . ')($|\s|' . $this->regexBoundaries . ')/',
962+
$this->nextTokenRegexReserved,
954963
$upper,
955964
$matches,
956965
0,
@@ -965,17 +974,16 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
965974
}
966975

967976
// A function must be succeeded by '('
968-
// this makes it so "count(" is considered a function, but "count" alone is not
969-
// function
970-
if (preg_match('/\G(' . $this->regexFunction . '[(]|\s|[)])/', $upper, $matches, 0, $offset)) {
977+
// this makes it so "count(" is considered a function, but "count" alone is not a function
978+
if (preg_match($this->nextTokenRegexFunction, $upper, $matches, 0, $offset)) {
971979
return new Token(
972980
Token::TOKEN_TYPE_RESERVED,
973981
substr($string, $offset, strlen($matches[1]) - 1),
974982
);
975983
}
976984

977985
// Non reserved word
978-
preg_match('/\G(.*?)($|\s|["\'`]|' . $this->regexBoundaries . ')/', $string, $matches, 0, $offset);
986+
preg_match($this->nextTokenRegexNonReserved, $string, $matches, 0, $offset);
979987

980988
return new Token(Token::TOKEN_TYPE_WORD, $matches[1]);
981989
}

0 commit comments

Comments
 (0)