@@ -1,5 +1,4 @@
 const std = @import("../std.zig");
-const mem = std.mem;
 
 pub const Token = struct {
     tag: Tag,
@@ -350,7 +349,7 @@ pub const Tokenizer = struct {
 
     pub fn init(buffer: [:0]const u8) Tokenizer {
         // Skip the UTF-8 BOM if present
-        const src_start = if (mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else @as(usize, 0);
+        const src_start: usize = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0;
         return Tokenizer{
             .buffer = buffer,
             .index = src_start,
@@ -1433,8 +1432,8 @@ pub const Tokenizer = struct {
 
     fn getInvalidCharacterLength(self: *Tokenizer) u3 {
         const c0 = self.buffer[self.index];
-        if (c0 < 0x80) {
-            if (c0 < 0x20 or c0 == 0x7f) {
+        if (std.ascii.isASCII(c0)) {
+            if (std.ascii.isCntrl(c0)) {
                 // ascii control codes are never allowed
                 // (note that \n was checked before we got here)
                 return 1;
@@ -1469,8 +1468,8 @@ pub const Tokenizer = struct {
     }
 };
 
-test "tokenizer" {
-    try testTokenize("test", &.{.keyword_test});
+test "keywords" {
+    try testTokenize("test const else", &.{ .keyword_test, .keyword_const, .keyword_else });
 }
 
 test "line comment followed by top-level comptime" {
@@ -1485,7 +1484,7 @@ test "line comment followed by top-level comptime" {
     });
 }
 
-test "tokenizer - unknown length pointer and then c pointer" {
+test "unknown length pointer and then c pointer" {
     try testTokenize(
         \\[*]u8
         \\[*c]u8
@@ -1502,7 +1501,7 @@ test "tokenizer - unknown length pointer and then c pointer" {
     });
 }
 
-test "tokenizer - code point literal with hex escape" {
+test "code point literal with hex escape" {
     try testTokenize(
         \\'\x1b'
     , &.{.char_literal});
@@ -1511,21 +1510,21 @@ test "tokenizer - code point literal with hex escape" {
     , &.{ .invalid, .invalid });
 }
 
-test "tokenizer - newline in char literal" {
+test "newline in char literal" {
     try testTokenize(
         \\'
         \\'
     , &.{ .invalid, .invalid });
 }
 
-test "tokenizer - newline in string literal" {
+test "newline in string literal" {
     try testTokenize(
         \\"
         \\"
     , &.{ .invalid, .string_literal });
 }
 
-test "tokenizer - code point literal with unicode escapes" {
+test "code point literal with unicode escapes" {
     // Valid unicode escapes
     try testTokenize(
         \\'\u{3}'
@@ -1575,13 +1574,13 @@ test "tokenizer - code point literal with unicode escapes" {
     , &.{ .invalid, .integer_literal, .invalid });
 }
 
-test "tokenizer - code point literal with unicode code point" {
+test "code point literal with unicode code point" {
     try testTokenize(
         \\'💩'
     , &.{.char_literal});
 }
 
-test "tokenizer - float literal e exponent" {
+test "float literal e exponent" {
     try testTokenize("a = 4.94065645841246544177e-324;\n", &.{
         .identifier,
         .equal,
@@ -1590,7 +1589,7 @@ test "tokenizer - float literal e exponent" {
     });
 }
 
-test "tokenizer - float literal p exponent" {
+test "float literal p exponent" {
     try testTokenize("a = 0x1.a827999fcef32p+1022;\n", &.{
         .identifier,
         .equal,
@@ -1599,19 +1598,19 @@ test "tokenizer - float literal p exponent" {
     });
 }
 
-test "tokenizer - chars" {
+test "chars" {
     try testTokenize("'c'", &.{.char_literal});
 }
 
-test "tokenizer - invalid token characters" {
+test "invalid token characters" {
     try testTokenize("#", &.{.invalid});
     try testTokenize("`", &.{.invalid});
     try testTokenize("'c", &.{.invalid});
     try testTokenize("'", &.{.invalid});
     try testTokenize("''", &.{ .invalid, .invalid });
 }
 
-test "tokenizer - invalid literal/comment characters" {
+test "invalid literal/comment characters" {
     try testTokenize("\"\x00\"", &.{
         .string_literal,
         .invalid,
@@ -1627,12 +1626,12 @@ test "tokenizer - invalid literal/comment characters" {
     });
 }
 
-test "tokenizer - utf8" {
+test "utf8" {
     try testTokenize("//\xc2\x80", &.{});
     try testTokenize("//\xf4\x8f\xbf\xbf", &.{});
 }
 
-test "tokenizer - invalid utf8" {
+test "invalid utf8" {
     try testTokenize("//\x80", &.{
         .invalid,
     });
@@ -1659,7 +1658,7 @@ test "tokenizer - invalid utf8" {
     });
 }
 
-test "tokenizer - illegal unicode codepoints" {
+test "illegal unicode codepoints" {
     // unicode newline characters.U+0085, U+2028, U+2029
     try testTokenize("//\xc2\x84", &.{});
     try testTokenize("//\xc2\x85", &.{
@@ -1676,7 +1675,7 @@ test "tokenizer - illegal unicode codepoints" {
     try testTokenize("//\xe2\x80\xaa", &.{});
 }
 
-test "tokenizer - string identifier and builtin fns" {
+test "string identifier and builtin fns" {
     try testTokenize(
         \\const @"if" = @import("std");
     , &.{
@@ -1691,15 +1690,15 @@ test "tokenizer - string identifier and builtin fns" {
     });
 }
 
-test "tokenizer - multiline string literal with literal tab" {
+test "multiline string literal with literal tab" {
     try testTokenize(
         \\\\foo	bar
     , &.{
         .multiline_string_literal_line,
     });
 }
 
-test "tokenizer - comments with literal tab" {
+test "comments with literal tab" {
     try testTokenize(
         \\//foo	bar
         \\//!foo	bar
@@ -1715,14 +1714,14 @@ test "tokenizer - comments with literal tab" {
     });
 }
 
-test "tokenizer - pipe and then invalid" {
+test "pipe and then invalid" {
     try testTokenize("||=", &.{
         .pipe_pipe,
         .equal,
     });
 }
 
-test "tokenizer - line comment and doc comment" {
+test "line comment and doc comment" {
     try testTokenize("//", &.{});
     try testTokenize("// a / b", &.{});
     try testTokenize("// /", &.{});
@@ -1733,7 +1732,7 @@ test "tokenizer - line comment and doc comment" {
     try testTokenize("//!!", &.{.container_doc_comment});
 }
 
-test "tokenizer - line comment followed by identifier" {
+test "line comment followed by identifier" {
     try testTokenize(
         \\    Unexpected,
         \\    // another
@@ -1746,7 +1745,7 @@ test "tokenizer - line comment followed by identifier" {
     });
 }
 
-test "tokenizer - UTF-8 BOM is recognized and skipped" {
+test "UTF-8 BOM is recognized and skipped" {
     try testTokenize("\xEF\xBB\xBFa;\n", &.{
         .identifier,
         .semicolon,
@@ -1788,15 +1787,15 @@ test "correctly parse pointer dereference followed by asterisk" {
     });
 }
 
-test "tokenizer - range literals" {
+test "range literals" {
     try testTokenize("0...9", &.{ .integer_literal, .ellipsis3, .integer_literal });
     try testTokenize("'0'...'9'", &.{ .char_literal, .ellipsis3, .char_literal });
     try testTokenize("0x00...0x09", &.{ .integer_literal, .ellipsis3, .integer_literal });
     try testTokenize("0b00...0b11", &.{ .integer_literal, .ellipsis3, .integer_literal });
     try testTokenize("0o00...0o11", &.{ .integer_literal, .ellipsis3, .integer_literal });
 }
 
-test "tokenizer - number literals decimal" {
+test "number literals decimal" {
     try testTokenize("0", &.{.integer_literal});
     try testTokenize("1", &.{.integer_literal});
     try testTokenize("2", &.{.integer_literal});
@@ -1863,7 +1862,7 @@ test "tokenizer - number literals decimal" {
     try testTokenize("1.0e0_+", &.{ .invalid, .plus });
 }
 
-test "tokenizer - number literals binary" {
+test "number literals binary" {
     try testTokenize("0b0", &.{.integer_literal});
     try testTokenize("0b1", &.{.integer_literal});
     try testTokenize("0b2", &.{ .invalid, .integer_literal });
@@ -1902,7 +1901,7 @@ test "tokenizer - number literals binary" {
     try testTokenize("0b1_,", &.{ .invalid, .comma });
 }
 
-test "tokenizer - number literals octal" {
+test "number literals octal" {
     try testTokenize("0o0", &.{.integer_literal});
     try testTokenize("0o1", &.{.integer_literal});
     try testTokenize("0o2", &.{.integer_literal});
@@ -1941,7 +1940,7 @@ test "tokenizer - number literals octal" {
     try testTokenize("0o_,", &.{ .invalid, .identifier, .comma });
 }
 
-test "tokenizer - number literals hexadecimal" {
+test "number literals hexadecimal" {
     try testTokenize("0x0", &.{.integer_literal});
     try testTokenize("0x1", &.{.integer_literal});
     try testTokenize("0x2", &.{.integer_literal});
@@ -2029,22 +2028,22 @@ test "tokenizer - number literals hexadecimal" {
     try testTokenize("0x0.0p0_", &.{ .invalid, .eof });
 }
 
-test "tokenizer - multi line string literal with only 1 backslash" {
+test "multi line string literal with only 1 backslash" {
     try testTokenize("x \\\n;", &.{ .identifier, .invalid, .semicolon });
 }
 
-test "tokenizer - invalid builtin identifiers" {
+test "invalid builtin identifiers" {
     try testTokenize("@()", &.{ .invalid, .l_paren, .r_paren });
     try testTokenize("@0()", &.{ .invalid, .integer_literal, .l_paren, .r_paren });
 }
 
-test "tokenizer - invalid token with unfinished escape right before eof" {
+test "invalid token with unfinished escape right before eof" {
     try testTokenize("\"\\", &.{.invalid});
     try testTokenize("'\\", &.{.invalid});
     try testTokenize("'\\u", &.{.invalid});
 }
 
-test "tokenizer - saturating" {
+test "saturating operators" {
     try testTokenize("<<", &.{.angle_bracket_angle_bracket_left});
     try testTokenize("<<|", &.{.angle_bracket_angle_bracket_left_pipe});
     try testTokenize("<<|=", &.{.angle_bracket_angle_bracket_left_pipe_equal});
@@ -2062,17 +2061,14 @@ test "tokenizer - saturating" {
     try testTokenize("-|=", &.{.minus_pipe_equal});
 }
 
-fn testTokenize(source: [:0]const u8, expected_tokens: []const Token.Tag) !void {
+fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
     var tokenizer = Tokenizer.init(source);
-    for (expected_tokens) |expected_token_id| {
+    for (expected_token_tags) |expected_token_tag| {
         const token = tokenizer.next();
-        if (token.tag != expected_token_id) {
-            std.debug.panic("expected {s}, found {s}\n", .{
-                @tagName(expected_token_id), @tagName(token.tag),
-            });
-        }
+        try std.testing.expectEqual(expected_token_tag, token.tag);
     }
     const last_token = tokenizer.next();
     try std.testing.expectEqual(Token.Tag.eof, last_token.tag);
     try std.testing.expectEqual(source.len, last_token.loc.start);
+    try std.testing.expectEqual(source.len, last_token.loc.end);
 }