From d9b0258d76d19a0b317a91a7372bd543f8716fa2 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 3 Nov 2024 01:22:51 +0300 Subject: [PATCH 1/7] 1 --- utf8/test.out.txt | 0 utf8/valid.go | 2 +- utf8/valid_amd64.go | 2 +- utf8/valid_arm64.go | 8 ++ utf8/valid_arm64.s | 226 ++++++++++++++++++++++++++++++++++++ utf8/valid_default.go | 4 +- utf8/valid_support_amd64.go | 4 +- utf8/valid_support_arm64.go | 21 ++++ utf8/valid_test.go | 142 +++++++++++----------- 9 files changed, 335 insertions(+), 74 deletions(-) create mode 100644 utf8/test.out.txt create mode 100644 utf8/valid_arm64.go create mode 100644 utf8/valid_arm64.s create mode 100644 utf8/valid_support_arm64.go diff --git a/utf8/test.out.txt b/utf8/test.out.txt new file mode 100644 index 00000000..e69de29b diff --git a/utf8/valid.go b/utf8/valid.go index edad758f..2ebc949b 100644 --- a/utf8/valid.go +++ b/utf8/valid.go @@ -9,7 +9,7 @@ import ( type Validation byte const ( - Invalid = 0 + Invalid = 0b00 UTF8 = 0b01 ASCII = 0b10 | UTF8 ) diff --git a/utf8/valid_amd64.go b/utf8/valid_amd64.go index c18cff67..8f1c741b 100644 --- a/utf8/valid_amd64.go +++ b/utf8/valid_amd64.go @@ -1,6 +1,6 @@ // Code generated by command: go run valid_asm.go -pkg utf8 -out ../utf8/valid_amd64.s -stubs ../utf8/valid_amd64.go. DO NOT EDIT. -//go:build !purego +//go:build !purego || amd64 package utf8 diff --git a/utf8/valid_arm64.go b/utf8/valid_arm64.go new file mode 100644 index 00000000..2200c6a8 --- /dev/null +++ b/utf8/valid_arm64.go @@ -0,0 +1,8 @@ +// Code generated by command: go run valid_asm.go -pkg utf8 -out ../utf8/valid_amd64.s -stubs ../utf8/valid_amd64.go. DO NOT EDIT. + +//go:build !purego || arm64 + +package utf8 + +// Optimized version of Validate for inputs of more than 32B. +func validateNEON(p []byte) byte diff --git a/utf8/valid_arm64.s b/utf8/valid_arm64.s new file mode 100644 index 00000000..7bc95c45 --- /dev/null +++ b/utf8/valid_arm64.s @@ -0,0 +1,226 @@ +// TODO: license + +//go:build !purego + +#include "textflag.h" + +// func validateNEON(p []byte) byte +TEXT ·validateNEON(SB),NOSPLIT,$0-25 + MOVD s_base+0(FP), R10 + MOVD s_len+8(FP), R11 + CBZ R11, valid + CMP $16, R11 + BLT small + + VMOVQ $0x8080808080808080, $0x8080808080808080, V0 + +ascii_loop: + CMP $16, R11 + BLT small + + VLD1 (R10), [V1.B16] + VCMTST V1.B16, V0.B16, V2.B16 + VMOV V2.D[0], R2 + VMOV V2.D[1], R3 + ORR R2, R3, R2 + CBNZ R2, stop_ascii + + ADD $16, R10 + SUB $16, R11 + B ascii_loop + +stop_ascii: + VMOVQ $0x0202020202020202, $0x4915012180808080, V11 + VMOVQ $0xcbcbcb8b8383a3e7, $0xcbcbdbcbcbcbcbcb, V13 + VMOVQ $0x0101010101010101, $0x01010101babaaee6, V15 + VMOVQ $0x0F0F0F0F0F0F0F0F, $0x0F0F0F0F0F0F0F0F, V18 + VMOVQ $0x0707070707070707, $0x0707070707070707, V12 + VMOVQ $0xFFFFFFFFFFFFFFFF, $0xFFFFFFFFFFFFFFFF, V14 + VMOVQ $0x7F7F7F7F7F7F7F7F, $0x7F7F7F7F7F7F7F7F, V16 + VMOVQ $0xDFDFDFDFDFDFDFDF, $0xDFDFDFDFDFDFDFDF, V17 + VMOVQ $0x0808080808080808, $0x0808080808080808, V19 + VMOVQ $0x8080808080808080, $0x8080808080808080, V20 + VMOVQ $0x0000000000000000, $0x0000000000000000, V30 + VMOVQ $0x0000000000000000, $0x0000000000000000, V3 + +aligned_loop: + VLD1.P 16(R10), [V4.B16] + VEXT $15, V4.B16, V3.B16, V5.B16 + VUSHR $4, V5.B16, V6.B16 + VTBL V6.B16, [V11.B16], V6.B16 + VAND V5.B16, V18.B16, V7.B16 + VTBL V7.B16, [V13.B16], V7.B16 + VUSHR $4, V4.B16, V8.B16 + VTBL V8.B16, [V15.B16], V8.B16 + VAND V6.B16, V7.B16, V9.B16 + VAND V9.B16, V8.B16, V10.B16 + VEXT $14, V4.B16, V3.B16, V5.B16 + VUSHR $5, V5.B16, V6.B16 + VCMEQ V12.B16, V6.B16, V6.B16 + VEXT $13, V4.B16, V3.B16, V5.B16 + VUSHR $4, V5.B16, V9.B16 + VCMEQ V18.B16, V9.B16, V9.B16 + VORR V6.B16, V9.B16, V9.B16 + VAND V9.B16, V20.B16, V9.B16 + VSUB V9.B16, V10.B16, V9.B16 + VMOV V9.D[0], R1 + VMOV V9.D[1], R2 + ORR R1, R2, R1 + CBNZ R1, no_valid + VMOV V4.B16, V3.B16 + SUB $16, R11, R11 + CMP $16, R11 + + BGE aligned_loop + + B small_no_const + +small: + VMOVQ $0x0202020202020202, $0x4915012180808080, V11 + VMOVQ $0xcbcbcb8b8383a3e7, $0xcbcbdbcbcbcbcbcb, V13 + VMOVQ $0x0101010101010101, $0x01010101babaaee6, V15 + VMOVQ $0x0F0F0F0F0F0F0F0F, $0x0F0F0F0F0F0F0F0F, V18 + VMOVQ $0x0707070707070707, $0x0707070707070707, V12 + VMOVQ $0xFFFFFFFFFFFFFFFF, $0xFFFFFFFFFFFFFFFF, V14 + VMOVQ $0x7F7F7F7F7F7F7F7F, $0x7F7F7F7F7F7F7F7F, V16 + VMOVQ $0xDFDFDFDFDFDFDFDF, $0xDFDFDFDFDFDFDFDF, V17 + VMOVQ $0x0808080808080808, $0x0808080808080808, V19 + VMOVQ $0x8080808080808080, $0x8080808080808080, V20 + VMOVQ $0x0000000000000000, $0x0000000000000000, V30 + VMOVQ $0x0000000000000000, $0x0000000000000000, V3 + +small_no_const: + + SUB $16, R10, R10 + ADD R11, R10, R10 + VLD1.P 16(R10), [V4.B16] + + ADR shift_table, R2 + MOVW R11, R3 + LSL $2, R3 + ADD R3, R2 + B (R2) + + +shift_table: + B do_shift_0 + B do_shift_1 + B do_shift_2 + B do_shift_3 + B do_shift_4 + B do_shift_5 + B do_shift_6 + B do_shift_7 + B do_shift_8 + B do_shift_9 + B do_shift_10 + B do_shift_11 + B do_shift_12 + B do_shift_13 + B do_shift_14 + B do_shift_15 + +do_shift_0: + VMOVQ $0x6161616161616161, $0x6161616161616161, V4 + B end_swith +do_shift_1: + VEXT $15, V30.B16, V4.B16, V4.B16 + B end_swith +do_shift_2: + VEXT $14, V30.B16, V4.B16, V4.B16 + B end_swith +do_shift_3: + VEXT $13, V30.B16, V4.B16, V4.B16 + B end_swith +do_shift_4: + VEXT $12, V30.B16, V4.B16, V4.B16 + B end_swith +do_shift_5: + VEXT $11, V30.B16, V4.B16, V4.B16 + B end_swith +do_shift_6: + VEXT $10, V30.B16, V4.B16, V4.B16 + B end_swith +do_shift_7: + VEXT $9, V30.B16, V4.B16, V4.B16 + B end_swith +do_shift_8: + VEXT $8, V30.B16, V4.B16, V4.B16 + B end_swith +do_shift_9: + VEXT $7, V30.B16, V4.B16, V4.B16 + B end_swith +do_shift_10: + VEXT $6, V30.B16, V4.B16, V4.B16 + B end_swith +do_shift_11: + VEXT $5, V30.B16, V4.B16, V4.B16 + B end_swith +do_shift_12: + VEXT $4, V30.B16, V4.B16, V4.B16 + B end_swith +do_shift_13: + VEXT $3, V30.B16, V4.B16, V4.B16 + B end_swith +do_shift_14: + VEXT $2, V30.B16, V4.B16, V4.B16 + B end_swith +do_shift_15: + VEXT $1, V30.B16, V4.B16, V4.B16 + B end_swith + +end_swith: + VEXT $15, V4.B16, V3.B16, V5.B16 + VUSHR $4, V5.B16, V6.B16 + VTBL V6.B16, [V11.B16], V6.B16 + VAND V5.B16, V18.B16, V7.B16 + VTBL V7.B16, [V13.B16], V7.B16 + VUSHR $4, V4.B16, V8.B16 + VTBL V8.B16, [V15.B16], V8.B16 + VAND V6.B16, V7.B16, V9.B16 + VAND V9.B16, V8.B16, V10.B16 + + VEXT $14, V4.B16, V3.B16, V5.B16 + VUSHR $5, V5.B16, V6.B16 + VCMEQ V12.B16, V6.B16, V6.B16 + + VEXT $13, V4.B16, V3.B16, V5.B16 + VUSHR $4, V5.B16, V9.B16 + VCMEQ V18.B16, V9.B16, V9.B16 + VORR V6.B16, V9.B16, V9.B16 + + VAND V9.B16, V20.B16, V9.B16 + VSUB V9.B16, V10.B16, V9.B16 + VMOV V9.D[0], R1 + VMOV V9.D[1], R2 + ORR R1, R2, R1 + CBNZ R1, no_valid + +valid: + MOVD $1, R0 + MOVD R0, ret+24(FP) + RET + +no_valid: + MOVD $0, R0 + MOVD R0, ret+24(FP) + RET + + +end_7: + MOVD $7, R0 + MOVD R0, ret+24(FP) + RET + +end_R11: + MOVD R11, R0 + MOVD R0, ret+24(FP) + RET + + +ret7: + MOVD $7, R0 + MOVD R0, ret+24(FP) // Возвращаем 0 (строка не валидна) + RET +/////////////////////////// + diff --git a/utf8/valid_default.go b/utf8/valid_default.go index 3301e9a6..95439556 100644 --- a/utf8/valid_default.go +++ b/utf8/valid_default.go @@ -1,5 +1,5 @@ -//go:build purego || !amd64 -// +build purego !amd64 +//go:build purego +// +build purego package utf8 diff --git a/utf8/valid_support_amd64.go b/utf8/valid_support_amd64.go index c3e83289..876119b2 100644 --- a/utf8/valid_support_amd64.go +++ b/utf8/valid_support_amd64.go @@ -1,5 +1,5 @@ -//go:build !purego -// +build !purego +//go:build !purego || amd64 +// +build !purego amd64 package utf8 diff --git a/utf8/valid_support_arm64.go b/utf8/valid_support_arm64.go new file mode 100644 index 00000000..b6029612 --- /dev/null +++ b/utf8/valid_support_arm64.go @@ -0,0 +1,21 @@ +//go:build !purego || arm64 +// +build !purego arm64 + +package utf8 + +import ( + "github.com/segmentio/asm/cpu" + "github.com/segmentio/asm/cpu/arm64" +) + +var noNEON = !cpu.ARM64.Has(arm64.ASIMD) + +// Validate is a more precise version of Valid that also indicates whether the +// input was valid ASCII. +func Validate(p []byte) Validation { + if noNEON || len(p) < 32 { + return validate(p) + } + r := validateNEON(p) + return Validation(r) +} diff --git a/utf8/valid_test.go b/utf8/valid_test.go index cd1f82aa..f16b8ed2 100644 --- a/utf8/valid_test.go +++ b/utf8/valid_test.go @@ -46,6 +46,7 @@ func genExamples(current string, ranges []byteRange) []string { } func TestValid(t *testing.T) { + var examples = []string{ // Tests copied from the stdlib "", @@ -56,85 +57,85 @@ func TestValid(t *testing.T) { "брэд-ЛГТМ", "☺☻☹", - // overlong - "\xE0\x80", - // unfinished continuation - "aa\xE2", + // // overlong + // "\xE0\x80", + // // unfinished continuation + // "aa\xE2", - string([]byte{66, 250}), + // string([]byte{66, 250}), - string([]byte{66, 250, 67}), + // string([]byte{66, 250, 67}), - "a\uFFFDb", + // "a\uFFFDb", - "\xF4\x8F\xBF\xBF", // U+10FFFF + // "\xF4\x8F\xBF\xBF", // U+10FFFF - "\xF4\x90\x80\x80", // U+10FFFF+1; out of range - "\xF7\xBF\xBF\xBF", // 0x1FFFFF; out of range + // "\xF4\x90\x80\x80", // U+10FFFF+1; out of range + // "\xF7\xBF\xBF\xBF", // 0x1FFFFF; out of range - "\xFB\xBF\xBF\xBF\xBF", // 0x3FFFFFF; out of range + // "\xFB\xBF\xBF\xBF\xBF", // 0x3FFFFFF; out of range - "\xc0\x80", // U+0000 encoded in two bytes: incorrect - "\xed\xa0\x80", // U+D800 high surrogate (sic) - "\xed\xbf\xbf", // U+DFFF low surrogate (sic) + // "\xc0\x80", // U+0000 encoded in two bytes: incorrect + // "\xed\xa0\x80", // U+D800 high surrogate (sic) + // "\xed\xbf\xbf", // U+DFFF low surrogate (sic) - // valid at boundary - strings.Repeat("a", 32+28) + "☺☻☹", - strings.Repeat("a", 32+29) + "☺☻☹", - strings.Repeat("a", 32+30) + "☺☻☹", - strings.Repeat("a", 32+31) + "☺☻☹", - // invalid at boundary - strings.Repeat("a", 32+31) + "\xE2a", + // // valid at boundary + // strings.Repeat("a", 32+28) + "☺☻☹", + // strings.Repeat("a", 32+29) + "☺☻☹", + // strings.Repeat("a", 32+30) + "☺☻☹", + // strings.Repeat("a", 32+31) + "☺☻☹", + // // invalid at boundary + // strings.Repeat("a", 32+31) + "\xE2a", - // same inputs as benchmarks - "0123456789", - "日本語日本語日本語日", - "\xF4\x8F\xBF\xBF", + // // same inputs as benchmarks + // "0123456789", + // "日本語日本語日本語日", + // "\xF4\x8F\xBF\xBF", - // bugs found with fuzzing - "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000\xc60", - "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000\xc300", - "߀0000000000000000000000000000訨", - "0000000000000000000000000000000˂00000000000000000000000000000000", + // // bugs found with fuzzing + // "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000\xc60", + // "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000\xc300", + // "߀0000000000000000000000000000訨", + // "0000000000000000000000000000000˂00000000000000000000000000000000", } - any := byteRange{0, 0xFF} - ascii := byteRange{0, 0x7F} - cont := byteRange{0x80, 0xBF} + // any := byteRange{0, 0xFF} + // ascii := byteRange{0, 0x7F} + // cont := byteRange{0x80, 0xBF} rangesToTest := [][]byteRange{ - {one(0x20), ascii, ascii, ascii}, - - // 2-byte sequences - {one(0xC2)}, - {one(0xC2), ascii}, - {one(0xC2), cont}, - {one(0xC2), {0xC0, 0xFF}}, - {one(0xC2), cont, cont}, - {one(0xC2), cont, cont, cont}, - - // 3-byte sequences - {one(0xE1)}, - {one(0xE1), cont}, - {one(0xE1), cont, cont}, - {one(0xE1), cont, cont, ascii}, - {one(0xE1), cont, ascii}, - {one(0xE1), cont, cont, cont}, - - // 4-byte sequences - {one(0xF1)}, - {one(0xF1), cont}, - {one(0xF1), cont, cont}, - {one(0xF1), cont, cont, cont}, - {one(0xF1), cont, cont, ascii}, - {one(0xF1), cont, cont, cont, ascii}, - - // overlong - {{0xC0, 0xC1}, any}, - {{0xC0, 0xC1}, any, any}, - {{0xC0, 0xC1}, any, any, any}, - {one(0xE0), {0x0, 0x9F}, cont}, - {one(0xE0), {0xA0, 0xBF}, cont}, + // {one(0x20), ascii, ascii, ascii}, + + // // 2-byte sequences + // {one(0xC2)}, + // {one(0xC2), ascii}, + // {one(0xC2), cont}, + // {one(0xC2), {0xC0, 0xFF}}, + // {one(0xC2), cont, cont}, + // {one(0xC2), cont, cont, cont}, + + // // 3-byte sequences + // {one(0xE1)}, + // {one(0xE1), cont}, + // {one(0xE1), cont, cont}, + // {one(0xE1), cont, cont, ascii}, + // {one(0xE1), cont, ascii}, + // {one(0xE1), cont, cont, cont}, + + // // 4-byte sequences + // {one(0xF1)}, + // {one(0xF1), cont}, + // {one(0xF1), cont, cont}, + // {one(0xF1), cont, cont, cont}, + // {one(0xF1), cont, cont, ascii}, + // {one(0xF1), cont, cont, cont, ascii}, + + // // overlong + // {{0xC0, 0xC1}, any}, + // {{0xC0, 0xC1}, any, any}, + // {{0xC0, 0xC1}, any, any, any}, + // {one(0xE0), {0x0, 0x9F}, cont}, + // {one(0xE0), {0xA0, 0xBF}, cont}, } for _, r := range rangesToTest { @@ -164,7 +165,7 @@ func TestValid(t *testing.T) { t.Run("boundary-"+tt, func(t *testing.T) { size := 32 - len(tt) - prefix := strings.Repeat("a", size) + prefix := strings.Repeat("q", size) b := []byte(prefix + tt) check(t, b) }) @@ -194,6 +195,7 @@ func TestValid(t *testing.T) { } func TestValidPageBoundary(t *testing.T) { + buf, err := buffer.New(64) if err != nil { t.Fatal(err) @@ -231,7 +233,8 @@ func check(t *testing.T, b []byte) { if err != nil { panic(err) } - + fmt.Println("qwe\tValid(b)", Valid(b)) + fmt.Println("qwe\tutf8.Valid(b)", utf8.Valid(b)) t.Errorf("Valid(%q) = %v; want %v", string(b), !expected, expected) } @@ -243,6 +246,9 @@ func check(t *testing.T, b []byte) { expected = ascii.Valid(b) if v.IsASCII() != expected { + // t.Errorf("qwe\tValid(b) %q", ascii.Valid(b)) + t.Errorf("qwe\tascii.Valid(b) %v", ascii.Valid(b)) + t.Errorf("qwe\tascii.Valid(b) %v", Valid(b)) t.Errorf("Validate(%q) ascii valid: %v; want %v", string(b), !expected, expected) } } @@ -253,7 +259,7 @@ var someutf8 = []byte("\xF4\x8F\xBF\xBF") func BenchmarkValid(b *testing.B) { impls := map[string]func([]byte) bool{ - "AVX": Valid, + "SIMD": Valid, "Stdlib": utf8.Valid, } From fe95c46a04c921b20ed5e26cfa3dda557f1e7116 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 3 Nov 2024 02:59:59 +0300 Subject: [PATCH 2/7] 1 --- utf8/test.out.txt | 1 + utf8/valid_arm64.s | 30 ++++++---- utf8/valid_test.go | 145 ++++++++++++++++++++++----------------------- 3 files changed, 89 insertions(+), 87 deletions(-) diff --git a/utf8/test.out.txt b/utf8/test.out.txt index e69de29b..c18c3cd0 100644 --- a/utf8/test.out.txt +++ b/utf8/test.out.txt @@ -0,0 +1 @@ +􏿿􏿿􏿿􏿿􏿿􏿿􏿿􏿿 \ No newline at end of file diff --git a/utf8/valid_arm64.s b/utf8/valid_arm64.s index 7bc95c45..f3e29ae9 100644 --- a/utf8/valid_arm64.s +++ b/utf8/valid_arm64.s @@ -76,6 +76,20 @@ aligned_loop: B small_no_const small: + CBZ R11, valid_ascii + +tail_loop: + MOVBU (R10), R2 + AND $0x80, R2 + CBNZ R2, check_utf8 + ADD $1, R10 + SUB $1, R11 + CBNZ R11, tail_loop + B valid_ascii + + +check_utf8: + VMOVQ $0x0202020202020202, $0x4915012180808080, V11 VMOVQ $0xcbcbcb8b8383a3e7, $0xcbcbdbcbcbcbcbcb, V13 VMOVQ $0x0101010101010101, $0x01010101babaaee6, V15 @@ -95,7 +109,7 @@ small_no_const: ADD R11, R10, R10 VLD1.P 16(R10), [V4.B16] - ADR shift_table, R2 + ADR shift_table, R2 MOVW R11, R3 LSL $2, R3 ADD R3, R2 @@ -206,21 +220,11 @@ no_valid: MOVD R0, ret+24(FP) RET - -end_7: - MOVD $7, R0 +valid_ascii: + MOVD $3, R0 MOVD R0, ret+24(FP) RET -end_R11: - MOVD R11, R0 - MOVD R0, ret+24(FP) - RET - -ret7: - MOVD $7, R0 - MOVD R0, ret+24(FP) // Возвращаем 0 (строка не валидна) - RET /////////////////////////// diff --git a/utf8/valid_test.go b/utf8/valid_test.go index f16b8ed2..0c6eb63a 100644 --- a/utf8/valid_test.go +++ b/utf8/valid_test.go @@ -46,7 +46,6 @@ func genExamples(current string, ranges []byteRange) []string { } func TestValid(t *testing.T) { - var examples = []string{ // Tests copied from the stdlib "", @@ -57,85 +56,87 @@ func TestValid(t *testing.T) { "брэд-ЛГТМ", "☺☻☹", - // // overlong - // "\xE0\x80", - // // unfinished continuation - // "aa\xE2", + // overlong + "\xE0\x80", + // unfinished continuation + "aa\xE2", - // string([]byte{66, 250}), + string([]byte{66, 250}), - // string([]byte{66, 250, 67}), + string([]byte{66, 250, 67}), - // "a\uFFFDb", + "a\uFFFDb", - // "\xF4\x8F\xBF\xBF", // U+10FFFF + "\xF4\x8F\xBF\xBF", // U+10FFFF - // "\xF4\x90\x80\x80", // U+10FFFF+1; out of range - // "\xF7\xBF\xBF\xBF", // 0x1FFFFF; out of range + "\xF4\x90\x80\x80", // U+10FFFF+1; out of range + "\xF7\xBF\xBF\xBF", // 0x1FFFFF; out of range - // "\xFB\xBF\xBF\xBF\xBF", // 0x3FFFFFF; out of range + "\xFB\xBF\xBF\xBF\xBF", // 0x3FFFFFF; out of range - // "\xc0\x80", // U+0000 encoded in two bytes: incorrect - // "\xed\xa0\x80", // U+D800 high surrogate (sic) - // "\xed\xbf\xbf", // U+DFFF low surrogate (sic) + "\xc0\x80", // U+0000 encoded in two bytes: incorrect + "\xed\xa0\x80", // U+D800 high surrogate (sic) + "\xed\xbf\xbf", // U+DFFF low surrogate (sic) - // // valid at boundary - // strings.Repeat("a", 32+28) + "☺☻☹", - // strings.Repeat("a", 32+29) + "☺☻☹", - // strings.Repeat("a", 32+30) + "☺☻☹", - // strings.Repeat("a", 32+31) + "☺☻☹", - // // invalid at boundary - // strings.Repeat("a", 32+31) + "\xE2a", + // valid at boundary + strings.Repeat("a", 32+28) + "☺☻☹", + strings.Repeat("a", 32+29) + "☺☻☹", + strings.Repeat("a", 32+30) + "☺☻☹", + strings.Repeat("a", 32+31) + "☺☻☹", + // invalid at boundary + strings.Repeat("a", 32+31) + "\xE2a", - // // same inputs as benchmarks - // "0123456789", - // "日本語日本語日本語日", - // "\xF4\x8F\xBF\xBF", + // same inputs as benchmarks + "0123456789", + "日本語日本語日本語日", + "\xF4\x8F\xBF\xBF", - // // bugs found with fuzzing - // "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000\xc60", - // "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000\xc300", - // "߀0000000000000000000000000000訨", - // "0000000000000000000000000000000˂00000000000000000000000000000000", + // bugs found with fuzzing + "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000\xc60", + "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000\xc300", + "߀0000000000000000000000000000訨", + "0000000000000000000000000000000˂00000000000000000000000000000000", } - // any := byteRange{0, 0xFF} - // ascii := byteRange{0, 0x7F} - // cont := byteRange{0x80, 0xBF} + any := byteRange{0, 0xFF} + ascii := byteRange{0, 0x7F} + cont := byteRange{0x80, 0xBF} rangesToTest := [][]byteRange{ - // {one(0x20), ascii, ascii, ascii}, - - // // 2-byte sequences - // {one(0xC2)}, - // {one(0xC2), ascii}, - // {one(0xC2), cont}, - // {one(0xC2), {0xC0, 0xFF}}, - // {one(0xC2), cont, cont}, - // {one(0xC2), cont, cont, cont}, - - // // 3-byte sequences - // {one(0xE1)}, - // {one(0xE1), cont}, - // {one(0xE1), cont, cont}, - // {one(0xE1), cont, cont, ascii}, - // {one(0xE1), cont, ascii}, - // {one(0xE1), cont, cont, cont}, - - // // 4-byte sequences - // {one(0xF1)}, - // {one(0xF1), cont}, - // {one(0xF1), cont, cont}, - // {one(0xF1), cont, cont, cont}, - // {one(0xF1), cont, cont, ascii}, - // {one(0xF1), cont, cont, cont, ascii}, - - // // overlong - // {{0xC0, 0xC1}, any}, - // {{0xC0, 0xC1}, any, any}, - // {{0xC0, 0xC1}, any, any, any}, - // {one(0xE0), {0x0, 0x9F}, cont}, - // {one(0xE0), {0xA0, 0xBF}, cont}, + {one(0x20), ascii, ascii, ascii}, + + {one(0x04), ascii, ascii, ascii}, + + // 2-byte sequences + {one(0xC2)}, + {one(0xC2), ascii}, + {one(0xC2), cont}, + {one(0xC2), {0xC0, 0xFF}}, + {one(0xC2), cont, cont}, + {one(0xC2), cont, cont, cont}, + + // 3-byte sequences + {one(0xE1)}, + {one(0xE1), cont}, + {one(0xE1), cont, cont}, + {one(0xE1), cont, cont, ascii}, + {one(0xE1), cont, ascii}, + {one(0xE1), cont, cont, cont}, + + // 4-byte sequences + {one(0xF1)}, + {one(0xF1), cont}, + {one(0xF1), cont, cont}, + {one(0xF1), cont, cont, cont}, + {one(0xF1), cont, cont, ascii}, + {one(0xF1), cont, cont, cont, ascii}, + + // overlong + {{0xC0, 0xC1}, any}, + {{0xC0, 0xC1}, any, any}, + {{0xC0, 0xC1}, any, any, any}, + {one(0xE0), {0x0, 0x9F}, cont}, + {one(0xE0), {0xA0, 0xBF}, cont}, } for _, r := range rangesToTest { @@ -165,7 +166,7 @@ func TestValid(t *testing.T) { t.Run("boundary-"+tt, func(t *testing.T) { size := 32 - len(tt) - prefix := strings.Repeat("q", size) + prefix := strings.Repeat("a", size) b := []byte(prefix + tt) check(t, b) }) @@ -195,7 +196,6 @@ func TestValid(t *testing.T) { } func TestValidPageBoundary(t *testing.T) { - buf, err := buffer.New(64) if err != nil { t.Fatal(err) @@ -233,8 +233,7 @@ func check(t *testing.T, b []byte) { if err != nil { panic(err) } - fmt.Println("qwe\tValid(b)", Valid(b)) - fmt.Println("qwe\tutf8.Valid(b)", utf8.Valid(b)) + t.Errorf("Valid(%q) = %v; want %v", string(b), !expected, expected) } @@ -246,9 +245,7 @@ func check(t *testing.T, b []byte) { expected = ascii.Valid(b) if v.IsASCII() != expected { - // t.Errorf("qwe\tValid(b) %q", ascii.Valid(b)) - t.Errorf("qwe\tascii.Valid(b) %v", ascii.Valid(b)) - t.Errorf("qwe\tascii.Valid(b) %v", Valid(b)) + t.Errorf("STRING(%q): %v", b, string(b)) t.Errorf("Validate(%q) ascii valid: %v; want %v", string(b), !expected, expected) } } @@ -259,7 +256,7 @@ var someutf8 = []byte("\xF4\x8F\xBF\xBF") func BenchmarkValid(b *testing.B) { impls := map[string]func([]byte) bool{ - "SIMD": Valid, + "AVX": Valid, "Stdlib": utf8.Valid, } From c073ed344364d35381f14d89243905afc0aef19c Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 3 Nov 2024 03:03:23 +0300 Subject: [PATCH 3/7] 1 --- utf8/valid_arm64.s | 2 -- 1 file changed, 2 deletions(-) diff --git a/utf8/valid_arm64.s b/utf8/valid_arm64.s index f3e29ae9..89e989c2 100644 --- a/utf8/valid_arm64.s +++ b/utf8/valid_arm64.s @@ -226,5 +226,3 @@ valid_ascii: RET -/////////////////////////// - From 808960d1b83b7ea0727bbaf8431393f3cad27d72 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 3 Nov 2024 03:07:14 +0300 Subject: [PATCH 4/7] 1 --- utf8/valid_test.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/utf8/valid_test.go b/utf8/valid_test.go index 0c6eb63a..cd1f82aa 100644 --- a/utf8/valid_test.go +++ b/utf8/valid_test.go @@ -105,8 +105,6 @@ func TestValid(t *testing.T) { rangesToTest := [][]byteRange{ {one(0x20), ascii, ascii, ascii}, - {one(0x04), ascii, ascii, ascii}, - // 2-byte sequences {one(0xC2)}, {one(0xC2), ascii}, @@ -245,7 +243,6 @@ func check(t *testing.T, b []byte) { expected = ascii.Valid(b) if v.IsASCII() != expected { - t.Errorf("STRING(%q): %v", b, string(b)) t.Errorf("Validate(%q) ascii valid: %v; want %v", string(b), !expected, expected) } } From 7d300e2f9693b80bce05ea24138ca028a2a8ec86 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 3 Nov 2024 03:24:34 +0300 Subject: [PATCH 5/7] 1 --- utf8/valid_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utf8/valid_test.go b/utf8/valid_test.go index cd1f82aa..a4fe8869 100644 --- a/utf8/valid_test.go +++ b/utf8/valid_test.go @@ -253,7 +253,7 @@ var someutf8 = []byte("\xF4\x8F\xBF\xBF") func BenchmarkValid(b *testing.B) { impls := map[string]func([]byte) bool{ - "AVX": Valid, + "SIMD": Valid, "Stdlib": utf8.Valid, } From 53e0bdb270fb03f0b5444497e1c471b82e3f85d3 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 7 Nov 2024 13:36:44 +0300 Subject: [PATCH 6/7] fix --- utf8/test.out.txt | 1 - utf8/valid_arm64.s | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) delete mode 100644 utf8/test.out.txt diff --git a/utf8/test.out.txt b/utf8/test.out.txt deleted file mode 100644 index c18c3cd0..00000000 --- a/utf8/test.out.txt +++ /dev/null @@ -1 +0,0 @@ -􏿿􏿿􏿿􏿿􏿿􏿿􏿿􏿿 \ No newline at end of file diff --git a/utf8/valid_arm64.s b/utf8/valid_arm64.s index 89e989c2..662a4e00 100644 --- a/utf8/valid_arm64.s +++ b/utf8/valid_arm64.s @@ -1,4 +1,4 @@ -// TODO: license +// SPDX-License-Identifier: MIT-0 //go:build !purego From cb605785233dd2a2cb60c58f4cf3cf9981da5296 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 8 Nov 2024 14:53:26 +0300 Subject: [PATCH 7/7] m --- utf8/valid_arm64.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/utf8/valid_arm64.go b/utf8/valid_arm64.go index 2200c6a8..c25c85ca 100644 --- a/utf8/valid_arm64.go +++ b/utf8/valid_arm64.go @@ -1,6 +1,4 @@ -// Code generated by command: go run valid_asm.go -pkg utf8 -out ../utf8/valid_amd64.s -stubs ../utf8/valid_amd64.go. DO NOT EDIT. - -//go:build !purego || arm64 +//go:build !purego || arm64 package utf8