diff --git a/utf8/valid.go b/utf8/valid.go
index edad758..2ebc949 100644
--- a/utf8/valid.go
+++ b/utf8/valid.go
@@ -9,7 +9,7 @@ import (
 type Validation byte
 
 const (
-	Invalid = 0
+	Invalid = 0b00
 	UTF8    = 0b01
 	ASCII   = 0b10 | UTF8
 )
diff --git a/utf8/valid_arm64.go b/utf8/valid_arm64.go
new file mode 100644
index 0000000..c25c85c
--- /dev/null
+++ b/utf8/valid_arm64.go
@@ -0,0 +1,6 @@
+//go:build !purego
+
+package utf8
+
+// Optimized version of Validate for inputs of more than 32B.
+func validateNEON(p []byte) byte
diff --git a/utf8/valid_arm64.s b/utf8/valid_arm64.s
new file mode 100644
index 0000000..662a4e0
--- /dev/null
+++ b/utf8/valid_arm64.s
@@ -0,0 +1,228 @@
+// SPDX-License-Identifier: MIT-0
+
+//go:build !purego
+
+#include "textflag.h"
+
+// func validateNEON(p []byte) byte
+// Returns 0 (invalid), 1 (valid UTF-8) or 3 (valid ASCII), matching the
+// Invalid/UTF8/ASCII constants declared in valid.go.
+TEXT ·validateNEON(SB),NOSPLIT,$0-25
+	MOVD p_base+0(FP), R10
+	MOVD p_len+8(FP), R11
+	CBZ R11, valid
+	CMP $16, R11
+	BLT small
+
+	VMOVQ $0x8080808080808080, $0x8080808080808080, V0
+
+ascii_loop:
+	CMP $16, R11
+	BLT small
+
+	VLD1 (R10), [V1.B16]
+	VCMTST V1.B16, V0.B16, V2.B16
+	VMOV V2.D[0], R2
+	VMOV V2.D[1], R3
+	ORR R2, R3, R2
+	CBNZ R2, stop_ascii
+
+	ADD $16, R10
+	SUB $16, R11
+	B ascii_loop
+
+stop_ascii:
+	VMOVQ $0x0202020202020202, $0x4915012180808080, V11
+	VMOVQ $0xcbcbcb8b8383a3e7, $0xcbcbdbcbcbcbcbcb, V13
+	VMOVQ $0x0101010101010101, $0x01010101babaaee6, V15
+	VMOVQ $0x0F0F0F0F0F0F0F0F, $0x0F0F0F0F0F0F0F0F, V18
+	VMOVQ $0x0707070707070707, $0x0707070707070707, V12
+	VMOVQ $0xFFFFFFFFFFFFFFFF, $0xFFFFFFFFFFFFFFFF, V14
+	VMOVQ $0x7F7F7F7F7F7F7F7F, $0x7F7F7F7F7F7F7F7F, V16
+	VMOVQ $0xDFDFDFDFDFDFDFDF, $0xDFDFDFDFDFDFDFDF, V17
+	VMOVQ $0x0808080808080808, $0x0808080808080808, V19
+	VMOVQ $0x8080808080808080, $0x8080808080808080, V20
+	VMOVQ $0x0000000000000000, $0x0000000000000000, V30
+	VMOVQ $0x0000000000000000, $0x0000000000000000, V3
+
+aligned_loop:
+	VLD1.P 16(R10), [V4.B16]
+	VEXT $15, V4.B16, V3.B16, V5.B16
+	VUSHR $4, V5.B16, V6.B16
+	VTBL V6.B16, [V11.B16], V6.B16
+	VAND V5.B16, V18.B16, V7.B16
+	VTBL V7.B16, [V13.B16], V7.B16
+	VUSHR $4, V4.B16, V8.B16
+	VTBL V8.B16, [V15.B16], V8.B16
+	VAND V6.B16, V7.B16, V9.B16
+	VAND V9.B16, V8.B16, V10.B16
+	VEXT $14, V4.B16, V3.B16, V5.B16
+	VUSHR $5, V5.B16, V6.B16
+	VCMEQ V12.B16, V6.B16, V6.B16
+	VEXT $13, V4.B16, V3.B16, V5.B16
+	VUSHR $4, V5.B16, V9.B16
+	VCMEQ V18.B16, V9.B16, V9.B16
+	VORR V6.B16, V9.B16, V9.B16
+	VAND V9.B16, V20.B16, V9.B16
+	VSUB V9.B16, V10.B16, V9.B16
+	VMOV V9.D[0], R1
+	VMOV V9.D[1], R2
+	ORR R1, R2, R1
+	CBNZ R1, no_valid
+	VMOV V4.B16, V3.B16
+	SUB $16, R11, R11
+	CMP $16, R11
+
+	BGE aligned_loop
+
+	B small_no_const
+
+small:
+	CBZ R11, valid_ascii
+
+tail_loop:
+	MOVBU (R10), R2
+	AND $0x80, R2
+	CBNZ R2, check_utf8
+	ADD $1, R10
+	SUB $1, R11
+	CBNZ R11, tail_loop
+	B valid_ascii
+
+
+check_utf8:
+
+	VMOVQ $0x0202020202020202, $0x4915012180808080, V11
+	VMOVQ $0xcbcbcb8b8383a3e7, $0xcbcbdbcbcbcbcbcb, V13
+	VMOVQ $0x0101010101010101, $0x01010101babaaee6, V15
+	VMOVQ $0x0F0F0F0F0F0F0F0F, $0x0F0F0F0F0F0F0F0F, V18
+	VMOVQ $0x0707070707070707, $0x0707070707070707, V12
+	VMOVQ $0xFFFFFFFFFFFFFFFF, $0xFFFFFFFFFFFFFFFF, V14
+	VMOVQ $0x7F7F7F7F7F7F7F7F, $0x7F7F7F7F7F7F7F7F, V16
+	VMOVQ $0xDFDFDFDFDFDFDFDF, $0xDFDFDFDFDFDFDFDF, V17
+	VMOVQ $0x0808080808080808, $0x0808080808080808, V19
+	VMOVQ $0x8080808080808080, $0x8080808080808080, V20
+	VMOVQ $0x0000000000000000, $0x0000000000000000, V30
+	VMOVQ $0x0000000000000000, $0x0000000000000000, V3
+
+small_no_const:
+
+	SUB $16, R10, R10
+	ADD R11, R10, R10
+	VLD1.P 16(R10), [V4.B16]
+
+	ADR shift_table, R2
+	MOVW R11, R3
+	LSL $2, R3
+	ADD R3, R2
+	B (R2)
+
+
+shift_table:
+	B do_shift_0
+	B do_shift_1
+	B do_shift_2
+	B do_shift_3
+	B do_shift_4
+	B do_shift_5
+	B do_shift_6
+	B do_shift_7
+	B do_shift_8
+	B do_shift_9
+	B do_shift_10
+	B do_shift_11
+	B do_shift_12
+	B do_shift_13
+	B do_shift_14
+	B do_shift_15
+
+do_shift_0:
+	VMOVQ $0x6161616161616161, $0x6161616161616161, V4
+	B end_switch
+do_shift_1:
+	VEXT $15, V30.B16, V4.B16, V4.B16
+	B end_switch
+do_shift_2:
+	VEXT $14, V30.B16, V4.B16, V4.B16
+	B end_switch
+do_shift_3:
+	VEXT $13, V30.B16, V4.B16, V4.B16
+	B end_switch
+do_shift_4:
+	VEXT $12, V30.B16, V4.B16, V4.B16
+	B end_switch
+do_shift_5:
+	VEXT $11, V30.B16, V4.B16, V4.B16
+	B end_switch
+do_shift_6:
+	VEXT $10, V30.B16, V4.B16, V4.B16
+	B end_switch
+do_shift_7:
+	VEXT $9, V30.B16, V4.B16, V4.B16
+	B end_switch
+do_shift_8:
+	VEXT $8, V30.B16, V4.B16, V4.B16
+	B end_switch
+do_shift_9:
+	VEXT $7, V30.B16, V4.B16, V4.B16
+	B end_switch
+do_shift_10:
+	VEXT $6, V30.B16, V4.B16, V4.B16
+	B end_switch
+do_shift_11:
+	VEXT $5, V30.B16, V4.B16, V4.B16
+	B end_switch
+do_shift_12:
+	VEXT $4, V30.B16, V4.B16, V4.B16
+	B end_switch
+do_shift_13:
+	VEXT $3, V30.B16, V4.B16, V4.B16
+	B end_switch
+do_shift_14:
+	VEXT $2, V30.B16, V4.B16, V4.B16
+	B end_switch
+do_shift_15:
+	VEXT $1, V30.B16, V4.B16, V4.B16
+	B end_switch
+
+end_switch:
+	VEXT $15, V4.B16, V3.B16, V5.B16
+	VUSHR $4, V5.B16, V6.B16
+	VTBL V6.B16, [V11.B16], V6.B16
+	VAND V5.B16, V18.B16, V7.B16
+	VTBL V7.B16, [V13.B16], V7.B16
+	VUSHR $4, V4.B16, V8.B16
+	VTBL V8.B16, [V15.B16], V8.B16
+	VAND V6.B16, V7.B16, V9.B16
+	VAND V9.B16, V8.B16, V10.B16
+
+	VEXT $14, V4.B16, V3.B16, V5.B16
+	VUSHR $5, V5.B16, V6.B16
+	VCMEQ V12.B16, V6.B16, V6.B16
+
+	VEXT $13, V4.B16, V3.B16, V5.B16
+	VUSHR $4, V5.B16, V9.B16
+	VCMEQ V18.B16, V9.B16, V9.B16
+	VORR V6.B16, V9.B16, V9.B16
+
+	VAND V9.B16, V20.B16, V9.B16
+	VSUB V9.B16, V10.B16, V9.B16
+	VMOV V9.D[0], R1
+	VMOV V9.D[1], R2
+	ORR R1, R2, R1
+	CBNZ R1, no_valid
+
+valid:
+	MOVD $1, R0
+	MOVD R0, ret+24(FP)
+	RET
+
+no_valid:
+	MOVD $0, R0
+	MOVD R0, ret+24(FP)
+	RET
+
+valid_ascii:
+	MOVD $3, R0
+	MOVD R0, ret+24(FP)
+	RET
diff --git a/utf8/valid_default.go b/utf8/valid_default.go
index 3301e9a..9543955 100644
--- a/utf8/valid_default.go
+++ b/utf8/valid_default.go
@@ -1,5 +1,5 @@
-//go:build purego || !amd64
-// +build purego !amd64
+//go:build purego || !(amd64 || arm64)
+// +build purego !amd64,!arm64
 
 package utf8
 
diff --git a/utf8/valid_support_arm64.go b/utf8/valid_support_arm64.go
new file mode 100644
index 0000000..b602961
--- /dev/null
+++ b/utf8/valid_support_arm64.go
@@ -0,0 +1,22 @@
+//go:build !purego
+// +build !purego
+
+package utf8
+
+import (
+	"github.com/segmentio/asm/cpu"
+	"github.com/segmentio/asm/cpu/arm64"
+)
+
+var noNEON = !cpu.ARM64.Has(arm64.ASIMD)
+
+// Validate is a more precise version of Valid that also indicates whether the
+// input was valid ASCII.
+func Validate(p []byte) Validation {
+	// Short inputs are not worth the NEON setup cost; use the portable path.
+	if noNEON || len(p) < 32 {
+		return validate(p)
+	}
+	r := validateNEON(p)
+	return Validation(r)
+}
diff --git a/utf8/valid_test.go b/utf8/valid_test.go
index cd1f82a..a4fe886 100644
--- a/utf8/valid_test.go
+++ b/utf8/valid_test.go
@@ -253,7 +253,7 @@ var someutf8 = []byte("\xF4\x8F\xBF\xBF")
 
 func BenchmarkValid(b *testing.B) {
 	impls := map[string]func([]byte) bool{
-		"AVX":    Valid,
+		"SIMD":   Valid,
 		"Stdlib": utf8.Valid,
 	}