diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b852110a32..28c15cf473 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -260,6 +260,7 @@ jobs: - aarch64_be-unknown-linux-gnu - armv7-unknown-linux-gnueabihf - arm-unknown-linux-gnueabihf + - x86_64-unknown-linux-gnu profile: [dev, release] include: - target: aarch64_be-unknown-linux-gnu diff --git a/Cargo.lock b/Cargo.lock index ff503f3035..70f09adf2c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -347,8 +347,11 @@ dependencies = [ "itertools", "log", "pretty_env_logger", + "quick-xml 0.37.5", "rayon", + "regex", "serde", + "serde-xml-rs", "serde_json", ] @@ -452,6 +455,16 @@ dependencies = [ "serde", ] +[[package]] +name = "quick-xml" +version = "0.37.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "quickcheck" version = "1.0.3" @@ -587,6 +600,18 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "serde-xml-rs" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53630160a98edebde0123eb4dfd0fce6adff091b2305db3154a9e920206eb510" +dependencies = [ + "log", + "serde", + "thiserror", + "xml-rs", +] + [[package]] name = "serde_derive" version = "1.0.219" @@ -698,7 +723,7 @@ name = "stdarch-verify" version = "0.1.0" dependencies = [ "proc-macro2", - "quick-xml", + "quick-xml 0.33.0", "quote", "serde", "serde_json", @@ -746,6 +771,26 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "unicode-ident" version = "1.0.18" @@ -958,6 +1003,12 @@ version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" +[[package]] +name = "xml-rs" +version = "0.8.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fd8403733700263c6eb89f192880191f1b83e332f7a20371ddcf421c4a337c7" + [[package]] name = "yaml-rust" version = "0.4.5" diff --git a/ci/docker/x86_64-unknown-linux-gnu/Dockerfile b/ci/docker/x86_64-unknown-linux-gnu/Dockerfile index bbebe2d7fa..2743896375 100644 --- a/ci/docker/x86_64-unknown-linux-gnu/Dockerfile +++ b/ci/docker/x86_64-unknown-linux-gnu/Dockerfile @@ -6,7 +6,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ make \ ca-certificates \ wget \ - xz-utils + xz-utils \ + clang \ + libstdc++-14-dev \ + build-essential \ + lld RUN wget http://ci-mirrors.rust-lang.org/stdarch/sde-external-9.58.0-2025-06-16-lin.tar.xz -O sde.tar.xz RUN mkdir intel-sde diff --git a/ci/intrinsic-test.sh b/ci/intrinsic-test.sh index 469e9e21c7..e14a824b2a 100755 --- a/ci/intrinsic-test.sh +++ b/ci/intrinsic-test.sh @@ -66,6 +66,14 @@ case ${TARGET} in TEST_CXX_COMPILER="clang++" TEST_RUNNER="${CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_RUNNER}" ;; + + x86_64-unknown-linux-gnu*) + TEST_CPPFLAGS="-fuse-ld=lld -I/usr/include/x86_64-linux-gnu/" + TEST_CXX_COMPILER="clang++" + TEST_RUNNER="${CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER}" + TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_x86.txt + TEST_SAMPLE_INTRINSICS_PERCENTAGE=5 + ;; *) ;; @@ -94,6 +102,22 @@ case "${TARGET}" in --linker "${CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_LINKER}" \ --cxx-toolchain-dir "${AARCH64_BE_TOOLCHAIN}" ;; + + x86_64-unknown-linux-gnu*) + # `CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER` is not necessary for `intrinsic-test` + # because the binary needs to run directly on the host. + # Hence the use of `env -u`. + env -u CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER \ + CPPFLAGS="${TEST_CPPFLAGS}" RUSTFLAGS="${HOST_RUSTFLAGS}" \ + RUST_LOG=warn RUST_BACKTRACE=1 \ + cargo run "${INTRINSIC_TEST}" "${PROFILE}" \ + --bin intrinsic-test -- intrinsics_data/x86-intel.xml \ + --runner "${TEST_RUNNER}" \ + --skip "${TEST_SKIP_INTRINSICS}" \ + --cppcompiler "${TEST_CXX_COMPILER}" \ + --target "${TARGET}" \ + --sample-percentage "${TEST_SAMPLE_INTRINSICS_PERCENTAGE}" + ;; *) ;; esac diff --git a/crates/intrinsic-test/Cargo.toml b/crates/intrinsic-test/Cargo.toml index fbbf90e140..2c0f53897e 100644 --- a/crates/intrinsic-test/Cargo.toml +++ b/crates/intrinsic-test/Cargo.toml @@ -19,3 +19,6 @@ pretty_env_logger = "0.5.0" rayon = "1.5.0" diff = "0.1.12" itertools = "0.14.0" +quick-xml = { version = "0.37.5", features = ["serialize", "overlapped-lists"] } +serde-xml-rs = "0.8.0" +regex = "1.11.1" diff --git a/crates/intrinsic-test/missing_x86.txt b/crates/intrinsic-test/missing_x86.txt new file mode 100644 index 0000000000..58e37b92a1 --- /dev/null +++ b/crates/intrinsic-test/missing_x86.txt @@ -0,0 +1,904 @@ +# Are defined under a similar name + +#__bswap_64 +_bswap64 + +# Provides pointer to allocated memory, which is difficult to test +_mm_malloc + +# requires target feature 'waitpkg', but would be inlined into function that is compiled without support for 'waitpkg' +_tpause +_umwait + +# `use of undeclared identifier` error in Clang +_bit_scan_forward +_bit_scan_reverse +_bswap +_castf32_u32 +_castf64_u64 +_castu32_f32 +_castu64_f64 +_lrotl +_lrotr +_may_i_use_cpu_feature +_may_i_use_cpu_feature_ext +_mm256_acos_pd +_mm256_acos_ph +_mm256_acos_ps +_mm256_acosh_pd +_mm256_acosh_ph +_mm256_acosh_ps +_mm256_asin_pd +_mm256_asin_ph +_mm256_asin_ps +_mm256_asinh_pd +_mm256_asinh_ph +_mm256_asinh_ps +_mm256_atan_pd +_mm256_atan_ps +_mm256_atan_ph +_mm256_atan2_pd +_mm256_atan2_ph +_mm256_atan2_ps +_mm256_atanh_pd +_mm256_atanh_ph +_mm256_atanh_ps +_mm256_cbrt_pd +_mm256_cbrt_ph +_mm256_cbrt_ps +_mm256_cdfnorm_pd +_mm256_cdfnorm_ph +_mm256_cdfnorm_ps +_mm256_cdfnorminv_pd +_mm256_cdfnorminv_ph +_mm256_cdfnorminv_ps +_mm256_cexp_ps +_mm256_cos_pd +_mm256_cos_ph +_mm256_cos_ps +_mm256_cosd_pd +_mm256_cosd_ph +_mm256_cosd_ps +_mm256_cosh_pd +_mm256_cosh_ph +_mm256_cosh_ps +_mm256_csqrt_ps +_mm256_div_epi16 +_mm256_div_epi32 +_mm256_div_epi64 +_mm256_div_epi8 +_mm256_div_epu16 +_mm256_div_epu32 +_mm256_div_epu64 +_mm256_div_epu8 +_mm256_dpbssd_epi32 +_mm256_dpbssds_epi32 +_mm256_dpbsud_epi32 +_mm256_dpbsuds_epi32 +_mm256_dpbuud_epi32 +_mm256_dpbuuds_epi32 +_mm256_dpwsud_epi32 +_mm256_dpwsuds_epi32 +_mm256_dpwusd_epi32 +_mm256_dpwusds_epi32 +_mm256_dpwuud_epi32 +_mm256_dpwuuds_epi32 +_mm256_erf_pd +_mm256_erf_ps +_mm256_erfc_pd +_mm256_erfc_ph +_mm256_erfc_ps +_mm256_erfcinv_pd +_mm256_erfcinv_ph +_mm256_erfcinv_ps +_mm256_erfinv_pd +_mm256_erfinv_ph +_mm256_erfinv_ps +_mm256_exp10_pd +_mm256_exp10_ph +_mm256_exp10_ps +_mm256_exp2_pd +_mm256_exp2_ph +_mm256_exp2_ps +_mm256_exp_pd +_mm256_exp_ph +_mm256_exp_ps +_mm256_expm1_pd +_mm256_expm1_ph +_mm256_expm1_ps +_mm256_hypot_pd +_mm256_hypot_ph +_mm256_hypot_ps +_mm256_idiv_epi32 +_mm256_invcbrt_pd +_mm256_invcbrt_ph +_mm256_invcbrt_ps +_mm256_invsqrt_pd +_mm256_invsqrt_ph +_mm256_invsqrt_ps +_mm256_irem_epi32 +_mm256_log10_pd +_mm256_log10_ph +_mm256_log10_ps +_mm256_log1p_pd +_mm256_log1p_ph +_mm256_log1p_ps +_mm256_log2_pd +_mm256_log2_ph +_mm256_log2_ps +_mm256_log_pd +_mm256_log_ph +_mm256_log_ps +_mm256_logb_pd +_mm256_logb_ph +_mm256_logb_ps +_mm256_clog_ps +_mm256_madd52hi_avx_epu64 +_mm256_madd52lo_avx_epu64 +_mm256_erf_ph +_mm256_mask_reduce_add_epi16 +_mm256_mask_reduce_add_epi8 +_mm256_mask_reduce_and_epi16 +_mm256_mask_reduce_and_epi8 +_mm256_mask_reduce_max_epi16 +_mm256_mask_reduce_max_epi8 +_mm256_mask_reduce_max_epu16 +_mm256_mask_reduce_max_epu8 +_mm256_mask_reduce_min_epi16 +_mm256_mask_reduce_min_epi8 +_mm256_mask_reduce_min_epu16 +_mm256_mask_reduce_min_epu8 +_mm256_mask_reduce_mul_epi16 +_mm256_mask_reduce_mul_epi8 +_mm256_mask_reduce_or_epi16 +_mm256_mask_reduce_or_epi8 +_mm512_cosd_ph +_mm512_cosd_ps +_mm512_cosh_pd +_mm512_cosh_ph +_mm512_cosh_ps +_mm512_div_epi16 +_mm512_div_epi32 +_mm512_div_epi64 +_mm512_div_epi8 +_mm512_div_epu16 +_mm512_div_epu32 +_mm512_div_epu64 +_mm512_div_epu8 +_mm512_erf_pd +_mm512_erf_ph +_mm512_erf_ps +_mm512_erfc_pd +_mm512_erfc_ph +_mm512_erfc_ps +_mm512_erfcinv_pd +_mm512_erfcinv_ph +_mm512_erfcinv_ps +_mm512_erfinv_pd +_mm512_erfinv_ph +_mm512_erfinv_ps +_mm512_exp10_pd +_mm512_exp10_ph +_mm512_exp10_ps +_mm512_exp2_pd +_mm512_exp2_ph +_mm512_exp2_ps +_mm512_exp_pd +_mm512_exp_ph +_mm512_exp_ps +_mm512_expm1_pd +_mm512_expm1_ph +_mm512_expm1_ps +_mm512_floor_ph +_mm512_hypot_pd +_mm512_hypot_ph +_mm512_hypot_ps +_mm512_invsqrt_pd +_mm512_invsqrt_ph +_mm512_invsqrt_ps +_mm512_log10_pd +_mm512_log10_ph +_mm512_log10_ps +_mm512_log1p_pd +_mm512_log1p_ph +_mm512_log1p_ps +_mm512_log2_pd +_mm512_log2_ph +_mm512_log2_ps +_mm512_log_pd +_mm512_log_ph +_mm512_log_ps +_mm512_logb_pd +_mm512_logb_ph +_mm512_logb_ps +_mm512_mask_acos_pd +_mm512_mask_acos_ph +_mm512_mask_acos_ps +_mm512_mask_acosh_pd +_mm512_mask_acosh_ph +_mm512_mask_acosh_ps +_mm512_mask_asin_pd +_mm512_mask_asin_ph +_mm512_mask_asin_ps +_mm512_mask_asinh_pd +_mm512_mask_asinh_ph +_mm512_mask_asinh_ps +_mm512_mask_atan2_pd +_mm512_mask_atan2_ps +_mm512_mask_atan_pd +_mm512_mask_atan_ph +_mm512_mask_atan_ph +_mm512_mask_atanh_pd +_mm512_mask_atanh_ph +_mm512_mask_atanh_ps +_mm512_mask_cbrt_pd +_mm512_mask_cbrt_ph +_mm512_mask_cbrt_ps +_mm512_mask_cdfnorm_pd +_mm512_mask_cdfnorm_ph +_mm512_mask_cdfnorm_ps +_mm512_mask_cdfnorminv_pd +_mm512_mask_cdfnorminv_ph +_mm512_mask_cdfnorminv_ps +_mm512_mask_ceil_ph +_mm512_mask_cos_pd +_mm512_mask_cos_ph +_mm512_mask_cos_ps +_mm512_mask_cosd_pd +_mm512_mask_cosd_ph +_mm512_mask_cosd_ps +_mm512_mask_cosh_pd +_mm512_mask_cosh_ph +_mm512_mask_cosh_ps +_mm512_mask_atan_ps +_mm512_cosd_pd +_mm512_cos_ps +_mm512_cos_ph +_mm512_cos_pd +_mm512_mask_div_epi32 +_mm512_mask_div_epu32 +_mm512_mask_erf_pd +_mm512_mask_erf_ph +_mm512_mask_erf_ps +_mm512_mask_erfc_pd +_mm512_mask_erfc_ph +_mm512_mask_erfc_ps +_mm512_mask_erfcinv_pd +_mm512_mask_erfcinv_ph +_mm512_mask_erfcinv_ps +_mm512_mask_erfinv_pd +_mm512_mask_erfinv_ph +_mm512_mask_erfinv_ps +_mm512_mask_exp10_pd +_mm512_mask_exp10_ph +_mm512_mask_exp10_ps +_mm512_mask_exp2_pd +_mm512_mask_exp2_ph +_mm512_mask_exp2_ps +_mm512_mask_exp_pd +_mm512_mask_exp_ph +_mm512_mask_exp_ps +_mm512_mask_expm1_pd +_mm512_mask_expm1_ph +_mm512_mask_expm1_ps +_mm512_mask_floor_ph +_mm512_mask_hypot_pd +_mm512_mask_hypot_ps +_mm512_mask_invsqrt_pd +_mm512_mask_invsqrt_ph +_mm512_mask_invsqrt_ps +_mm512_mask_log10_pd +_mm512_mask_log10_ph +_mm512_mask_log10_ps +_mm512_mask_log1p_pd +_mm512_mask_log1p_ph +_mm512_mask_log1p_ps +_mm512_mask_log2_pd +_mm512_mask_log2_ph +_mm512_mask_log2_ps +_mm512_mask_log_pd +_mm512_mask_log_ph +_mm512_mask_log_ps +_mm512_mask_logb_pd +_mm512_mask_logb_ph +_mm512_mask_logb_ps +_mm512_mask_nearbyint_pd +_mm512_mask_nearbyint_ph +_mm512_mask_nearbyint_ps +_mm512_mask_pow_pd +_mm512_mask_pow_ps +_mm512_mask_recip_pd +_mm512_mask_recip_ph +_mm512_mask_recip_ps +_mm512_mask_rem_epi32 +_mm512_mask_rem_epu32 +_mm512_mask_rint_pd +_mm512_mask_rint_ph +_mm512_mask_rint_ps +_mm512_mask_sin_pd +_mm512_mask_sin_ph +_mm512_mask_sin_ps +_mm512_mask_sind_pd +_mm512_mask_sind_ph +_mm512_mask_sind_ps +_mm512_mask_sinh_pd +_mm512_mask_sinh_ph +_mm512_mask_sinh_ps +_mm512_mask_svml_round_pd +_mm512_mask_svml_round_ph +_mm512_mask_tan_pd +_mm512_mask_tan_ph +_mm512_mask_tan_ps +_mm512_mask_tand_pd +_mm512_mask_tand_ph +_mm512_mask_tand_ps +_mm512_mask_tanh_pd +_mm512_mask_tanh_ph +_mm512_mask_tanh_ps +_mm512_mask_trunc_pd +_mm512_mask_trunc_ph +_mm512_mask_trunc_ps +_mm512_nearbyint_pd +_mm512_nearbyint_ph +_mm512_nearbyint_ps +_mm512_pow_pd +_mm512_pow_ph +_mm512_pow_ps +_mm512_recip_pd +_mm512_recip_ph +_mm512_recip_ps +_mm512_rem_epi16 +_mm512_rem_epi32 +_mm512_rem_epi64 +_mm512_rem_epi8 +_mm512_rem_epu16 +_mm512_rem_epu32 +_mm512_rem_epu64 +_mm512_rem_epu8 +_mm512_rint_pd +_mm512_rint_ph +_mm512_rint_ps +_mm512_sin_pd +_mm512_sin_ph +_mm512_sin_ps +_mm512_sind_pd +_mm512_sind_ph +_mm512_sind_ps +_mm512_sinh_pd +_mm512_sinh_ph +_mm512_sinh_ps +_mm512_svml_round_pd +_mm512_svml_round_ph +_mm512_tan_pd +_mm512_tan_ph +_mm512_tan_ps +_mm512_tand_pd +_mm512_tand_ph +_mm512_tand_ps +_mm512_tanh_pd +_mm512_tanh_ph +_mm512_tanh_ps +_mm512_trunc_pd +_mm512_trunc_ph +_mm512_trunc_ps +_mm_acos_pd +_mm_acos_ph +_mm_acos_ps +_mm_acosh_pd +_mm_acosh_ph +_mm_acosh_ps +_mm_asin_pd +_mm_asin_ph +_mm_asin_ps +_mm_asinh_pd +_mm_asinh_ph +_mm_asinh_ps +_mm_atan2_pd +_mm_atan2_ph +_mm_atan2_ps +_mm_atan_pd +_mm_atan_ph +_mm_atan_ps +_mm_atanh_pd +_mm_atanh_ph +_mm_atanh_ps +_mm_cbrt_pd +_mm_cbrt_ph +_mm_cbrt_ps +_mm_cdfnorm_pd +_mm_cdfnorm_ph +_mm_cdfnorm_ps +_mm_cdfnorminv_pd +_mm_cdfnorminv_ph +_mm_cdfnorminv_ps +_mm_cexp_ps +_mm_clog_ps +_mm_cos_pd +_mm_cos_ph +_mm_cos_ps +_mm_cosd_pd +_mm_cosd_ph +_mm_cosd_ps +_mm_cosh_pd +_mm_cosh_ph +_mm_cosh_ps +_mm_csqrt_ps +_mm_cvtsd_si64x +_mm_cvtsi128_si64x +_mm_cvtsi64x_sd +_mm_cvtsi64x_si128 +_mm_cvttsd_si64x +_mm_div_epi16 +_mm_div_epi32 +_mm_div_epi64 +_mm_div_epi8 +_mm_div_epu16 +_mm_div_epu32 +_mm_div_epu64 +_mm_div_epu8 +_mm_dpbssd_epi32 +_mm_dpbssds_epi32 +_mm_dpbsud_epi32 +_mm_dpbsuds_epi32 +_mm_dpbuud_epi32 +_mm_dpbuuds_epi32 +_mm_dpwsud_epi32 +_mm_dpwsuds_epi32 +_mm_dpwusd_epi32 +_mm_dpwusds_epi32 +_mm_dpwuud_epi32 +_mm_dpwuuds_epi32 +_mm_erf_pd +_mm_erf_ph +_mm_erf_ps +_mm_erfc_pd +_mm_erfc_ph +_mm_erfc_ps +_mm_erfcinv_pd +_mm_erfcinv_ph +_mm_erfcinv_ps +_mm_erfinv_pd +_mm_erfinv_ph +_mm_erfinv_ps +_mm_exp10_pd +_mm_exp10_ph +_mm_exp10_ps +_mm_exp2_pd +_mm_exp2_ph +_mm_exp2_ps +_mm_exp_pd +_mm_exp_ph +_mm_exp_ps +_mm_expm1_pd +_mm_expm1_ph +_mm_expm1_ps +_mm_hypot_pd +_mm_hypot_ph +_mm_hypot_ps +_mm_idiv_epi32 +_mm_invcbrt_pd +_mm_invcbrt_ph +_mm_invcbrt_ps +_mm_invsqrt_pd +_mm_invsqrt_ph +_mm_invsqrt_ps +_mm_irem_epi32 +_mm_log10_pd +_mm_log10_ph +_mm_log10_ps +_mm_log1p_pd +_mm_log1p_ph +_mm_log1p_ps +_mm_log2_pd +_mm_log2_ph +_mm_log2_ps +_mm_log_pd +_mm_log_ph +_mm_log_ps +_mm_logb_pd +_mm_logb_ph +_mm_logb_ps +_mm_madd52hi_avx_epu64 +_mm_madd52lo_avx_epu64 +_mm_mask_reduce_add_epi16 +_mm_mask_reduce_add_epi8 +_mm_mask_reduce_and_epi16 +_mm_mask_reduce_and_epi8 +_mm_mask_reduce_max_epi16 +_mm_mask_reduce_max_epi8 +_mm_mask_reduce_max_epu16 +_mm_mask_reduce_max_epu8 +_mm_mask_reduce_min_epi16 +_mm_mask_reduce_min_epi8 +_mm_mask_reduce_min_epu16 +_mm_mask_reduce_min_epu8 +_mm_mask_reduce_mul_epi16 +_mm_mask_reduce_mul_epi8 +_mm_mask_reduce_or_epi16 +_mm_mask_reduce_or_epi8 +_mm_pow_pd +_mm_pow_ph +_mm_pow_ps +_mm_reduce_add_epi16 +_mm_reduce_add_epi8 +_mm_reduce_and_epi16 +_mm_reduce_and_epi8 +_mm_reduce_max_epi16 +_mm_reduce_max_epi8 +_mm_reduce_max_epu16 +_mm_reduce_max_epu8 +_mm_reduce_min_epi16 +_mm_reduce_min_epi8 +_mm_reduce_min_epu16 +_mm_reduce_min_epu8 +_mm_reduce_mul_epi16 +_mm_reduce_mul_epi8 +_mm_reduce_or_epi16 +_mm_reduce_or_epi8 +_mm_rem_epi16 +_mm_rem_epi32 +_mm_rem_epi64 +_mm_rem_epi8 +_mm_rem_epu16 +_mm_rem_epu32 +_mm_rem_epu64 +_mm_rem_epu8 +_mm_sin_pd +_mm_sin_ph +_mm_sin_ps +_mm_sind_pd +_mm_sind_ph +_mm_sind_ps +_mm_sinh_pd +_mm_sinh_ph +_mm_sinh_ps +_mm_sm3msg1_epi32 +_mm_sm3msg2_epi32 +_mm_sm3rnds2_epi32 +_mm_sm4key4_epi32 +_mm_sm4rnds4_epi32 +_mm_svml_ceil_pd +_mm_svml_ceil_ph +_mm_svml_ceil_ps +_mm_svml_floor_pd +_mm_svml_floor_ph +_mm_svml_floor_ps +_mm_svml_round_pd +_mm_svml_round_ph +_mm_svml_round_ps +_mm_svml_sqrt_pd +_mm_svml_sqrt_ph +_mm_svml_sqrt_ps +_mm_tan_pd +_mm_tan_ph +_mm_tan_ps +_mm_tand_pd +_mm_tand_ph +_mm_tand_ps +_mm_tanh_pd +_mm_tanh_ph +_mm_tanh_ps +_mm_trunc_pd +_mm_trunc_ph +_mm_trunc_ps +_mm_udiv_epi32 +_mm_urem_epi32 +_popcnt32 +_popcnt64 +_rdpmc +_rotl +_rotl64 +_rotr +_rotr64 +_rotwl +_rotwr +_urdmsr + +# Cannot find value in this scope (in Rust testfiles) +_mm512_set1_pch +_mm_abs_pi16 +_mm_abs_pi32 +_mm_abs_pi8 +_mm_add_pi16 +_mm_add_pi32 +_mm_add_pi8 +_mm_add_si64 +_mm_adds_pi16 +_mm_adds_pi8 +_mm_adds_pu16 +_mm_adds_pu8 +_mm_alignr_pi8 +_mm_and_si64 +_mm_andnot_si64 +_mm_avg_pu16 +_mm_avg_pu8 +_mm_cmpeq_pi16 +_mm_cmpeq_pi32 +_mm_cmpeq_pi8 +_mm_cmpgt_pi16 +_mm_cmpgt_pi32 +_mm_cmpgt_pi8 +_mm_cvt_pi2ps +_mm_cvt_ps2pi +_mm_cvtm64_si64 +_mm_cvtpd_pi32 +_mm_cvtpi16_ps +_mm_cvtpi32_pd +_mm_cvtpi32_ps +_mm_cvtpi32x2_ps +_mm_cvtpi8_ps +_mm_cvtps_pi16 +_mm_cvtps_pi32 +_mm_cvtps_pi8 +_mm_cvtpu16_ps +_mm_cvtpu8_ps +_mm_cvtsi32_si64 +_mm_cvtsi64_m64 +_mm_cvtsi64_si32 +_mm_cvtt_ps2pi +_mm_cvttpd_pi32 +_mm512_cbrt_pd +_mm512_cbrt_ph +_mm512_cbrt_ps +_mm512_cdfnorm_pd +_mm512_cdfnorm_ph +_mm512_cdfnorm_ps +_mm512_cdfnorminv_pd +_mm512_cdfnorminv_ph +_mm512_cdfnorminv_ps +_mm512_ceil_pd +_mm512_ceil_ph +_mm512_ceil_ps +_mm512_floor_pd +_mm512_floor_ps +_mm512_mask_ceil_pd +_mm512_mask_ceil_ps +_mm_max_pi16 +_mm_max_pu8 +_mm_min_pi16 +_mm_min_pu8 +_mm_movemask_pi8 +_mm_movepi64_pi64 +_mm_movpi64_epi64 +_mm_mul_su32 +_mm_mulhi_pi16 +_mm_mulhi_pu16 +_mm_mulhrs_pi16 +_mm_mullo_pi16 +_mm_or_si64 +_mm_packs_pi16 +_mm_packs_pi32 +_mm_packs_pu16 +_mm_popcnt_u32 +_mm_popcnt_u64 +_mm_sad_pu8 +_mm_set1_epi64 +_mm_set1_pch +_mm_set1_pi16 +_mm_set1_pi32 +_mm_set1_pi8 +_mm_set_epi64 +_mm_set_pi16 +_mm_set_pi32 +_mm_set_pi8 +_mm_setr_epi64 +_mm_setr_pi16 +_mm_setr_pi32 +_mm_setr_pi8 +_mm_shuffle_pi16 +_mm_shuffle_pi8 +_mm_sign_pi16 +_mm_sign_pi32 +_mm_sign_pi8 +_mm_sll_pi16 +_mm_sll_pi32 +_mm_sll_si64 +_mm_slli_pi16 +_mm_slli_pi32 +_mm_slli_si64 +_mm_sra_pi16 +_mm_sra_pi32 +_mm_srai_pi16 +_mm_srai_pi32 +_mm_srl_pi16 +_mm_srl_pi32 +_mm_srl_si64 +_mm_srli_pi16 +_mm_srli_pi32 +_mm_srli_si64 +_mm_sub_pi16 +_mm_sub_pi32 +_mm_sub_pi8 +_mm_sub_si64 +_mm_subs_pi16 +_mm_subs_pi8 +_mm_subs_pu16 +_mm_subs_pu8 +_mm_unpackhi_pi16 +_mm_unpackhi_pi32 +_mm_unpackhi_pi8 +_mm_unpacklo_pi16 +_mm_unpacklo_pi32 +_mm_unpacklo_pi8 +_mm_xor_si64 +_mm256_pow_pd +_mm256_pow_ph +_mm256_pow_ps +_mm256_rem_epi16 +_mm256_rem_epi32 +_mm256_rem_epi64 +_mm256_rem_epi8 +_mm256_rem_epu16 +_mm256_rem_epu32 +_mm256_rem_epu64 +_mm256_rem_epu8 +_mm256_set1_pch +_mm256_sin_pd +_mm256_sin_ph +_mm256_sin_ps +_mm256_sind_pd +_mm256_sind_ph +_mm256_sind_ps +_mm256_sinh_pd +_mm256_sinh_ph +_mm256_sinh_ps +_mm256_svml_ceil_pd +_mm256_svml_ceil_ph +_mm256_svml_ceil_ps +_mm256_svml_floor_pd +_mm256_svml_floor_ph +_mm256_svml_floor_ps +_mm256_svml_round_pd +_mm256_svml_round_ph +_mm256_svml_round_ps +_mm256_svml_sqrt_pd +_mm256_svml_sqrt_ph +_mm256_svml_sqrt_ps +_mm256_tan_pd +_mm256_tan_ph +_mm256_tan_ps +_mm256_tand_pd +_mm256_tand_ph +_mm256_tand_ps +_mm256_tanh_pd +_mm256_tanh_ph +_mm256_tanh_ps +_mm256_trunc_pd +_mm256_trunc_ph +_mm256_trunc_ps +_mm256_udiv_epi32 +_mm256_urem_epi32 +_mm512_acos_pd +_mm512_acos_ph +_mm512_acos_ps +_mm512_acosh_pd +_mm512_acosh_ph +_mm512_acosh_ps +_mm_cvttps_pi32 +_mm_extract_pi16 +_mm_hadd_pi16 +_mm_hadd_pi32 +_mm_hadds_pi16 +_mm_hsub_pi16 +_mm_hsub_pi32 +_mm_hsubs_pi16 +_mm_insert_pi16 +_mm_madd_pi16 +_mm_maddubs_pi16 +_mm512_asin_pd +_mm512_asin_ph +_mm512_asin_ps +_mm512_asinh_pd +_mm512_asinh_ph +_mm512_asinh_ps +_mm512_atan2_pd +_mm512_atan2_ph +_mm512_atan2_ps +_mm512_atan_pd +_mm512_atan_ph +_mm512_atan_ps +_mm512_atanh_pd +_mm512_atanh_ph +_mm512_atanh_ps +_cvtsh_ss +_cvtss_sh +_m_from_int +_m_from_int64 +_m_packssdw +_m_packsswb +_m_packuswb +_m_paddb +_m_paddd +_m_paddsb +_m_paddsw +_m_paddusb +_m_paddusw +_m_paddw +_m_pand +_m_pandn +_m_pavgb +_m_pavgw +_m_pcmpeqb +_m_pcmpeqd +_m_pcmpeqw +_m_pcmpgtb +_m_pcmpgtd +_m_pcmpgtw +_m_pextrw +_m_pinsrw +_m_pmaddwd +_m_pmaxsw +_m_pmaxub +_m_pminsw +_m_pminub +_m_pmovmskb +_m_pmulhuw +_m_pmulhw +_m_pmullw +_m_por +_m_psadbw +_m_pshufw +_m_pslld +_m_pslldi +_m_psllq +_m_psllqi +_m_psllw +_m_psllwi +_m_psrad +_m_psradi +_m_psraw +_m_psrawi +_m_psrld +_m_psrldi +_m_psrlq +_m_psrlqi +_m_psrlw +_m_psrlwi +_m_psubb +_m_psubd +_m_psubsb +_m_psubsw +_m_psubusb +_m_psubusw +_m_psubw +_m_punpckhbw +_m_punpckhdq +_m_punpckhwd +_m_punpcklbw +_m_punpckldq +_m_punpcklwd +_m_pxor +_m_to_int +_m_to_int64 +_mm512_mask_floor_pd +_mm512_mask_floor_ps + +# SDE ERROR: Cannot execute XGETBV with ECX != 0 +_xgetbv + +# Miscellaneous issues that can be fixed first +_kshiftli_mask16 +_kshiftli_mask32 +_kshiftli_mask64 +_kshiftli_mask8 +_kshiftri_mask16 +_kshiftri_mask32 +_kshiftri_mask64 +_kshiftri_mask8 +_mm256_castsi128_si256 +_mm256_extract_epi16 +_mm256_extract_epi8 +_mm512_castsi128_si512 +_mm512_castsi256_si512 +# _mm512_conj_pch +_mm512_mask_reduce_max_pd +_mm512_mask_reduce_max_ps +_mm512_mask_reduce_min_pd +_mm512_mask_reduce_min_ps +_mm_comineq_sh +_mm_extract_epi16 +_mm_extract_epi8 +_mm_mask_cvtepi16_epi8 +_mm_mask_cvtpd_epi32 +_mm_mask_cvtpd_ps +_mm_ucomineq_sh \ No newline at end of file diff --git a/crates/intrinsic-test/src/arm/config.rs b/crates/intrinsic-test/src/arm/config.rs index d9024eabfa..a634645969 100644 --- a/crates/intrinsic-test/src/arm/config.rs +++ b/crates/intrinsic-test/src/arm/config.rs @@ -3,13 +3,24 @@ pub const NOTICE: &str = "\ // test are derived from a JSON specification, published under the same license as the // `intrinsic-test` crate.\n"; -pub const POLY128_OSTREAM_DECL: &str = r#" +pub const PLATFORM_C_FORWARD_DECLARATIONS: &str = r#" #ifdef __aarch64__ std::ostream& operator<<(std::ostream& os, poly128_t value); #endif + +std::ostream& operator<<(std::ostream& os, float16_t value); +std::ostream& operator<<(std::ostream& os, uint8_t value); + +// T1 is the `To` type, T2 is the `From` type +template T1 cast(T2 x) { + static_assert(sizeof(T1) == sizeof(T2), "sizeof T1 and T2 must be the same"); + T1 ret{}; + memcpy(&ret, &x, sizeof(T1)); + return ret; +} "#; -pub const POLY128_OSTREAM_DEF: &str = r#" +pub const PLATFORM_C_DEFINITIONS: &str = r#" #ifdef __aarch64__ std::ostream& operator<<(std::ostream& os, poly128_t value) { std::stringstream temp; @@ -23,11 +34,26 @@ std::ostream& operator<<(std::ostream& os, poly128_t value) { os << res; return os; } + #endif + +std::ostream& operator<<(std::ostream& os, float16_t value) { + uint16_t temp = 0; + memcpy(&temp, &value, sizeof(float16_t)); + std::stringstream ss; + ss << "0x" << std::setfill('0') << std::setw(4) << std::hex << temp; + os << ss.str(); + return os; +} + +std::ostream& operator<<(std::ostream& os, uint8_t value) { + os << (unsigned int) value; + return os; +} "#; // Format f16 values (and vectors containing them) in a way that is consistent with C. -pub const F16_FORMATTING_DEF: &str = r#" +pub const PLATFORM_RUST_DEFINITIONS: &str = r#" /// Used to continue `Debug`ging SIMD types as `MySimd(1, 2, 3, 4)`, as they /// were before moving to array-based simd. #[inline] @@ -113,7 +139,7 @@ impl DebugHexF16 for float16x8x4_t { } "#; -pub const AARCH_CONFIGURATIONS: &str = r#" +pub const PLATFORM_RUST_CFGS: &str = r#" #![cfg_attr(target_arch = "arm", feature(stdarch_arm_neon_intrinsics))] #![cfg_attr(target_arch = "arm", feature(stdarch_aarch32_crc32))] #![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_fcma))] diff --git a/crates/intrinsic-test/src/arm/mod.rs b/crates/intrinsic-test/src/arm/mod.rs index 08dc2d3870..7fa5062e86 100644 --- a/crates/intrinsic-test/src/arm/mod.rs +++ b/crates/intrinsic-test/src/arm/mod.rs @@ -32,11 +32,11 @@ impl SupportedArchitectureTest for ArmArchitectureTest { const NOTICE: &str = config::NOTICE; const PLATFORM_C_HEADERS: &[&str] = &["arm_neon.h", "arm_acle.h", "arm_fp16.h"]; - const PLATFORM_C_DEFINITIONS: &str = config::POLY128_OSTREAM_DEF; - const PLATFORM_C_FORWARD_DECLARATIONS: &str = config::POLY128_OSTREAM_DECL; + const PLATFORM_C_DEFINITIONS: &str = config::PLATFORM_C_DEFINITIONS; + const PLATFORM_C_FORWARD_DECLARATIONS: &str = config::PLATFORM_C_FORWARD_DECLARATIONS; - const PLATFORM_RUST_DEFINITIONS: &str = config::F16_FORMATTING_DEF; - const PLATFORM_RUST_CFGS: &str = config::AARCH_CONFIGURATIONS; + const PLATFORM_RUST_DEFINITIONS: &str = config::PLATFORM_RUST_DEFINITIONS; + const PLATFORM_RUST_CFGS: &str = config::PLATFORM_RUST_CFGS; fn cpp_compilation(&self) -> Option { compile::build_cpp_compilation(&self.cli_options) diff --git a/crates/intrinsic-test/src/arm/types.rs b/crates/intrinsic-test/src/arm/types.rs index e86a2c5189..4be8d1e48b 100644 --- a/crates/intrinsic-test/src/arm/types.rs +++ b/crates/intrinsic-test/src/arm/types.rs @@ -14,10 +14,10 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType { (None, None) => format!("{const_prefix}{prefix}{bit_len}_t"), (Some(simd), None) => format!("{prefix}{bit_len}x{simd}_t"), (Some(simd), Some(vec)) => format!("{prefix}{bit_len}x{simd}x{vec}_t"), - (None, Some(_)) => todo!("{:#?}", self), // Likely an invalid case + (None, Some(_)) => todo!("{self:#?}"), // Likely an invalid case } } else { - todo!("{:#?}", self) + todo!("{self:#?}") } } @@ -58,14 +58,14 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType { // The ACLE doesn't support 64-bit polynomial loads on Armv7 // if armv7 and bl == 64, use "s", else "p" TypeKind::Poly => if choose_workaround && *bl == 64 {"s"} else {"p"}, - x => todo!("get_load_function TypeKind: {:#?}", x), + x => todo!("get_load_function TypeKind: {x:#?}"), }, size = bl, quad = quad, len = vec_len.unwrap_or(1), ) } else { - todo!("get_load_function IntrinsicType: {:#?}", self) + todo!("get_load_function IntrinsicType: {self:#?}") } } @@ -90,13 +90,13 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType { TypeKind::Int(Sign::Signed) => "s", TypeKind::Float => "f", TypeKind::Poly => "p", - x => todo!("get_load_function TypeKind: {:#?}", x), + x => todo!("get_load_function TypeKind: {x:#?}"), }, size = bl, quad = quad, ) } else { - todo!("get_lane_function IntrinsicType: {:#?}", self) + todo!("get_lane_function IntrinsicType: {self:#?}") } } @@ -112,12 +112,10 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType { ty = self.c_single_vector_type(), lanes = (0..self.num_lanes()) .map(move |idx| -> std::string::String { + let lane_fn = self.get_lane_function(); + let final_cast = self.generate_final_type_cast(); format!( - "{cast}{lane_fn}(__return_value.val[{vector}], {lane})", - cast = self.c_promotion(), - lane_fn = self.get_lane_function(), - lane = idx, - vector = vector, + "{final_cast}{lane_fn}(__return_value.val[{vector}], {idx})" ) }) .collect::>() @@ -129,12 +127,9 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType { } else if self.num_lanes() > 1 { (0..self.num_lanes()) .map(|idx| -> std::string::String { - format!( - "{cast}{lane_fn}(__return_value, {lane})", - cast = self.c_promotion(), - lane_fn = self.get_lane_function(), - lane = idx - ) + let lane_fn = self.get_lane_function(); + let final_cast = self.generate_final_type_cast(); + format!("{final_cast}{lane_fn}(__return_value, {idx})") }) .collect::>() .join(r#" << ", " << "#) @@ -148,9 +143,9 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType { TypeKind::Int(Sign::Signed) => format!("int{}_t", self.inner_size()), TypeKind::Int(Sign::Unsigned) => format!("uint{}_t", self.inner_size()), TypeKind::Poly => format!("poly{}_t", self.inner_size()), - ty => todo!("print_result_c - Unknown type: {:#?}", ty), + ty => todo!("print_result_c - Unknown type: {ty:#?}"), }, - promote = self.c_promotion(), + promote = self.generate_final_type_cast(), ) }; diff --git a/crates/intrinsic-test/src/common/argument.rs b/crates/intrinsic-test/src/common/argument.rs index f38515e40a..5fb7d0f210 100644 --- a/crates/intrinsic-test/src/common/argument.rs +++ b/crates/intrinsic-test/src/common/argument.rs @@ -30,7 +30,12 @@ where } pub fn to_c_type(&self) -> String { - self.ty.c_type() + let prefix = if self.ty.constant { "const " } else { "" }; + format!("{prefix}{}", self.ty.c_type()) + } + + pub fn generate_name(&self) -> String { + format!("{}_val", self.name) } pub fn is_simd(&self) -> bool { @@ -64,7 +69,7 @@ where } fn as_call_param_c(&self) -> String { - self.ty.as_call_param_c(&self.name) + self.ty.as_call_param_c(&self.generate_name()) } } @@ -91,7 +96,7 @@ where pub fn as_call_param_rust(&self) -> String { self.iter() .filter(|a| !a.has_constraint()) - .map(|arg| arg.name.clone()) + .map(|arg| arg.generate_name() + " as _") .collect::>() .join(", ") } @@ -106,11 +111,13 @@ where loads: u32, ) -> std::io::Result<()> { for arg in self.iter().filter(|&arg| !arg.has_constraint()) { + // Setting the variables on an aligned boundary to make it easier to pick + // functions (of a specific architecture) that would help load the values. writeln!( w, - "{indentation}const {ty} {name}_vals[] = {values};", + "{indentation}alignas(64) const {ty} {name}_vals[] = {values};", ty = arg.ty.c_scalar_type(), - name = arg.name, + name = arg.generate_name(), values = arg.ty.populate_random(indentation, loads, &Language::C) )? } @@ -153,7 +160,7 @@ where format!( "{indentation}{ty} {name} = cast<{ty}>({load}(&{name}_vals[i]));\n", ty = arg.to_c_type(), - name = arg.name, + name = arg.generate_name(), load = if arg.is_simd() { arg.ty.get_load_function(Language::C) } else { @@ -171,15 +178,16 @@ where self.iter() .filter(|&arg| !arg.has_constraint()) .map(|arg| { + let load = if arg.is_simd() { + arg.ty.get_load_function(Language::Rust) + } else { + "*".to_string() + }; + let typecast = if load.len() > 2 { "as _" } else { "" }; format!( - "{indentation}let {name} = {load}({vals_name}.as_ptr().offset(i));\n", - name = arg.name, + "{indentation}let {name} = {load}({vals_name}.as_ptr().offset(i){typecast});\n", + name = arg.generate_name(), vals_name = arg.rust_vals_array_name(), - load = if arg.is_simd() { - arg.ty.get_load_function(Language::Rust) - } else { - "*".to_string() - }, ) }) .collect() diff --git a/crates/intrinsic-test/src/common/cli.rs b/crates/intrinsic-test/src/common/cli.rs index beae6a4b04..461ab542ea 100644 --- a/crates/intrinsic-test/src/common/cli.rs +++ b/crates/intrinsic-test/src/common/cli.rs @@ -54,6 +54,9 @@ pub struct Cli { /// Set the sysroot for the C++ compiler #[arg(long)] pub cxx_toolchain_dir: Option, + + #[arg(long, default_value_t = 100u8)] + pub sample_percentage: u8, } pub struct ProcessedCli { @@ -65,6 +68,7 @@ pub struct ProcessedCli { pub linker: Option, pub cxx_toolchain_dir: Option, pub skip: Vec, + pub sample_percentage: u8, } impl ProcessedCli { @@ -74,6 +78,7 @@ impl ProcessedCli { let target = cli_options.target; let linker = cli_options.linker; let cxx_toolchain_dir = cli_options.cxx_toolchain_dir; + let sample_percentage = cli_options.sample_percentage; let skip = if let Some(filename) = cli_options.skip { let data = std::fs::read_to_string(&filename).expect("Failed to open file"); @@ -108,6 +113,7 @@ impl ProcessedCli { cxx_toolchain_dir, skip, filename, + sample_percentage, } } } diff --git a/crates/intrinsic-test/src/common/compare.rs b/crates/intrinsic-test/src/common/compare.rs index 1ad00839ef..902df94283 100644 --- a/crates/intrinsic-test/src/common/compare.rs +++ b/crates/intrinsic-test/src/common/compare.rs @@ -14,15 +14,14 @@ pub fn compare_outputs(intrinsic_name_list: &Vec, runner: &str, target: let intrinsics = intrinsic_name_list .par_iter() .filter_map(|intrinsic_name| { - let c = runner_command(runner) - .arg("intrinsic-test-programs") + .arg("./intrinsic-test-programs") .arg(intrinsic_name) .current_dir("c_programs") .output(); let rust = runner_command(runner) - .arg(format!("target/{target}/release/intrinsic-test-programs")) + .arg(format!("./target/{target}/release/intrinsic-test-programs")) .arg(intrinsic_name) .current_dir("rust_programs") .output(); diff --git a/crates/intrinsic-test/src/common/compile_c.rs b/crates/intrinsic-test/src/common/compile_c.rs index 258e418165..fa78b332a7 100644 --- a/crates/intrinsic-test/src/common/compile_c.rs +++ b/crates/intrinsic-test/src/common/compile_c.rs @@ -119,7 +119,7 @@ impl CppCompilation { output: &str, ) -> std::io::Result { let mut cmd = clone_command(&self.0); - cmd.args([input, "-c", "-o", output]); + cmd.args([input, "-v", "-c", "-o", output]); cmd.output() } diff --git a/crates/intrinsic-test/src/common/gen_c.rs b/crates/intrinsic-test/src/common/gen_c.rs index 28902b3dfe..04741e4f80 100644 --- a/crates/intrinsic-test/src/common/gen_c.rs +++ b/crates/intrinsic-test/src/common/gen_c.rs @@ -6,6 +6,15 @@ use super::intrinsic_helpers::IntrinsicTypeDefinition; // The number of times each intrinsic will be called. const PASSES: u32 = 20; +const COMMON_HEADERS: [&str; 7] = [ + "iostream", + "string", + "cstring", + "iomanip", + "sstream", + "type_traits", + "cassert", +]; pub fn generate_c_test_loop( w: &mut impl std::io::Write, @@ -47,7 +56,15 @@ pub fn generate_c_constraint_blocks<'a, T: IntrinsicTypeDefinition + 'a>( let ty = current.ty.c_type(); writeln!(w, "{indentation}{{")?; - writeln!(w, "{body_indentation}{ty} {} = {i};", current.name)?; + + // TODO: Move to actually specifying the enum value + // instead of typecasting integers, for better clarity + // of generated code. + writeln!( + w, + "{body_indentation}const {ty} {} = ({ty}){i};", + current.generate_name() + )?; generate_c_constraint_blocks( w, @@ -99,32 +116,10 @@ pub fn write_mod_cpp( ) -> std::io::Result<()> { write!(w, "{notice}")?; - for header in platform_headers { + for header in COMMON_HEADERS.iter().chain(platform_headers.iter()) { writeln!(w, "#include <{header}>")?; } - writeln!( - w, - r#" -#include -#include -#include -#include - -template T1 cast(T2 x) {{ - static_assert(sizeof(T1) == sizeof(T2), "sizeof T1 and T2 must be the same"); - T1 ret{{}}; - memcpy(&ret, &x, sizeof(T1)); - return ret; -}} - -std::ostream& operator<<(std::ostream& os, float16_t value); - - - -"# - )?; - writeln!(w, "{}", forward_declarations)?; for intrinsic in intrinsics { @@ -137,33 +132,13 @@ std::ostream& operator<<(std::ostream& os, float16_t value); pub fn write_main_cpp<'a>( w: &mut impl std::io::Write, arch_specific_definitions: &str, + arch_specific_headers: &[&str], intrinsics: impl Iterator + Clone, ) -> std::io::Result<()> { - writeln!(w, "#include ")?; - writeln!(w, "#include ")?; - - for header in ["arm_neon.h", "arm_acle.h", "arm_fp16.h"] { + for header in COMMON_HEADERS.iter().chain(arch_specific_headers.iter()) { writeln!(w, "#include <{header}>")?; } - writeln!( - w, - r#" -#include -#include -#include - -std::ostream& operator<<(std::ostream& os, float16_t value) {{ - uint16_t temp = 0; - memcpy(&temp, &value, sizeof(float16_t)); - std::stringstream ss; - ss << "0x" << std::setfill('0') << std::setw(4) << std::hex << temp; - os << ss.str(); - return os; -}} -"# - )?; - // NOTE: It's assumed that this value contains the required `ifdef`s. writeln!(w, "{arch_specific_definitions }")?; diff --git a/crates/intrinsic-test/src/common/gen_rust.rs b/crates/intrinsic-test/src/common/gen_rust.rs index d659cbc4aa..27f49a37b1 100644 --- a/crates/intrinsic-test/src/common/gen_rust.rs +++ b/crates/intrinsic-test/src/common/gen_rust.rs @@ -4,7 +4,6 @@ use std::process::Command; use crate::common::intrinsic::Intrinsic; use super::indentation::Indentation; -use super::intrinsic::format_f16_return_value; use super::intrinsic_helpers::IntrinsicTypeDefinition; // The number of times each intrinsic will be called. @@ -191,7 +190,7 @@ pub fn generate_rust_test_loop( w: &mut impl std::io::Write, intrinsic: &Intrinsic, indentation: Indentation, - specializations: &[Vec], + specializations: &[Vec], passes: u32, ) -> std::io::Result<()> { let intrinsic_name = &intrinsic.name; @@ -233,7 +232,6 @@ pub fn generate_rust_test_loop( } } - let return_value = format_f16_return_value(intrinsic); let indentation2 = indentation.nested(); let indentation3 = indentation2.nested(); writeln!( @@ -250,13 +248,14 @@ pub fn generate_rust_test_loop( }}", loaded_args = intrinsic.arguments.load_values_rust(indentation3), args = intrinsic.arguments.as_call_param_rust(), + return_value = intrinsic.results.print_result_rust(), ) } /// Generate the specializations (unique sequences of const-generic arguments) for this intrinsic. fn generate_rust_specializations( constraints: &mut impl Iterator>, -) -> Vec> { +) -> Vec> { let mut specializations = vec![vec![]]; for constraint in constraints { @@ -264,7 +263,7 @@ fn generate_rust_specializations( .flat_map(|right| { specializations.iter().map(move |left| { let mut left = left.clone(); - left.push(u8::try_from(right).unwrap()); + left.push(i32::try_from(right).unwrap()); left }) }) diff --git a/crates/intrinsic-test/src/common/intrinsic.rs b/crates/intrinsic-test/src/common/intrinsic.rs index 95276d19b7..81f6d6d8b5 100644 --- a/crates/intrinsic-test/src/common/intrinsic.rs +++ b/crates/intrinsic-test/src/common/intrinsic.rs @@ -1,5 +1,5 @@ use super::argument::ArgumentList; -use super::intrinsic_helpers::{IntrinsicTypeDefinition, TypeKind}; +use super::intrinsic_helpers::IntrinsicTypeDefinition; /// An intrinsic #[derive(Debug, PartialEq, Clone)] @@ -16,17 +16,3 @@ pub struct Intrinsic { /// Any architecture-specific tags. pub arch_tags: Vec, } - -pub fn format_f16_return_value(intrinsic: &Intrinsic) -> String { - // the `intrinsic-test` crate compares the output of C and Rust intrinsics. Currently, It uses - // a string representation of the output value to compare. In C, f16 values are currently printed - // as hexadecimal integers. Since https://github.com/rust-lang/rust/pull/127013, rust does print - // them as decimal floating point values. To keep the intrinsics tests working, for now, format - // vectors containing f16 values like C prints them. - let return_value = match intrinsic.results.kind() { - TypeKind::Float if intrinsic.results.inner_size() == 16 => "debug_f16(__return_value)", - _ => "format_args!(\"{__return_value:.150?}\")", - }; - - String::from(return_value) -} diff --git a/crates/intrinsic-test/src/common/intrinsic_helpers.rs b/crates/intrinsic-test/src/common/intrinsic_helpers.rs index 7bc1015a38..c2d66868ce 100644 --- a/crates/intrinsic-test/src/common/intrinsic_helpers.rs +++ b/crates/intrinsic-test/src/common/intrinsic_helpers.rs @@ -1,3 +1,4 @@ +use std::cmp; use std::fmt; use std::ops::Deref; use std::str::FromStr; @@ -75,9 +76,11 @@ impl TypeKind { Self::Float => "float", Self::Int(Sign::Signed) => "int", Self::Int(Sign::Unsigned) => "uint", + Self::Mask => "uint", Self::Poly => "poly", Self::Char(Sign::Signed) => "char", - _ => unreachable!("Not used: {:#?}", self), + Self::Vector => "int", + _ => unreachable!("Not used: {self:#?}"), } } @@ -91,7 +94,7 @@ impl TypeKind { Self::Poly => "u", Self::Char(Sign::Unsigned) => "u", Self::Char(Sign::Signed) => "i", - _ => unreachable!("Unused type kind: {:#?}", self), + _ => unreachable!("Unused type kind: {self:#?}"), } } } @@ -129,9 +132,9 @@ impl IntrinsicType { pub fn inner_size(&self) -> u32 { if let Some(bl) = self.bit_len { - bl + cmp::max(bl, 8) } else { - unreachable!("") + unreachable!("{self:#?}") } } @@ -154,6 +157,7 @@ impl IntrinsicType { pub fn c_scalar_type(&self) -> String { match self.kind() { TypeKind::Char(_) => String::from("char"), + TypeKind::Vector => String::from("int32_t"), _ => format!( "{prefix}{bits}_t", prefix = self.kind().c_prefix(), @@ -162,14 +166,6 @@ impl IntrinsicType { } } - pub fn rust_scalar_type(&self) -> String { - format!( - "{prefix}{bits}", - prefix = self.kind().rust_prefix(), - bits = self.inner_size() - ) - } - pub fn c_promotion(&self) -> &str { match *self { IntrinsicType { @@ -177,9 +173,9 @@ impl IntrinsicType { bit_len: Some(8), .. } => match kind { - TypeKind::Int(Sign::Signed) => "(int)", - TypeKind::Int(Sign::Unsigned) => "(unsigned int)", - TypeKind::Poly => "(unsigned int)(uint8_t)", + TypeKind::Int(Sign::Signed) => "int", + TypeKind::Int(Sign::Unsigned) => "unsigned int", + TypeKind::Poly => "uint8_t", _ => "", }, IntrinsicType { @@ -188,9 +184,9 @@ impl IntrinsicType { .. } => match bit_len { 8 => unreachable!("handled above"), - 16 => "(uint16_t)", - 32 => "(uint32_t)", - 64 => "(uint64_t)", + 16 => "uint16_t", + 32 => "uint32_t", + 64 => "uint64_t", 128 => "", _ => panic!("invalid bit_len"), }, @@ -199,16 +195,16 @@ impl IntrinsicType { bit_len: Some(bit_len), .. } => match bit_len { - 16 => "(float16_t)", - 32 => "(float)", - 64 => "(double)", + 16 => "float16_t", + 32 => "float", + 64 => "double", 128 => "", _ => panic!("invalid bit_len"), }, IntrinsicType { kind: TypeKind::Char(_), .. - } => "(char)", + } => "char", _ => "", } } @@ -221,15 +217,16 @@ impl IntrinsicType { ) -> String { match self { IntrinsicType { - bit_len: Some(bit_len @ (8 | 16 | 32 | 64)), - kind: kind @ (TypeKind::Int(_) | TypeKind::Poly | TypeKind::Char(_)), + bit_len: Some(bit_len @ (1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 16 | 32 | 64)), + kind: + kind @ (TypeKind::Int(_) | TypeKind::Poly | TypeKind::Char(_) | TypeKind::Mask), simd_len, vec_len, .. } => { let (prefix, suffix) = match language { - Language::Rust => ("[", "]"), - Language::C => ("{", "}"), + Language::Rust => ('[', ']'), + Language::C => ('{', '}'), }; let body_indentation = indentation.nested(); format!( @@ -265,12 +262,12 @@ impl IntrinsicType { .. } => { let (prefix, cast_prefix, cast_suffix, suffix) = match (language, bit_len) { - (&Language::Rust, 16) => ("[", "f16::from_bits(", ")", "]"), - (&Language::Rust, 32) => ("[", "f32::from_bits(", ")", "]"), - (&Language::Rust, 64) => ("[", "f64::from_bits(", ")", "]"), - (&Language::C, 16) => ("{", "cast(", ")", "}"), - (&Language::C, 32) => ("{", "cast(", ")", "}"), - (&Language::C, 64) => ("{", "cast(", ")", "}"), + (&Language::Rust, 16) => ('[', "f16::from_bits(", ")", ']'), + (&Language::Rust, 32) => ('[', "f32::from_bits(", ")", ']'), + (&Language::Rust, 64) => ('[', "f64::from_bits(", ")", ']'), + (&Language::C, 16) => ('{', "cast(", ")", '}'), + (&Language::C, 32) => ('{', "cast(", ")", '}'), + (&Language::C, 64) => ('{', "cast(", ")", '}'), _ => unreachable!(), }; format!( @@ -283,7 +280,44 @@ impl IntrinsicType { ))) ) } - _ => unimplemented!("populate random: {:#?}", self), + IntrinsicType { + kind: TypeKind::Vector, + bit_len: Some(128 | 256 | 512), + simd_len, + vec_len, + .. + } => { + let (prefix, suffix) = match language { + Language::Rust => ('[', ']'), + Language::C => ('{', '}'), + }; + let body_indentation = indentation.nested(); + let effective_bit_len = 32; + format!( + "{prefix}\n{body}\n{indentation}{suffix}", + body = (0..(vec_len.unwrap_or(1) * simd_len.unwrap_or(1) + loads - 1)) + .format_with(",\n", |i, fmt| { + let src = value_for_array(effective_bit_len, i); + assert!(src == 0 || src.ilog2() < effective_bit_len); + if (src >> (effective_bit_len - 1)) != 0 { + // `src` is a two's complement representation of a negative value. + let mask = !0u64 >> (64 - effective_bit_len); + let ones_compl = src ^ mask; + let twos_compl = ones_compl + 1; + if (twos_compl == src) && (language == &Language::C) { + // `src` is INT*_MIN. C requires `-0x7fffffff - 1` to avoid + // undefined literal overflow behaviour. + fmt(&format_args!("{body_indentation}-{ones_compl:#x} - 1")) + } else { + fmt(&format_args!("{body_indentation}-{twos_compl:#x}")) + } + } else { + fmt(&format_args!("{body_indentation}{src:#x}")) + } + }) + ) + } + _ => unimplemented!("populate random: {self:#?}"), } } @@ -298,7 +332,7 @@ impl IntrinsicType { kind: TypeKind::Int(_) | TypeKind::Poly, .. } => true, - _ => unimplemented!(), + _ => true, } } @@ -330,4 +364,40 @@ pub trait IntrinsicTypeDefinition: Deref { /// rust debug output format for the return type. The generated line assumes /// there is an int i in scope which is the current pass number. fn print_result_c(&self, indentation: Indentation, additional: &str) -> String; + + /// Generates a std::cout for the intrinsics results that will match the + /// rust debug output format for the return type. The generated line assumes + /// there is an int i in scope which is the current pass number. + /// + /// The `intrinsic-test` crate compares the output of C and Rust intrinsics. Currently, It uses + /// a string representation of the output value to compare. In C, f16 values are currently printed + /// as hexadecimal integers. Since https://github.com/rust-lang/rust/pull/127013, rust does print + /// them as decimal floating point values. To keep the intrinsics tests working, for now, format + /// vectors containing f16 values like C prints them. + fn print_result_rust(&self) -> String { + let return_value = match self.kind() { + TypeKind::Float if self.inner_size() == 16 => "debug_f16(__return_value)", + _ => "format_args!(\"{__return_value:.150?}\")", + }; + + String::from(return_value) + } + + /// To enable architecture-specific logic + fn rust_scalar_type(&self) -> String { + format!( + "{prefix}{bits}", + prefix = self.kind().rust_prefix(), + bits = self.inner_size() + ) + } + + fn generate_final_type_cast(&self) -> String { + let type_data = self.c_promotion(); + if type_data.len() > 2 { + format!("({type_data})") + } else { + String::new() + } + } } diff --git a/crates/intrinsic-test/src/common/mod.rs b/crates/intrinsic-test/src/common/mod.rs index 666b3885c1..d8f06ae238 100644 --- a/crates/intrinsic-test/src/common/mod.rs +++ b/crates/intrinsic-test/src/common/mod.rs @@ -49,7 +49,7 @@ pub trait SupportedArchitectureTest { fn cpp_compilation(&self) -> Option; fn build_c_file(&self) -> bool { - let (chunk_size, chunk_count) = chunk_info(self.intrinsics().len()); + let (chunk_size, chunk_count) = manual_chunk(self.intrinsics().len(), 400); let cpp_compiler_wrapped = self.cpp_compilation(); @@ -60,34 +60,42 @@ pub trait SupportedArchitectureTest { .map(|(i, chunk)| { let c_filename = format!("c_programs/mod_{i}.cpp"); let mut file = File::create(&c_filename).unwrap(); - write_mod_cpp( + let mod_file_write_result = write_mod_cpp( &mut file, Self::NOTICE, Self::PLATFORM_C_HEADERS, Self::PLATFORM_C_FORWARD_DECLARATIONS, chunk, - ) - .unwrap(); + ); + + if let Err(error) = mod_file_write_result { + return Err(format!("Error writing to mod_{i}.cpp: {error:?}")); + } // compile this cpp file into a .o file. // // This is done because `cpp_compiler_wrapped` is None when // the --generate-only flag is passed + trace!("compiling mod_{i}.cpp"); if let Some(cpp_compiler) = cpp_compiler_wrapped.as_ref() { - let output = cpp_compiler - .compile_object_file(&format!("mod_{i}.cpp"), &format!("mod_{i}.o"))?; - assert!(output.status.success(), "{output:?}"); - } + let compile_output = cpp_compiler + .compile_object_file(&format!("mod_{i}.cpp"), &format!("mod_{i}.o")); + trace!("finished compiling mod_{i}.cpp"); + if let Err(compile_error) = compile_output { + return Err(format!("Error compiling mod_{i}.cpp: {compile_error:?}")); + } + } Ok(()) }) - .collect::>() + .collect::>() .unwrap(); let mut file = File::create("c_programs/main.cpp").unwrap(); write_main_cpp( &mut file, Self::PLATFORM_C_DEFINITIONS, + Self::PLATFORM_C_HEADERS, self.intrinsics().iter().map(|i| i.name.as_str()), ) .unwrap(); @@ -96,7 +104,7 @@ pub trait SupportedArchitectureTest { // the --generate-only flag is passed if let Some(cpp_compiler) = cpp_compiler_wrapped.as_ref() { // compile this cpp file into a .o file - info!("compiling main.cpp"); + trace!("compiling main.cpp"); let output = cpp_compiler .compile_object_file("main.cpp", "intrinsic-test-programs.o") .unwrap(); @@ -118,7 +126,7 @@ pub trait SupportedArchitectureTest { fn build_rust_file(&self) -> bool { std::fs::create_dir_all("rust_programs/src").unwrap(); - let (chunk_size, chunk_count) = chunk_info(self.intrinsics().len()); + let (chunk_size, chunk_count) = manual_chunk(self.intrinsics().len(), 400); let mut cargo = File::create("rust_programs/Cargo.toml").unwrap(); write_bin_cargo_toml(&mut cargo, chunk_count).unwrap(); @@ -188,9 +196,13 @@ pub trait SupportedArchitectureTest { } } -pub fn chunk_info(intrinsic_count: usize) -> (usize, usize) { - let available_parallelism = std::thread::available_parallelism().unwrap().get(); - let chunk_size = intrinsic_count.div_ceil(Ord::min(available_parallelism, intrinsic_count)); +// pub fn chunk_info(intrinsic_count: usize) -> (usize, usize) { +// let available_parallelism = std::thread::available_parallelism().unwrap().get(); +// let chunk_size = intrinsic_count.div_ceil(Ord::min(available_parallelism, intrinsic_count)); + +// (chunk_size, intrinsic_count.div_ceil(chunk_size)) +// } +pub fn manual_chunk(intrinsic_count: usize, chunk_size: usize) -> (usize, usize) { (chunk_size, intrinsic_count.div_ceil(chunk_size)) } diff --git a/crates/intrinsic-test/src/common/values.rs b/crates/intrinsic-test/src/common/values.rs index 1b614a742e..6c94ef2c22 100644 --- a/crates/intrinsic-test/src/common/values.rs +++ b/crates/intrinsic-test/src/common/values.rs @@ -4,6 +4,13 @@ pub fn value_for_array(bits: u32, index: u32) -> u64 { let index = index as usize; match bits { + 1 => VALUES_8[index % 2].into(), + 2 => VALUES_8[index % 4].into(), + 3 => VALUES_8[index % 8].into(), + 4 => VALUES_8[index % 16].into(), + 5 => VALUES_5[index % VALUES_5.len()].into(), + 6 => VALUES_6[index % VALUES_6.len()].into(), + 7 => VALUES_7[index % VALUES_7.len()].into(), 8 => VALUES_8[index % VALUES_8.len()].into(), 16 => VALUES_16[index % VALUES_16.len()].into(), 32 => VALUES_32[index % VALUES_32.len()].into(), @@ -12,6 +19,24 @@ pub fn value_for_array(bits: u32, index: u32) -> u64 { } } +pub const VALUES_5: &[u8] = &[ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x019, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, + 0x1f, +]; + +pub const VALUES_6: &[u8] = &[ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x039, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, + 0x3f, +]; + +pub const VALUES_7: &[u8] = &[ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x079, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, + 0x7f, +]; + pub const VALUES_8: &[u8] = &[ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xf0, 0x80, 0x3b, 0xff, diff --git a/crates/intrinsic-test/src/main.rs b/crates/intrinsic-test/src/main.rs index 44d7aafd82..ed3a50067d 100644 --- a/crates/intrinsic-test/src/main.rs +++ b/crates/intrinsic-test/src/main.rs @@ -3,10 +3,12 @@ extern crate log; mod arm; mod common; +mod x86; use arm::ArmArchitectureTest; use common::SupportedArchitectureTest; use common::cli::{Cli, ProcessedCli}; +use x86::X86ArchitectureTest; fn main() { pretty_env_logger::init(); @@ -18,6 +20,7 @@ fn main() { | "armv7-unknown-linux-gnueabihf" | "aarch64_be-unknown-linux-gnu" => run(ArmArchitectureTest::create(processed_cli_options)), + "x86_64-unknown-linux-gnu" => run(X86ArchitectureTest::create(processed_cli_options)), _ => std::process::exit(0), } } diff --git a/crates/intrinsic-test/src/x86/compile.rs b/crates/intrinsic-test/src/x86/compile.rs new file mode 100644 index 0000000000..60997a1278 --- /dev/null +++ b/crates/intrinsic-test/src/x86/compile.rs @@ -0,0 +1,47 @@ +use crate::common::cli::ProcessedCli; +use crate::common::compile_c::{CompilationCommandBuilder, CppCompilation}; + +pub fn build_cpp_compilation(config: &ProcessedCli) -> Option { + let cpp_compiler = config.cpp_compiler.as_ref()?; + + // -ffp-contract=off emulates Rust's approach of not fusing separate mul-add operations + let mut command = CompilationCommandBuilder::new() + .add_arch_flags(["icelake-client"]) + .set_compiler(cpp_compiler) + .set_target(&config.target) + .set_opt_level("2") + .set_cxx_toolchain_dir(config.cxx_toolchain_dir.as_deref()) + .set_project_root("c_programs") + .add_extra_flags(vec![ + "-ffp-contract=off", + "-Wno-narrowing", + "-mavx", + "-mavx2", + "-mavx512f", + "-msse2", + "-mavx512vl", + "-mavx512bw", + "-mavx512dq", + "-mavx512cd", + "-mavx512fp16", + "-msha512", + "-msm4", + "-mavxvnni", + "-mavx512bitalg", + "-mavx512ifma", + "-mavx512vbmi", + "-mavx512vbmi2", + "-mavx512vnni", + "-mavx512vpopcntdq", + "-ferror-limit=1000", + "-std=c++23", + ]); + + if !cpp_compiler.contains("clang") { + command = command.add_extra_flag("-flax-vector-conversions"); + } + + let cpp_compiler = command.into_cpp_compilation(); + + Some(cpp_compiler) +} diff --git a/crates/intrinsic-test/src/x86/config.rs b/crates/intrinsic-test/src/x86/config.rs new file mode 100644 index 0000000000..7c349e4482 --- /dev/null +++ b/crates/intrinsic-test/src/x86/config.rs @@ -0,0 +1,409 @@ +pub const NOTICE: &str = "\ +// This is a transient test file, not intended for distribution. Some aspects of the +// test are derived from an XML specification, published under the same license as the +// `intrinsic-test` crate.\n"; + +// Format f16 values (and vectors containing them) in a way that is consistent with C. +pub const PLATFORM_RUST_DEFINITIONS: &str = r#" +use std::arch::x86_64::*; + +#[inline] +unsafe fn _mm_loadu_ph_to___m128i(mem_addr: *const f16) -> __m128i { + _mm_castph_si128(_mm_loadu_ph(mem_addr)) +} + +#[inline] +unsafe fn _mm256_loadu_ph_to___m256i(mem_addr: *const f16) -> __m256i { + _mm256_castph_si256(_mm256_loadu_ph(mem_addr)) +} + +#[inline] +unsafe fn _mm512_loadu_ph_to___mm512i(mem_addr: *const f16) -> __m512i { + _mm512_castph_si512(_mm512_loadu_ph(mem_addr)) +} + + +#[inline] +unsafe fn _mm_loadu_ps_to___m128h(mem_addr: *const f32) -> __m128h { + _mm_castps_ph(_mm_loadu_ps(mem_addr)) +} + +#[inline] +unsafe fn _mm256_loadu_ps_to___m256h(mem_addr: *const f32) -> __m256h { + _mm256_castps_ph(_mm256_loadu_ps(mem_addr)) +} + +#[inline] +unsafe fn _mm512_loadu_ps_to___m512h(mem_addr: *const f32) -> __m512h { + _mm512_castps_ph(_mm512_loadu_ps(mem_addr)) +} + +#[inline] +unsafe fn _mm_loadu_epi16_to___m128d(mem_addr: *const i16) -> __m128d { + _mm_castsi128_pd(_mm_loadu_epi16(mem_addr)) +} + +#[inline] +unsafe fn _mm256_loadu_epi16_to___m256d(mem_addr: *const i16) -> __m256d { + _mm256_castsi256_pd(_mm256_loadu_epi16(mem_addr)) +} + +#[inline] +unsafe fn _mm512_loadu_epi16_to___m512d(mem_addr: *const i16) -> __m512d { + _mm512_castsi512_pd(_mm512_loadu_epi16(mem_addr)) +} + +#[inline] +unsafe fn _mm_loadu_epi32_to___m128d(mem_addr: *const i32) -> __m128d { + _mm_castsi128_pd(_mm_loadu_epi32(mem_addr)) +} + +#[inline] +unsafe fn _mm256_loadu_epi32_to___m256d(mem_addr: *const i32) -> __m256d { + _mm256_castsi256_pd(_mm256_loadu_epi32(mem_addr)) +} + +#[inline] +unsafe fn _mm512_loadu_epi32_to___m512d(mem_addr: *const i32) -> __m512d { + _mm512_castsi512_pd(_mm512_loadu_epi32(mem_addr)) +} + +#[inline] +unsafe fn _mm_loadu_epi64_to___m128d(mem_addr: *const i64) -> __m128d { + _mm_castsi128_pd(_mm_loadu_epi64(mem_addr)) +} + +#[inline] +unsafe fn _mm256_loadu_epi64_to___m256d(mem_addr: *const i64) -> __m256d { + _mm256_castsi256_pd(_mm256_loadu_epi64(mem_addr)) +} + +#[inline] +unsafe fn _mm512_loadu_epi64_to___m512d(mem_addr: *const i64) -> __m512d { + _mm512_castsi512_pd(_mm512_loadu_epi64(mem_addr)) +} + +// === +#[inline] +unsafe fn _mm_loadu_epi16_to___m128(mem_addr: *const i16) -> __m128 { + _mm_castsi128_ps(_mm_loadu_epi16(mem_addr)) +} + +#[inline] +unsafe fn _mm256_loadu_epi16_to___m256(mem_addr: *const i16) -> __m256 { + _mm256_castsi256_ps(_mm256_loadu_epi16(mem_addr)) +} + +#[inline] +unsafe fn _mm512_loadu_epi16_to___m512(mem_addr: *const i16) -> __m512 { + _mm512_castsi512_ps(_mm512_loadu_epi16(mem_addr)) +} + +#[inline] +unsafe fn _mm_loadu_epi32_to___m128(mem_addr: *const i32) -> __m128 { + _mm_castsi128_ps(_mm_loadu_epi32(mem_addr)) +} + +#[inline] +unsafe fn _mm256_loadu_epi32_to___m256(mem_addr: *const i32) -> __m256 { + _mm256_castsi256_ps(_mm256_loadu_epi32(mem_addr)) +} + +#[inline] +unsafe fn _mm512_loadu_epi32_to___m512(mem_addr: *const i32) -> __m512 { + _mm512_castsi512_ps(_mm512_loadu_epi32(mem_addr)) +} + +#[inline] +unsafe fn _mm_loadu_epi64_to___m128(mem_addr: *const i64) -> __m128 { + _mm_castsi128_ps(_mm_loadu_epi64(mem_addr)) +} + +#[inline] +unsafe fn _mm256_loadu_epi64_to___m256(mem_addr: *const i64) -> __m256 { + _mm256_castsi256_ps(_mm256_loadu_epi64(mem_addr)) +} + +#[inline] +unsafe fn _mm512_loadu_epi64_to___m512(mem_addr: *const i64) -> __m512 { + _mm512_castsi512_ps(_mm512_loadu_epi64(mem_addr)) +} + +#[inline] +fn debug_simd_finish( + formatter: &mut core::fmt::Formatter<'_>, + type_name: &str, + array: &[T; N], +) -> core::fmt::Result { + core::fmt::Formatter::debug_tuple_fields_finish( + formatter, + type_name, + &core::array::from_fn::<&dyn core::fmt::Debug, N, _>(|i| &array[i]), + ) +} + +#[repr(transparent)] +struct Hex(T); + +impl core::fmt::Debug for Hex { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + ::fmt(&self.0, f) + } +} + +fn debug_f16(x: T) -> impl core::fmt::Debug { + Hex(x) +} + +trait DebugHexF16 { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result; +} + +impl DebugHexF16 for f16 { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "{:#06x?}", self.to_bits()) + } +} + +impl DebugHexF16 for __m128h { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let array = unsafe { core::mem::transmute::<_, [Hex; 8]>(*self) }; + debug_simd_finish(f, "__m128h", &array) + } +} + +impl DebugHexF16 for __m128i { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let array = unsafe { core::mem::transmute::<_, [Hex; 8]>(*self) }; + debug_simd_finish(f, "__m128i", &array) + } +} + +impl DebugHexF16 for __m256h { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let array = unsafe { core::mem::transmute::<_, [Hex; 16]>(*self) }; + debug_simd_finish(f, "__m256h", &array) + } +} + +impl DebugHexF16 for __m256i { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let array = unsafe { core::mem::transmute::<_, [Hex; 16]>(*self) }; + debug_simd_finish(f, "__m256i", &array) + } +} + +impl DebugHexF16 for __m512h { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let array = unsafe { core::mem::transmute::<_, [Hex; 32]>(*self) }; + debug_simd_finish(f, "__m512h", &array) + } +} + +impl DebugHexF16 for __m512i { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let array = unsafe { core::mem::transmute::<_, [Hex; 32]>(*self) }; + debug_simd_finish(f, "__m512i", &array) + } +} + +trait DebugAs { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result; +} + +impl DebugAs for T { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "{self}") + } +} + +macro_rules! impl_debug_as { + ($simd:ty, $name:expr, $bits:expr, [$($type:ty),+]) => { + $( + impl DebugAs<$type> for $simd { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + const ELEMENT_BITS: usize = core::mem::size_of::<$type>() * 8; + const NUM_ELEMENTS: usize = $bits / ELEMENT_BITS; + let array = unsafe { core::mem::transmute::<_, [$type; NUM_ELEMENTS]>(*self) }; + debug_simd_finish(f, $name, &array) + } + } + )+ + }; +} + +impl_debug_as!(__m128i, "__m128i", 128, [u8, i8, u16, i16, u32, i32, u64, i64]); +impl_debug_as!(__m256i, "__m256i", 256, [u8, i8, u16, i16, u32, i32, u64, i64]); +impl_debug_as!(__m512i, "__m512i", 512, [u8, i8, u16, i16, u32, i32, u64, i64]); +impl_debug_as!(__m128h, "__m128h", 128, [f32]); +impl_debug_as!(__m256h, "__m256h", 256, [f32]); +impl_debug_as!(__m512h, "__m512h", 512, [f32]); + +fn debug_as(x: V) -> impl core::fmt::Debug +where V: DebugAs +{ + struct DebugWrapper(V, core::marker::PhantomData); + impl, T> core::fmt::Debug for DebugWrapper { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + self.0.fmt(f) + } + } + DebugWrapper(x, core::marker::PhantomData) +} + +"#; + +pub const PLATFORM_C_FORWARD_DECLARATIONS: &str = r#" +#ifndef X86_DECLARATIONS +#define X86_DECLARATIONS + typedef _Float16 float16_t; + typedef float float32_t; + typedef double float64_t; + + #define __int64 long long + #define __int32 int + + std::ostream& operator<<(std::ostream& os, _Float16 value); + std::ostream& operator<<(std::ostream& os, __m128i value); + std::ostream& operator<<(std::ostream& os, __m256i value); + std::ostream& operator<<(std::ostream& os, __m512i value); + std::ostream& operator<<(std::ostream& os, __mmask8 value); + + #define _mm512_extract_intrinsic_test_epi8(m, lane) \ + _mm_extract_epi8(_mm512_extracti64x2_epi64((m), (lane) / 16), (lane) % 16) + + #define _mm512_extract_intrinsic_test_epi16(m, lane) \ + _mm_extract_epi16(_mm512_extracti64x2_epi64((m), (lane) / 8), (lane) % 8) + + #define _mm512_extract_intrinsic_test_epi32(m, lane) \ + _mm_extract_epi32(_mm512_extracti64x2_epi64((m), (lane) / 4), (lane) % 4) + + #define _mm512_extract_intrinsic_test_epi64(m, lane) \ + _mm_extract_epi64(_mm512_extracti64x2_epi64((m), (lane) / 2), (lane) % 2) + + #define _mm64_extract_intrinsic_test_epi8(m, lane) \ + ((_mm_extract_pi16((m), (lane) / 2) >> (((lane) % 2) * 8)) & 0xFF) + + #define _mm64_extract_intrinsic_test_epi32(m, lane) \ + _mm_cvtsi64_si32(_mm_srli_si64(m, (lane) * 32)) + + // Load f16 (__m128h) and cast to integer (__m128i) + #define _mm_loadu_ph_to___m128i(mem_addr) _mm_castph_si128(_mm_loadu_ph(mem_addr)) + #define _mm256_loadu_ph_to___m256i(mem_addr) _mm256_castph_si256(_mm256_loadu_ph(mem_addr)) + #define _mm512_loadu_ph_to___m512i(mem_addr) _mm512_castph_si512(_mm512_loadu_ph(mem_addr)) + + // Load f32 (__m128) and cast to f16 (__m128h) + #define _mm_loadu_ps_to___m128h(mem_addr) _mm_castps_ph(_mm_loadu_ps(mem_addr)) + #define _mm256_loadu_ps_to___m256h(mem_addr) _mm256_castps_ph(_mm256_loadu_ps(mem_addr)) + #define _mm512_loadu_ps_to___m512h(mem_addr) _mm512_castps_ph(_mm512_loadu_ps(mem_addr)) + + // Load integer types and cast to double (__m128d, __m256d, __m512d) + #define _mm_loadu_epi16_to___m128d(mem_addr) _mm_castsi128_pd(_mm_loadu_si128((__m128i const*)(mem_addr))) + #define _mm256_loadu_epi16_to___m256d(mem_addr) _mm256_castsi256_pd(_mm256_loadu_si256((__m256i const*)(mem_addr))) + #define _mm512_loadu_epi16_to___m512d(mem_addr) _mm512_castsi512_pd(_mm512_loadu_si512((__m512i const*)(mem_addr))) + + #define _mm_loadu_epi32_to___m128d(mem_addr) _mm_castsi128_pd(_mm_loadu_si128((__m128i const*)(mem_addr))) + #define _mm256_loadu_epi32_to___m256d(mem_addr) _mm256_castsi256_pd(_mm256_loadu_si256((__m256i const*)(mem_addr))) + #define _mm512_loadu_epi32_to___m512d(mem_addr) _mm512_castsi512_pd(_mm512_loadu_si512((__m512i const*)(mem_addr))) + + #define _mm_loadu_epi64_to___m128d(mem_addr) _mm_castsi128_pd(_mm_loadu_si128((__m128i const*)(mem_addr))) + #define _mm256_loadu_epi64_to___m256d(mem_addr) _mm256_castsi256_pd(_mm256_loadu_si256((__m256i const*)(mem_addr))) + #define _mm512_loadu_epi64_to___m512d(mem_addr) _mm512_castsi512_pd(_mm512_loadu_si512((__m512i const*)(mem_addr))) + + // Load integer types and cast to float (__m128, __m256, __m512) + #define _mm_loadu_epi16_to___m128(mem_addr) _mm_castsi128_ps(_mm_loadu_si128((__m128i const*)(mem_addr))) + #define _mm256_loadu_epi16_to___m256(mem_addr) _mm256_castsi256_ps(_mm256_loadu_si256((__m256i const*)(mem_addr))) + #define _mm512_loadu_epi16_to___m512(mem_addr) _mm512_castsi512_ps(_mm512_loadu_si512((__m512i const*)(mem_addr))) + + #define _mm_loadu_epi32_to___m128(mem_addr) _mm_castsi128_ps(_mm_loadu_si128((__m128i const*)(mem_addr))) + #define _mm256_loadu_epi32_to___m256(mem_addr) _mm256_castsi256_ps(_mm256_loadu_si256((__m256i const*)(mem_addr))) + #define _mm512_loadu_epi32_to___m512(mem_addr) _mm512_castsi512_ps(_mm512_loadu_si512((__m512i const*)(mem_addr))) + + #define _mm_loadu_epi64_to___m128(mem_addr) _mm_castsi128_ps(_mm_loadu_si128((__m128i const*)(mem_addr))) + #define _mm256_loadu_epi64_to___m256(mem_addr) _mm256_castsi256_ps(_mm256_loadu_si256((__m256i const*)(mem_addr))) + #define _mm512_loadu_epi64_to___m512(mem_addr) _mm512_castsi512_ps(_mm512_loadu_si512((__m512i const*)(mem_addr))) + + // T1 is the `To` type, T2 is the `From` type + template T1 cast(T2 x) { + if constexpr ((std::is_integral_v && std::is_integral_v) || (std::is_floating_point_v && std::is_floating_point_v)) { + return x; + } else if constexpr (sizeof(T1) <= sizeof(T2)) { + T1 ret{}; + std::memcpy(&ret, &x, sizeof(T1)); + return ret; + } else { + static_assert(sizeof(T1) == sizeof(T2) || std::is_convertible_v, + "T2 must either be convertible to T1, or have the same size as T1!"); + return T1{}; + } + } +#endif +"#; +pub const PLATFORM_C_DEFINITIONS: &str = r#" + +std::ostream& operator<<(std::ostream& os, _Float16 value) { + uint16_t temp = 0; + memcpy(&temp, &value, sizeof(_Float16)); + std::stringstream ss; + ss << "0x" << std::setfill('0') << std::setw(4) << std::hex << temp; + os << ss.str(); + return os; +} + +std::ostream& operator<<(std::ostream& os, __m128i value) { + void* temp = malloc(sizeof(__m128i)); + _mm_storeu_si128((__m128i*)temp, value); + std::stringstream ss; + + ss << "0x"; + for(int i = 0; i < 16; i++) { + ss << std::setfill('0') << std::setw(2) << std::hex << ((char*)temp)[i]; + } + os << ss.str(); + return os; +} + +std::ostream& operator<<(std::ostream& os, __m256i value) { + void* temp = malloc(sizeof(__m256i)); + _mm256_storeu_si256((__m256i*)temp, value); + std::stringstream ss; + + ss << "0x"; + for(int i = 0; i < 32; i++) { + ss << std::setfill('0') << std::setw(2) << std::hex << ((char*)temp)[i]; + } + os << ss.str(); + return os; +} + +std::ostream& operator<<(std::ostream& os, __m512i value) { + void* temp = malloc(sizeof(__m512i)); + _mm512_storeu_si512((__m512i*)temp, value); + std::stringstream ss; + + ss << "0x"; + for(int i = 0; i < 64; i++) { + ss << std::setfill('0') << std::setw(2) << std::hex << ((char*)temp)[i]; + } + os << ss.str(); + return os; +} + +std::ostream& operator<<(std::ostream& os, __mmask8 value) { + os << static_cast(value); + return os; +} +"#; + +pub const PLATFORM_RUST_CFGS: &str = r#" +#![cfg_attr(target_arch = "x86", feature(avx))] +#![cfg_attr(target_arch = "x86", feature(sse))] +#![cfg_attr(target_arch = "x86", feature(sse2))] +#![cfg_attr(target_arch = "x86", feature(stdarch_x86_avx512_bf16))] +#![cfg_attr(target_arch = "x86", feature(stdarch_x86_avx512_f16))] +#![cfg_attr(target_arch = "x86", feature(stdarch_x86_rtm))] +#![cfg_attr(target_arch = "x86", feature(stdarch_x86_rtm))] +#![cfg_attr(target_arch = "x86_64", feature(x86_amx_intrinsics))] +#![cfg_attr(target_arch = "x86_64", feature(stdarch_x86_avx512_f16))] +#![feature(fmt_helpers_for_derive)] +"#; diff --git a/crates/intrinsic-test/src/x86/constraint.rs b/crates/intrinsic-test/src/x86/constraint.rs new file mode 100644 index 0000000000..72f5da3b3f --- /dev/null +++ b/crates/intrinsic-test/src/x86/constraint.rs @@ -0,0 +1,30 @@ +use crate::common::constraint::Constraint; + +pub fn map_constraints(imm_type: &String, imm_width: u32) -> Option { + if imm_width > 0 { + let max: i64 = 2i64.pow(imm_width); + return Some(Constraint::Range(0..max)); + } + match imm_type.as_str() { + // Legal values for variables of `_MM_FROUND` type are: + // 8 => (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions + // 9 => (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions + // 10 => (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions + // 11 => (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions + // 4 => _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE + "_MM_FROUND" => Some(Constraint::Set(vec![4, 8, 9, 10, 11])), + "_MM_INDEX_SCALE" => Some(Constraint::Set(vec![1, 2, 4, 8])), + "_MM_CMPINT" => Some(Constraint::Range(0..8)), + "_MM_REDUCE" => Some(Constraint::Range(0..8)), + "_MM_FROUND_SAE" => Some(Constraint::Equal(8)), + "_MM_MANTISSA_NORM" => Some(Constraint::Range(0..4)), + "_MM_MANTISSA_NORM_ENUM" => Some(Constraint::Range(0..4)), + "_MM_MANTISSA_SIGN" => Some(Constraint::Range(0..3)), + "_MM_PERM" => Some(Constraint::Range(0..256)), + "_MM_PERM_ENUM" => Some(Constraint::Range(0..256)), + "_MM_CMPINT_ENUM" => Some(Constraint::Range(0..8)), + "_MM_ROUND_MODE" => Some(Constraint::Set(vec![0, 0x2, 0x4, 0x6])), + "_CMP_" => Some(Constraint::Range(0..32)), + _ => None, + } +} diff --git a/crates/intrinsic-test/src/x86/intrinsic.rs b/crates/intrinsic-test/src/x86/intrinsic.rs new file mode 100644 index 0000000000..1417c51ea1 --- /dev/null +++ b/crates/intrinsic-test/src/x86/intrinsic.rs @@ -0,0 +1,23 @@ +use crate::common::intrinsic_helpers::IntrinsicType; +use crate::x86::xml_parser::Parameter; +use std::ops::{Deref, DerefMut}; + +#[derive(Debug, Clone, PartialEq)] +pub struct X86IntrinsicType { + pub data: IntrinsicType, + pub param: Parameter, +} + +impl Deref for X86IntrinsicType { + type Target = IntrinsicType; + + fn deref(&self) -> &Self::Target { + &self.data + } +} + +impl DerefMut for X86IntrinsicType { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.data + } +} diff --git a/crates/intrinsic-test/src/x86/mod.rs b/crates/intrinsic-test/src/x86/mod.rs new file mode 100644 index 0000000000..956e51836f --- /dev/null +++ b/crates/intrinsic-test/src/x86/mod.rs @@ -0,0 +1,76 @@ +mod compile; +mod config; +mod constraint; +mod intrinsic; +mod types; +mod xml_parser; + +use crate::common::SupportedArchitectureTest; +use crate::common::cli::ProcessedCli; +use crate::common::compile_c::CppCompilation; +use crate::common::intrinsic::Intrinsic; +use crate::common::intrinsic_helpers::TypeKind; +use intrinsic::X86IntrinsicType; +use itertools::Itertools; +use xml_parser::get_xml_intrinsics; + +pub struct X86ArchitectureTest { + intrinsics: Vec>, + cli_options: ProcessedCli, +} + +impl SupportedArchitectureTest for X86ArchitectureTest { + type IntrinsicImpl = X86IntrinsicType; + + fn cli_options(&self) -> &ProcessedCli { + &self.cli_options + } + + fn intrinsics(&self) -> &[Intrinsic] { + &self.intrinsics + } + + fn cpp_compilation(&self) -> Option { + compile::build_cpp_compilation(&self.cli_options) + } + + const NOTICE: &str = config::NOTICE; + + const PLATFORM_C_HEADERS: &[&str] = &["immintrin.h", "cstddef", "cstdint"]; + const PLATFORM_C_DEFINITIONS: &str = config::PLATFORM_C_DEFINITIONS; + const PLATFORM_C_FORWARD_DECLARATIONS: &str = config::PLATFORM_C_FORWARD_DECLARATIONS; + + const PLATFORM_RUST_DEFINITIONS: &str = config::PLATFORM_RUST_DEFINITIONS; + const PLATFORM_RUST_CFGS: &str = config::PLATFORM_RUST_CFGS; + + fn create(cli_options: ProcessedCli) -> Self { + let intrinsics = + get_xml_intrinsics(&cli_options.filename).expect("Error parsing input file"); + + let sample_percentage: usize = cli_options.sample_percentage as usize; + + let mut intrinsics = intrinsics + .into_iter() + // Not sure how we would compare intrinsic that returns void. + .filter(|i| i.results.kind() != TypeKind::Void) + .filter(|i| i.results.kind() != TypeKind::BFloat) + .filter(|i| i.arguments.args.len() > 0) + .filter(|i| !i.arguments.iter().any(|a| a.ty.kind() == TypeKind::BFloat)) + // Skip pointers for now, we would probably need to look at the return + // type to work out how many elements we need to point to. + .filter(|i| !i.arguments.iter().any(|a| a.is_ptr())) + .filter(|i| !i.arguments.iter().any(|a| a.ty.inner_size() == 128)) + .filter(|i| !cli_options.skip.contains(&i.name)) + .unique_by(|i| i.name.clone()) + .collect::>(); + + let sample_size = (intrinsics.len() * sample_percentage) / 100; + intrinsics.truncate(sample_size); + + intrinsics.sort_by(|a, b| a.name.cmp(&b.name)); + Self { + intrinsics: intrinsics, + cli_options: cli_options, + } + } +} diff --git a/crates/intrinsic-test/src/x86/types.rs b/crates/intrinsic-test/src/x86/types.rs new file mode 100644 index 0000000000..87932fcb3e --- /dev/null +++ b/crates/intrinsic-test/src/x86/types.rs @@ -0,0 +1,485 @@ +use std::str::FromStr; + +use itertools::Itertools; +use regex::Regex; + +use super::intrinsic::X86IntrinsicType; +use crate::common::cli::Language; +use crate::common::indentation::Indentation; +use crate::common::intrinsic_helpers::{IntrinsicType, IntrinsicTypeDefinition, Sign, TypeKind}; +use crate::x86::xml_parser::Parameter; + +impl IntrinsicTypeDefinition for X86IntrinsicType { + /// Gets a string containing the type in C format. + /// This function assumes that this value is present in the metadata hashmap. + fn c_type(&self) -> String { + self.param + .type_data + .replace("unsigned __int64", "uint64_t") + .replace("unsigned __int32", "uint32_t") + .replace("unsigned __int16", "uint16_t") + .replace("unsigned __int8", "uint8_t") + .replace("__int64", "int64_t") + .replace("__int32", "int32_t") + .replace("__int16", "int16_t") + .replace("__int8", "int8_t") + .replace("const ", "") + } + + fn c_single_vector_type(&self) -> String { + // matches __m128, __m256 and similar types + let re = Regex::new(r"__m\d+").unwrap(); + if re.is_match(self.param.type_data.as_str()) { + self.param.type_data.clone() + } else { + unreachable!("Shouldn't be called on this type") + } + } + + // fn rust_type(&self) -> String { + // // handling edge cases first + // // the general handling is implemented below + // if let Some(val) = self.metadata.get("type") { + // match val.as_str() { + // "__m128 const *" => { + // return "&__m128".to_string(); + // } + // "__m128d const *" => { + // return "&__m128d".to_string(); + // } + // "const void*" => { + // return "&__m128d".to_string(); + // } + // _ => {} + // } + // } + + // if self.kind() == TypeKind::Void && self.ptr { + // // this has been handled by default settings in + // // the from_param function of X86IntrinsicType + // unreachable!() + // } + + // // general handling cases + // let core_part = if self.kind() == TypeKind::Mask { + // // all types of __mmask are handled here + // format!("__mask{}", self.bit_len.unwrap()) + // } else if self.simd_len.is_some() { + // // all types of __m vector types are handled here + // let re = Regex::new(r"\__m\d+[a-z]*").unwrap(); + // let rust_type = self + // .metadata + // .get("type") + // .map(|val| re.find(val).unwrap().as_str()); + // rust_type.unwrap().to_string() + // } else { + // format!( + // "{}{}", + // self.kind.rust_prefix().to_string(), + // self.bit_len.unwrap() + // ) + // }; + + // // extracting "memsize" so that even vector types can be involved + // let memwidth = self + // .metadata + // .get("memwidth") + // .map(|n| str::parse::(n).unwrap()); + // let prefix_part = if self.ptr && self.constant && self.bit_len.eq(&memwidth) { + // "&" + // } else if self.ptr && self.bit_len.eq(&memwidth) { + // "&mut " + // } else if self.ptr && self.constant { + // "*const " + // } else if self.ptr { + // "*mut " + // } else { + // "" + // }; + + // return prefix_part.to_string() + core_part.as_str(); + // } + + /// Determines the load function for this type. + fn get_load_function(&self, _language: Language) -> String { + let type_value = self.param.type_data.clone(); + if type_value.len() == 0 { + unimplemented!("the value for key 'type' is not present!"); + } + if type_value.starts_with("__mmask") { + // no need of loads, since they work directly + // with hex constants + String::from("*") + } else if type_value.starts_with("__m") { + // the structure is like the follows: + // if "type" starts with __m{h/i/}, + // then use either _mm_set1_epi64, + // _mm256_set1_epi64 or _mm512_set1_epi64 + if type_value.contains("__m64") { + return String::from("*(__m64*)"); + } + + let type_val_filtered = type_value + .chars() + .filter(|c| c.is_numeric()) + .join("") + .replace("128", "") + .replace("64", ""); + { + let suffix = match (self.bit_len, self.kind) { + (Some(16), TypeKind::Float) + if ["__m128i", "__m256i", "__m512i"] + .contains(&self.param.type_data.as_str()) => + { + format!("ph_to_{}", self.param.type_data) + } + (Some(32), TypeKind::Float) + if ["__m128h", "__m256h", "__m512h"] + .contains(&self.param.type_data.as_str()) => + { + format!("ps_to_{}", self.param.type_data) + } + (Some(bit_len @ (16 | 32 | 64)), TypeKind::Int(_) | TypeKind::Mask) + if ["__m128d", "__m256d", "__m512d"] + .contains(&self.param.type_data.as_str()) => + { + format!("epi{bit_len}_to_{}", self.param.type_data) + } + (Some(bit_len @ (16 | 32 | 64)), TypeKind::Int(_) | TypeKind::Mask) + if ["__m128", "__m256", "__m512"] + .contains(&self.param.type_data.as_str()) => + { + format!("epi{bit_len}_to_{}", self.param.type_data) + } + (Some(bit_len @ (8 | 16 | 32 | 64)), TypeKind::Int(_)) => { + format!("epi{bit_len}") + } + (Some(bit_len), TypeKind::Mask) => format!("epi{bit_len}"), + (Some(16), TypeKind::Float) => format!("ph"), + (Some(32), TypeKind::Float) => format!("ps"), + (Some(64), TypeKind::Float) => format!("pd"), + (Some(128 | 256 | 512), TypeKind::Vector) => format!("epi32"), + _ => unreachable!("Invalid element type for a vector type! {:?}", self.param), + }; + format!("_mm{type_val_filtered}_loadu_{suffix}") + } + } else { + // if it is a pointer, then rely on type conversion + // If it is not any of the above type (__int, __bfloat16, unsigned short, etc) + // then typecast it. + format!("({type_value})") + } + } + + /// Generates a std::cout for the intrinsics results that will match the + /// rust debug output format for the return type. The generated line assumes + /// there is an int i in scope which is the current pass number. + fn print_result_c(&self, indentation: Indentation, additional: &str) -> String { + let lanes = if self.num_vectors() > 1 { + (0..self.num_vectors()) + .map(|vector| { + format!( + r#""{ty}(" << {lanes} << ")""#, + ty = self.c_single_vector_type(), + lanes = (0..self.num_lanes()) + .map(move |idx| -> std::string::String { + format!( + "{cast}{lane_fn}(__return_value.val[{vector}], {lane})", + cast = self.generate_final_type_cast(), + lane_fn = self.get_lane_function(), + lane = idx, + vector = vector, + ) + }) + .collect::>() + .join(r#" << ", " << "#) + ) + }) + .collect::>() + .join(r#" << ", " << "#) + } else if self.num_lanes() > 1 { + (0..self.num_lanes()) + .map(|idx| -> std::string::String { + let cast_type = self.c_promotion(); + let lane_fn = self.get_lane_function(); + if cast_type.len() > 2 { + format!("cast<{cast_type}>({lane_fn}(__return_value, {idx}))") + } else { + format!("{lane_fn}(__return_value, {idx})") + } + }) + .collect::>() + .join(r#" << ", " << "#) + } else { + format!( + "{promote}cast<{cast}>(__return_value)", + cast = match self.kind() { + TypeKind::Void => "void".to_string(), + TypeKind::Float if self.inner_size() == 64 => "double".to_string(), + TypeKind::Float if self.inner_size() == 32 => "float".to_string(), + TypeKind::Mask => format!( + "__mmask{}", + self.bit_len.expect(format!("self: {self:#?}").as_str()) + ), + TypeKind::Vector => format!( + "__m{}i", + self.bit_len.expect(format!("self: {self:#?}").as_str()) + ), + _ => self.c_scalar_type(), + }, + promote = self.generate_final_type_cast(), + ) + }; + + format!( + r#"{indentation}std::cout << "Result {additional}-" << i+1 << ": {ty}" << std::fixed << std::setprecision(150) << {lanes} << "{close}" << std::endl;"#, + ty = if self.is_simd() { + format!("{}(", self.c_type()) + } else { + String::from("") + }, + close = if self.is_simd() { ")" } else { "" }, + ) + } + + /// Determines the get lane function for this type. + fn get_lane_function(&self) -> String { + let total_vector_bits: Option = self + .simd_len + .zip(self.bit_len) + .and_then(|(simd_len, bit_len)| Some(simd_len * bit_len)); + + match (self.bit_len, total_vector_bits) { + (Some(8), Some(128)) => String::from("(uint8_t)_mm_extract_epi8"), + (Some(16), Some(128)) => String::from("(uint16_t)_mm_extract_epi16"), + (Some(32), Some(128)) => String::from("(uint32_t)_mm_extract_epi32"), + (Some(64), Some(128)) => String::from("(uint64_t)_mm_extract_epi64"), + (Some(8), Some(256)) => String::from("(uint8_t)_mm256_extract_epi8"), + (Some(16), Some(256)) => String::from("(uint16_t)_mm256_extract_epi16"), + (Some(32), Some(256)) => String::from("(uint32_t)_mm256_extract_epi32"), + (Some(64), Some(256)) => String::from("(uint64_t)_mm256_extract_epi64"), + (Some(8), Some(512)) => String::from("(uint8_t)_mm512_extract_intrinsic_test_epi8"), + (Some(16), Some(512)) => String::from("(uint16_t)_mm512_extract_intrinsic_test_epi16"), + (Some(32), Some(512)) => String::from("(uint32_t)_mm512_extract_intrinsic_test_epi32"), + (Some(64), Some(512)) => String::from("(uint64_t)_mm512_extract_intrinsic_test_epi64"), + (Some(8), Some(64)) => String::from("(uint8_t)_mm64_extract_intrinsic_test_epi8"), + (Some(16), Some(64)) => String::from("(uint16_t)_mm_extract_pi16"), + (Some(32), Some(64)) => String::from("(uint32_t)_mm64_extract_intrinsic_test_epi32"), + _ => unreachable!( + "invalid length for vector argument: {:?}, {:?}", + self.bit_len, self.simd_len + ), + } + } + + fn rust_scalar_type(&self) -> String { + let prefix = match self.data.kind { + TypeKind::Mask => String::from("__mmask"), + TypeKind::Vector => String::from("i"), + _ => self.kind().rust_prefix().to_string(), + }; + + let bits = if self.inner_size() >= 128 { + 32 + } else { + self.inner_size() + }; + format!("{prefix}{bits}") + } + + fn print_result_rust(&self) -> String { + let return_value = match self.kind() { + TypeKind::Float if self.inner_size() == 16 => "debug_f16(__return_value)".to_string(), + TypeKind::Float + if self.inner_size() == 32 + && ["__m512h"].contains(&self.param.type_data.as_str()) => + { + "debug_as::<_, f32>(__return_value)".to_string() + } + TypeKind::Int(_) + if ["__m128i", "__m256i", "__m512i"].contains(&self.param.type_data.as_str()) => + { + format!("debug_as::<_, u{}>(__return_value)", self.inner_size()) + } + _ => "format_args!(\"{__return_value:.150?}\")".to_string(), + }; + + return_value + } +} + +impl X86IntrinsicType { + fn from_c(s: &str) -> Result { + let mut s_copy = s.to_string(); + s_copy = s_copy + .replace("*", "") + .replace("_", "") + .replace("constexpr", "") + .replace("const", "") + .replace("literal", ""); + + let s_split = s_copy + .split(" ") + .filter_map(|s| if s.len() == 0 { None } else { Some(s) }) + .last(); + + let s_split = s_split.map(|s| s.chars().filter(|c| !c.is_numeric()).join("")); + + // TODO: make the unwrapping safe + let kind = TypeKind::from_str(s_split.unwrap().trim()).unwrap_or(TypeKind::Void); + + let kind = if s.find("unsigned").is_some() { + match kind { + TypeKind::Int(_) => TypeKind::Int(Sign::Unsigned), + TypeKind::Char(_) => TypeKind::Char(Sign::Unsigned), + a => a, + } + } else { + kind + }; + + let ptr_constant = false; + let constant = s.matches("const").next().is_some(); + let ptr = s.matches("*").next().is_some(); + + Ok(IntrinsicType { + ptr, + ptr_constant, + constant, + kind, + bit_len: None, + simd_len: None, + vec_len: None, + }) + } + + pub fn update_simd_len(&mut self) { + let mut type_processed = self.param.type_data.clone(); + type_processed.retain(|c| c.is_numeric()); + + // check the param.type and extract numeric part if there are double + // underscores. divide this number with bit-len and set this as simd-len. + // Only __m types can have a simd-len. + if self.param.type_data.contains("__m") && !self.param.type_data.contains("__mmask") { + self.data.simd_len = match str::parse::(type_processed.as_str()) { + // If bit_len is None, simd_len will be None. + // Else simd_len will be (num_bits / bit_len). + Ok(num_bits) => self + .data + .bit_len + .and_then(|bit_len| Some(num_bits / bit_len)), + Err(_) => None, + }; + } + } + + pub fn from_param(param: &Parameter) -> Result { + match Self::from_c(param.type_data.as_str()) { + Err(message) => Err(message), + Ok(mut data) => { + // First correct the type of the parameter using param.etype. + // The assumption is that the parameter of type void may have param.type + // as "__m128i", "__mmask8" and the like. + if !param.etype.is_empty() { + match TypeKind::from_str(param.etype.as_str()) { + Ok(value) => { + data.kind = value; + } + Err(_) => {} + }; + } + + // check for param.etype. + // extract the numeric part and set as bit-len + // If param.etype is not present, guess the default bit-len + + let mut etype_processed = param.etype.clone(); + etype_processed.retain(|c| c.is_numeric()); + + let mut type_processed = param.type_data.clone(); + type_processed.retain(|c| c.is_numeric()); + + match str::parse::(etype_processed.as_str()) { + Ok(value) => data.bit_len = Some(value), + Err(_) => { + data.bit_len = match data.kind() { + TypeKind::Char(_) => Some(8), + TypeKind::BFloat => Some(16), + TypeKind::Int(_) => Some(32), + TypeKind::Float => Some(32), + _ => None, + }; + } + } + + if param.type_data.contains("__mmask") { + data.bit_len = str::parse::(type_processed.as_str()).ok(); + } + + if vec!["M512", "M256", "M128"].contains(¶m.etype.as_str()) { + match param.type_data.chars().last() { + Some('i') => { + data.kind = TypeKind::Int(Sign::Signed); + data.bit_len = Some(32); + } + Some('h') => { + data.kind = TypeKind::Float; + data.bit_len = Some(16); + } + Some('d') => { + data.kind = TypeKind::Float; + data.bit_len = Some(64); + } + _ => (), + } + } + + // default settings for "void *" parameters + // often used by intrinsics to denote memory address or so. + if data.kind == TypeKind::Void && data.ptr { + data.kind = TypeKind::Int(Sign::Unsigned); + data.bit_len = Some(8); + } + + // default settings for "void *" parameters + // often used by intrinsics to denote memory address or so. + if data.kind == TypeKind::Mask && data.bit_len.is_none() { + data.bit_len = Some(32); + } + + if param.etype == "IMM" || param.imm_width > 0 || param.imm_type.len() > 0 { + data.kind = TypeKind::Int(Sign::Unsigned); + data.constant = true; + } + + // Rust defaults to signed variants, unless they are explicitly mentioned + // the `type` field are C++ types. + if data.kind == TypeKind::Int(Sign::Unsigned) + && !(param.type_data.contains("unsigned") || param.type_data.contains("uint")) + { + data.kind = TypeKind::Int(Sign::Signed) + } + + // default settings for IMM parameters + if param.etype == "IMM" { + data.bit_len = if param.imm_width > 0 { + Some(param.imm_width) + } else { + Some(8) + } + } + + let mut result = X86IntrinsicType { + data, + param: param.clone(), + }; + + result.update_simd_len(); + Ok(result) + } + } + // Tile types won't currently reach here, since the intrinsic that involve them + // often return "null" type. Such intrinsics are not tested in `intrinsic-test` + // currently and are filtered out at `mod.rs`. + } +} diff --git a/crates/intrinsic-test/src/x86/xml_parser.rs b/crates/intrinsic-test/src/x86/xml_parser.rs new file mode 100644 index 0000000000..af85118b8a --- /dev/null +++ b/crates/intrinsic-test/src/x86/xml_parser.rs @@ -0,0 +1,139 @@ +use crate::common::argument::{Argument, ArgumentList}; +use crate::common::intrinsic::Intrinsic; +use crate::common::intrinsic_helpers::TypeKind; +use crate::x86::constraint::map_constraints; + +use regex::Regex; +use serde::{Deserialize, Deserializer}; +use std::path::Path; + +use super::intrinsic::X86IntrinsicType; + +// Custom deserializer function to convert strings to u32 +fn string_to_u32<'de, D>(deserializer: D) -> Result +where + D: Deserializer<'de>, +{ + let s = String::deserialize(deserializer)?; + return s.as_str().parse::().or(Ok(0u32)); +} + +#[derive(Deserialize)] +struct Data { + #[serde(rename = "intrinsic", default)] + intrinsics: Vec, +} + +#[derive(Deserialize)] +struct XMLIntrinsic { + #[serde(rename = "return")] + pub return_data: Parameter, + #[serde(rename = "@name")] + pub name: String, + // #[serde(rename = "@tech")] + // tech: String, + #[serde(rename = "CPUID", default)] + cpuid: Vec, + #[serde(rename = "parameter", default)] + parameters: Vec, +} + +#[derive(Debug, PartialEq, Clone, Deserialize)] +pub struct Parameter { + #[serde(rename = "@varname", default)] + pub var_name: String, + #[serde(rename = "@type", default)] + pub type_data: String, + #[serde(rename = "@etype", default)] + pub etype: String, + #[serde(rename = "@memwidth", default, deserialize_with = "string_to_u32")] + pub memwidth: u32, + #[serde(rename = "@immwidth", default, deserialize_with = "string_to_u32")] + pub imm_width: u32, + #[serde(rename = "@immtype", default)] + pub imm_type: String, +} + +pub fn get_xml_intrinsics( + filename: &Path, +) -> Result>, Box> { + let file = std::fs::File::open(filename)?; + let reader = std::io::BufReader::new(file); + let data: Data = + quick_xml::de::from_reader(reader).expect("failed to deserialize the source XML file"); + + let parsed_intrinsics: Vec> = data + .intrinsics + .into_iter() + .filter_map(|intr| { + // Some(xml_to_intrinsic(intr, target).expect("Couldn't parse XML properly!")) + xml_to_intrinsic(intr).ok() + }) + .collect(); + + Ok(parsed_intrinsics) +} + +fn xml_to_intrinsic( + intr: XMLIntrinsic, +) -> Result, Box> { + let name = intr.name; + let result = X86IntrinsicType::from_param(&intr.return_data); + let args_check = intr.parameters.into_iter().enumerate().map(|(i, param)| { + let ty = X86IntrinsicType::from_param(¶m); + if ty.is_err() { + None + } else { + let effective_imm_width = if name == "_mm_mpsadbw_epu8" && param.var_name == "imm8" { + 3 + } else { + param.imm_width + }; + let constraint = map_constraints(¶m.imm_type, effective_imm_width); + let arg = Argument::::new( + i, + param.var_name.clone(), + ty.unwrap(), + constraint, + ); + Some(arg) + } + }); + + let args = args_check.collect::>(); + if args.iter().any(|elem| elem.is_none()) { + return Err(Box::from("intrinsic isn't fully supported in this test!")); + } + let mut args = args + .into_iter() + .map(|e| e.unwrap()) + .filter(|arg| arg.ty.ptr || arg.ty.kind != TypeKind::Void) + .collect::>(); + + let mut args_test = args.iter(); + + // if one of the args has etype="MASK" and type="__md", + // then set the bit_len and simd_len accordingly + let re = Regex::new(r"__m\d+").unwrap(); + let is_mask = |arg: &Argument| arg.ty.param.etype.as_str() == "MASK"; + let is_vector = |arg: &Argument| re.is_match(arg.ty.param.type_data.as_str()); + let pos = args_test.position(|arg| is_mask(arg) && is_vector(arg)); + if let Some(index) = pos { + args[index].ty.bit_len = args[0].ty.bit_len; + } + + args.iter_mut().for_each(|arg| arg.ty.update_simd_len()); + + let arguments = ArgumentList:: { args }; + + if let Err(message) = result { + return Err(Box::from(message)); + } + + Ok(Intrinsic { + name, + arguments, + results: result.unwrap(), + arch_tags: intr.cpuid, + }) +} diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs index 02b6bdc768..5a98db980b 100644 --- a/crates/stdarch-verify/tests/x86-intel.rs +++ b/crates/stdarch-verify/tests/x86-intel.rs @@ -164,7 +164,7 @@ fn verify_all_signatures() { // Open up the network console and you'll see an xml file was downloaded // (currently called data-3.6.9.xml). That's the file we downloaded // here. - let xml = include_bytes!("../x86-intel.xml"); + let xml = include_bytes!("../../../intrinsics_data/x86-intel.xml"); let xml = &xml[..]; let data: Data = quick_xml::de::from_reader(xml).expect("failed to deserialize xml"); diff --git a/crates/stdarch-verify/x86-intel.xml b/intrinsics_data/x86-intel.xml similarity index 100% rename from crates/stdarch-verify/x86-intel.xml rename to intrinsics_data/x86-intel.xml